Comment: | catch up with v1.65 |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-end-of-run |
Files: | files | file ages | folders |
SHA1: |
fe83540f6ad304de3f03fda268112557 |
User & Date: | pjhatwal on 2019-03-18 10:57:44 |
Other Links: | branch diff | manifest | tags |
2019-03-22
| ||
14:27 | merged v1.65-end-of-run check-in: 2be9e62191 user: pjhatwal tags: v1.65, v1.6527 | |
2019-03-18
| ||
10:57 | catch up with v1.65 Closed-Leaf check-in: fe83540f6a user: pjhatwal tags: v1.65-end-of-run | |
2019-03-14
| ||
10:39 | Made changes to only create and process 4 packets per mtutil process run check-in: 4b37eea7d0 user: jmoon18 tags: v1.65 | |
2019-03-11
| ||
17:56 | end of run detection check-in: e719f22355 user: pjhatwal tags: v1.65-end-of-run | |
Modified api.scm from [4abe8743ea] to [1958ec5960].
︙ | ︙ | |||
47 48 49 50 51 52 53 54 55 56 57 58 59 60 | get-testinfo-state-status test-get-top-process-pid test-get-paths-matching-keynames-target-new get-prereqs-not-met get-count-tests-running-for-run-id get-run-info get-run-status get-run-stats get-run-times get-targets get-target ;; register-run get-tests-tags get-test-times | > | 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | get-testinfo-state-status test-get-top-process-pid test-get-paths-matching-keynames-target-new get-prereqs-not-met get-count-tests-running-for-run-id get-run-info get-run-status get-run-state get-run-stats get-run-times get-targets get-target ;; register-run get-tests-tags get-test-times |
︙ | ︙ | |||
156 157 158 159 160 161 162 163 164 165 166 167 168 169 | cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== | > > > > > | 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) (foo (begin (common:telemetry-log (conc "api-in:"(->string cmd)) payload: `((params . ,params))) #t)) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== |
︙ | ︙ | |||
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 | ((synchash-get) (apply synchash:server-get dbstruct params)) ((get-raw-run-stats) (apply db:get-raw-run-stats dbstruct params)) ((get-test-times) (apply db:get-test-times dbstruct params)) ;; RUNS ((get-run-info) (apply db:get-run-info dbstruct params)) ((get-run-status) (apply db:get-run-status dbstruct params)) ((set-run-status) (apply db:set-run-status dbstruct params)) ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) ((get-test-id) (apply db:get-test-id dbstruct params)) ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) ((get-tests-for-runs-mindata) (apply db:get-tests-for-runs-mindata dbstruct params)) ((get-runs) (apply db:get-runs dbstruct params)) ((simple-get-runs) (apply db:simple-get-runs dbstruct params)) ((get-num-runs) (apply db:get-num-runs dbstruct params)) | > > | 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | ((synchash-get) (apply synchash:server-get dbstruct params)) ((get-raw-run-stats) (apply db:get-raw-run-stats dbstruct params)) ((get-test-times) (apply db:get-test-times dbstruct params)) ;; RUNS ((get-run-info) (apply db:get-run-info dbstruct params)) ((get-run-status) (apply db:get-run-status dbstruct params)) ((get-run-state) (apply db:get-run-state dbstruct params)) ((set-run-status) (apply db:set-run-status dbstruct params)) ((set-run-state-status) (apply db:set-run-state-status dbstruct params)) ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) ((get-test-id) (apply db:get-test-id dbstruct params)) ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) ((get-tests-for-runs-mindata) (apply db:get-tests-for-runs-mindata dbstruct params)) ((get-runs) (apply db:get-runs dbstruct params)) ((simple-get-runs) (apply db:simple-get-runs dbstruct params)) ((get-num-runs) (apply db:get-num-runs dbstruct params)) |
︙ | ︙ | |||
328 329 330 331 332 333 334 335 336 337 338 339 340 341 | ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))))) ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode | > > > > > | > > > > | | 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 | ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))))) ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin (common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #t))) (vector #f res)) (begin (common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #f))) (vector #t res)))))))) ;; http-server send-response ;; api:process-request ;; db:* ;; ;; NB// Runs on the server as part of the server loop ;; |
︙ | ︙ |
Modified common.scm from [b6c40dc319] to [c41ac723cd].
︙ | ︙ | |||
15 16 17 18 19 20 21 | ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;;====================================================================== (use srfi-1 data-structures posix regex-case (prefix base64 base64:) | | | 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;;====================================================================== (use srfi-1 data-structures posix regex-case (prefix base64 base64:) format dot-locking csv-xml z3 udp ;; sql-de-lite hostinfo md5 message-digest typed-records directory-utils stack matchable regex posix (srfi 18) extras ;; tcp (prefix nanomsg nmsg:) (prefix sqlite3 sqlite3:) pkts (prefix dbi dbi:) ) |
︙ | ︙ | |||
79 80 81 82 83 84 85 86 87 88 89 90 91 92 | (define (get-file-descriptor-count #!key (pid (current-process-id ))) (list (length (glob (conc "/proc/" pid "/fd/*"))) (length (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*"))))) ) ) ;; GLOBALS ;; CONTEXTS (defstruct cxt (taskdb #f) (cmutex (make-mutex))) | > | 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 | (define (get-file-descriptor-count #!key (pid (current-process-id ))) (list (length (glob (conc "/proc/" pid "/fd/*"))) (length (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*"))))) ) ) ;; GLOBALS ;; CONTEXTS (defstruct cxt (taskdb #f) (cmutex (make-mutex))) |
︙ | ︙ | |||
885 886 887 888 889 890 891 892 893 894 895 896 897 898 | (debug:print-info 13 *default-log-port* "loading writable-watchdog.") (server:writable-watchdog dbstruct))) (debug:print-info 13 *default-log-port* "watchdog done.")) (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))) (define (std-exit-procedure) (on-exit (lambda () 0)) ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) | > | 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 | (debug:print-info 13 *default-log-port* "loading writable-watchdog.") (server:writable-watchdog dbstruct))) (debug:print-info 13 *default-log-port* "watchdog done.")) (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))) (define (std-exit-procedure) ;;(common:telemetry-log-close) (on-exit (lambda () 0)) ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) |
︙ | ︙ | |||
1589 1590 1591 1592 1593 1594 1595 | exn #f (with-output-to-file fullpath (lambda ()(pp dat)))))) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load remote-host) | > > > | | | | | | | | | | | | 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 | exn #f (with-output-to-file fullpath (lambda ()(pp dat)))))) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load remote-host) (handle-exceptions exn '(99 99 99) (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) (or (common:get-cached-info actual-hostname "cpu-load") (let ((result (if remote-host (map (lambda (res) (if (eof-object? res) 9e99 res)) (with-input-from-pipe (conc "ssh " remote-host " cat /proc/loadavg") (lambda ()(list (read)(read)(read))))) (with-input-from-file "/proc/loadavg" (lambda ()(list (read)(read)(read))))))) (common:write-cached-info actual-hostname "cpu-load" result) result))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. ;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; (define (common:get-normalized-cpu-load remote-host) (let ((res (common:get-normalized-cpu-load-raw remote-host)) |
︙ | ︙ | |||
3047 3048 3049 3050 3051 3052 3053 | (if thread (handle-exceptions exn #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 | (if thread (handle-exceptions exn #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) (define *common:telemetry-log-state* 'startup) (define *common:telemetry-log-socket* #f) (define (common:telemetry-log-open) (if (eq? *common:telemetry-log-state* 'startup) (let* ((serverhost (configf:lookup *configdat* "telemetry" "host")) (serverport (configf:lookup-number *configdat* "telemetry" "port")) (user (or (get-environment-variable "USER") "unknown")) (host (or (get-environment-variable "HOST") "unknown"))) (set! *common:telemetry-log-state* (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "common-telemetry-log open udp port failure") 'broken) (if (and serverhost serverport user host) (let* ((s (udp-open-socket))) ;;(udp-bind! s #f 0) (udp-connect! s serverhost serverport) (set! *common:telemetry-log-socket* s) 'open) 'not-needed)))))) (define (common:telemetry-log event #!key (payload '())) (if (eq? *common:telemetry-log-state* 'startup) (common:telemetry-log-open)) (if (eq? 'open *common:telemetry-log-state*) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "common-telemetry-log comms failure ; disabled (no server?)") ;;(define *common:telemetry-log-state* 'broken-or-no-server-preclose) ;;(common:telemetry-log-close) (define *common:telemetry-log-state* 'broken-or-no-server) (set! *common:telemetry-log-socket* #f) ) (if (and *common:telemetry-log-socket* event) ;; TODO - filter on event against telemetry.want-events (let* ((user (or (get-environment-variable "USER") "unknown")) (host (or (get-environment-variable "HOST") "unknown")) (start (conc "[megatest "event"]")) (toppath (or *toppath* "/dev/null")) (payload-serialized (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () (pp payload)))))) (msg (conc user":"host":"start":"(current-process-id)":"(car (argv))":" toppath":"payload-serialized))) (udp-send *common:telemetry-log-socket* msg)))))) (define (common:telemetry-log-close) (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*) (handle-exceptions exn (begin (define *common:telemetry-log-state* 'closed-fail) (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure") ) (begin (define *common:telemetry-log-state* 'closed) (udp-close-socket *common:telemetry-log-socket*) (set! *common:telemetry-log-socket* #f))))) |
Modified configf.scm from [77100eae92] to [c596e07f23].
︙ | ︙ | |||
772 773 774 775 776 777 778 | (hash-table-set! ht (car section)(cdr section))) adat) ht)) ;; if (define (configf:read-alist fname) (handle-exceptions | | | | | | 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 | (hash-table-set! ht (car section)(cdr section))) adat) ht)) ;; if (define (configf:read-alist fname) (handle-exceptions exn #f (configf:alist->config (with-input-from-file fname read)))) (define (configf:write-alist cdat fname) (if (not (common:faux-lock fname)) (debug:print 0 *default-log-port* "INFO: Could not get lock on " fname)) (let* ((dat (configf:config->alist cdat)) (res (begin |
︙ | ︙ |
Modified db.scm from [6a0d8206a6] to [73e0450409].
︙ | ︙ | |||
404 405 406 407 408 409 410 411 412 413 414 415 416 417 | ;; (debug:print 0 *default-log-port* "ERROR: attempt to open database when not on homehost. Exiting. Homehost: " (common:get-homehost)) ;; (exit 1)))) ;; Open the classic megatest.db file (defaults to open in toppath) ;; ;; NOTE: returns a dbdat not a dbstruct! ;; (define (db:open-megatest-db #!key (path #f)(name #f)) (let* ((dbdir (or path *toppath*)) (dbpath (conc dbdir "/" (or name "megatest.db"))) (dbexists (common:file-exists? dbpath)) (db (db:lock-create-open dbpath (lambda (db) (db:initialize-main-db db) | > > > | 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 | ;; (debug:print 0 *default-log-port* "ERROR: attempt to open database when not on homehost. Exiting. Homehost: " (common:get-homehost)) ;; (exit 1)))) ;; Open the classic megatest.db file (defaults to open in toppath) ;; ;; NOTE: returns a dbdat not a dbstruct! ;; ;;(define (db:reopen-megatest-db (define (db:open-megatest-db #!key (path #f)(name #f)) (let* ((dbdir (or path *toppath*)) (dbpath (conc dbdir "/" (or name "megatest.db"))) (dbexists (common:file-exists? dbpath)) (db (db:lock-create-open dbpath (lambda (db) (db:initialize-main-db db) |
︙ | ︙ | |||
1640 1641 1642 1643 1644 1645 1646 | ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) | | > > > > > | | | | > < < | > > > > > > > > > > > > > > > > > > | | | > > | | | | | 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 | ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime"))) (server-start-allowance 200) (server-overloaded-budget 200) (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30)) (launch-monitor-on-time-budget 30) (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget)) (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30)) (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default)) (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period))) (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period) ) (db:with-db dbstruct #f #f (lambda (db) ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes ;; ;; HOWEVER: this code in run:test seems to work fine ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)))) db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');" run-id running-deadtime) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');" run-id remotehoststart-deadtime) ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (begin (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked") (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))) db "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');" run-id) (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.") ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE. ;; ;; (db:delay-if-busy dbdat) (let* (;; (min-incompleted (filter (lambda (x) ;; (let* ((testpath (cadr x)) ;; (tdatpath (conc testpath "/testdat.db")) ;; (dbexists (common:file-exists? tdatpath))) ;; (or (not dbexists) ;; if no file then something wrong - mark as incomplete ;; (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim ;; incompleted)) (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD") (for-each (lambda (test-id) (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS. ref ticket 220546828 all-ids)))))))) ;; ALL REPLACED BY THE BLOCK ABOVE ;; ;; (sqlite3:execute ;; db ;; (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" |
︙ | ︙ | |||
2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 | (sqlite3:for-each-row (lambda (status) (set! res status)) db "SELECT status FROM runs WHERE id=?;" run-id) res)))) ;;====================================================================== ;; K E Y S ;;====================================================================== ;; get key val pairs for a given run-id ;; ( (FIELDNAME1 keyval1) (FIELDNAME2 keyval2) ... ) | > > > > > > > > > > > > > > | 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 | (sqlite3:for-each-row (lambda (status) (set! res status)) db "SELECT status FROM runs WHERE id=?;" run-id) res)))) (define (db:get-run-state dbstruct run-id) (let ((res "n/a")) (db:with-db dbstruct #f #f (lambda (db) (sqlite3:for-each-row (lambda (status) (set! res status)) db "SELECT state FROM runs WHERE id=?;" run-id) res)))) ;;====================================================================== ;; K E Y S ;;====================================================================== ;; get key val pairs for a given run-id ;; ( (FIELDNAME1 keyval1) (FIELDNAME2 keyval2) ... ) |
︙ | ︙ | |||
3006 3007 3008 3009 3010 3011 3012 | (define (db:get-not-completed-cnt dbstruct run-id) (db:with-db dbstruct run-id #f (lambda (db) | | | 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 | (define (db:get-not-completed-cnt dbstruct run-id) (db:with-db dbstruct run-id #f (lambda (db) ;(print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id) (sqlite3:first-result db "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;" run-id)))) (define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup) (if (not jobgroup) 0 ;; |
︙ | ︙ | |||
3697 3698 3699 3700 3701 3702 3703 | (db:test-get-testname testdat) test-name)) (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) (tl-test-id (if tl-testdat (db:test-get-id tl-testdat) #f))) | | | 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 | (db:test-get-testname testdat) test-name)) (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) (tl-test-id (if tl-testdat (db:test-get-id tl-testdat) #f))) (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) (db:general-call dbstruct 'set-test-start-time (list test-id))) (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f #f (lambda (db) (let ((tr-res (sqlite3:with-transaction |
︙ | ︙ | |||
3785 3786 3787 3788 3789 3790 3791 | "\n--> newstate "newstate "\n--> newstatus "newstatus "\n\n") ;; NB// Pass the db so it is part of the transaction (list newstate newstatus))) | | | | | 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 | "\n--> newstate "newstate "\n--> newstatus "newstatus "\n\n") ;; NB// Pass the db so it is part of the transaction (list newstate newstatus))) (define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status) (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f #f (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () (let* ((state-status-counts (db:get-all-state-status-counts-for-run dbstruct run-id)) (state-stauses (db:roll-up-rules state-status-counts #f #f )) (newstate (car state-stauses)) (newstatus (cadr state-stauses))) (if (or (not (eq? newstate curr-state)) (not (eq? newstatus curr-status))) (db:set-run-state-status dbstruct run-id newstate newstatus ))))))) (mutex-unlock! *db-transaction-mutex*) tr-res)))) (define (db:get-all-state-status-counts-for-run dbstruct run-id) (let* ((test-count-recs (db:with-db dbstruct #f #f |
︙ | ︙ |
Added docs/manual/devnotes.txt version [e6b6b73f5f].
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | Developer Notes --------------- Collected here are some topics that may interest the megatest developer. telemetry ~~~~~~~~~ A new feature introduced in v1.6525 allows a centralized debug messaging system. Debugging client-server issues is greatly aided by a centralized, time coherent log of events across test execution, server, and runner. This is provided by the telemetry feature source code call example [source,ini] [telemetry] host <IP where telemetry daemon will run> port <UDP port to listen -- we recommend 5929> want-events <comma separated list of telemetry-log keywords [source,ini] [telemetry] host 10.38.32.91 port 5929 want-events ALL [source,scheme] (common:telemetry-log <keyword string> Usage: 1. Add telemetry section to megatest.config 2. Start telemetry daemon telemetry-daemon -a start -l /tmp/my-telemetry.log 3. Run megatest 4. examine / parse telemetry log |
Modified docs/manual/megatest_manual.html from [453b3aeb82] to [eadc4c2938].
︙ | ︙ | |||
898 899 900 901 902 903 904 | which can launch jobs on local and remote Linux hosts. Currently megatest uses the network filesystem to call home to your master sqlite3 database. Megatest has been used with the Intel Netbatch and lsf (also known as openlava) batch systems and it should be straightforward to use it with other similar systems.</p></div> </div> </div> | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 898 899 900 901 902 903 904 905 906 907 908 909 910 911 | which can launch jobs on local and remote Linux hosts. Currently megatest uses the network filesystem to call home to your master sqlite3 database. Megatest has been used with the Intel Netbatch and lsf (also known as openlava) batch systems and it should be straightforward to use it with other similar systems.</p></div> </div> </div> </div> <div class="sect1"> <h2 id="_installation">Installation</h2> <div class="sectionbody"> <div class="sect2"> <h3 id="_dependencies">Dependencies</h3> <div class="paragraph"><p>Chicken scheme and a number of "eggs" are required for building |
︙ | ︙ | |||
3031 3032 3033 3034 3035 3036 3037 | </div> </div> </div> <div id="footnotes"><hr></div> <div id="footer"> <div id="footer-text"> Version 1.0<br> | | | 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 | </div> </div> </div> <div id="footnotes"><hr></div> <div id="footer"> <div id="footer-text"> Version 1.0<br> Last updated 2019-02-19 11:13:54 PST </div> </div> </body> </html> |
Modified docs/megatest-training.odp from [ba7ab2ab9e] to [f923ec026f].
cannot compute difference between binary files
Modified launch.scm from [b89d82dc9d] to [bd83b25556].
︙ | ︙ | |||
375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 | (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds (get-df (current-directory)) disk-free)) (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) | > > | | > > > > > > > | > > | < < < | | > > > > | > > > | > > > | > > > | 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 | (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds (get-df (current-directory)) disk-free)) (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) (pid2 (rmt:test-get-top-process-pid run-id test-id)) (pids (delete-duplicates (filter number? (list pid1 pid2))))) |
︙ | ︙ | |||
440 441 442 443 444 445 446 | (handle-exceptions exn #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) pids) | > | | | > | 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 | (handle-exceptions exn #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) pids) ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel? If not, should it? (tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt (begin (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. But run end of run before exiting? (launch:end-of-run-check run-id) (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) |
︙ | ︙ | |||
571 572 573 574 575 576 577 | (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) (set! *time-to-exit* #t) | | > | > < | | 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 | (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) (set! *time-to-exit* #t) (print "Received signal " signum ", cleaning up before exit (set this test to COMPLETED/ABORT) . Please wait...") (let ((th1 (make-thread (lambda () (print "set test to COMPLETED/ABORT begin.") (rmt:test-set-state-status run-id test-id "COMPLETED" "ABORT" "received kill signal") (print "set test to COMPLETED/ABORT complete.") (print "Killed by signal " signum ". Exiting") (exit 1)))) (th2 (make-thread (lambda () (thread-sleep! 20) (debug:print 0 *default-log-port* "Done") (exit 4))))) (thread-start! th2) (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand) |
︙ | ︙ | |||
603 604 605 606 607 608 609 | (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") | > > | > | | 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 | (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) (if (process:alive-on-host? test-host test-pid) (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") (exit))) ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... (if (not (launch:setup force-reread: #t)) (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") ;; (sqlite3:finalize! db) |
︙ | ︙ | |||
833 834 835 836 837 838 839 | ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) | | > > | > > > < < | | > | | | | | 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 | ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) (current-state (rmt:get-run-state run-id)) (current-status (rmt:get-run-status run-id))) ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing (debug:print 0 *default-log-port* "rollup run state/status") (rmt:set-state-status-and-roll-up-run run-id current-state current-status) (cond ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" )) (debug:print 0 *default-log-port* "look for post hook.") (runs:run-post-hook run-id)) ((> running-cnt 3) (debug:print 0 *default-log-port* "There are " running-cnt " tests running." )) ((> running-cnt 0) (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" ) (let ((kill-cnt (launch:kill-tests-if-dead run-id))) (if (and all-test-launched (equal? all-test-launched "yes") (eq? kill-cnt running-cnt)) (launch:end-of-run-check run-id)))) ;;todo (else (debug:print 0 *default-log-port* "Should it get here?? May be everything is not launched yet. Running test cnt:" running-cnt " Not completed test cnt:" not-completed-cnt) (let* ((not-completed-tests (rmt:get-tests-for-run run-id "%" `("NOT_STARTED" "RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) (if (> (length not-completed-tests) 0) (let loop ((running-test (car not-completed-tests)) (tal (cdr not-completed-tests))) (let* ((test-name (vector-ref running-test 2)) (item-path (vector-ref running-test 11))) (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) (define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) (let* ((cmd (conc "ssh " host " pstree -A " pid)) (output (with-input-from-pipe cmd read-lines))) (print "cmd: " cmd "\n op: " output ) (if(eq? (length output) 0) |
︙ | ︙ | |||
1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 | (list lnkpathf lnkpath )) (if (and test-src-path (> remtries 0)) (begin (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) (list #f #f))))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch | > > > > > > > > > > > > > > > > > > > > > > | 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 | (list lnkpathf lnkpath )) (if (and test-src-path (> remtries 0)) (begin (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) (list #f #f))))) (define (launch:handle-zombie-tests run-id) (let* ((key (conc "zombiescan-runid-"run-id)) (now (current-seconds)) (threshold (- (current-seconds) (* 2 (or (configf:lookup-number *configdat* "setup" "deadtime") 120)))) (val (rmt:get-var key)) (do-scan? (cond ((not val) #t) ((< val threshold) #t) (else #f)))) (when do-scan? (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch |
︙ | ︙ |
Modified megatest-version.scm from [f713858d59] to [cc55965633].
︙ | ︙ | |||
16 17 18 19 20 21 22 | ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) | | | 16 17 18 19 20 21 22 23 | ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) (define megatest-version 1.6525) |
Modified megatest.scm from [bd9e9c775d] to [a7dc9766fe].
︙ | ︙ | |||
107 108 109 110 111 112 113 | -manual : show the Megatest user manual -version : print megatest version (currently " megatest-version ") Launching and managing runs -run : run all tests or as specified by -testpatt -remove-runs : remove the data for a run, requires -runname and -testpatt Optionally use :state and :status, use -keep-records to remove only | | > | | | | 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | -manual : show the Megatest user manual -version : print megatest version (currently " megatest-version ") Launching and managing runs -run : run all tests or as specified by -testpatt -remove-runs : remove the data for a run, requires -runname and -testpatt Optionally use :state and :status, use -keep-records to remove only the run data. Use -kill-wait to override the 10 second per test wait after kill delay. -kill-runs : kill existing run(s) (all incomplete tests killed) -kill-rerun : kill an existing run (all incomplete tests killed and run is rerun) -set-state-status X,Y : set state to X and status to Y, requires controls per -remove-runs -rerun FAIL,WARN... : force re-run for tests with specificed status(s) -rerun-clean : set all tests not COMPLETED+PASS,WARN,WAIVED to NOT_STARTED,n/a and then run the specified testpatt with -preclean -rerun-all : set all tests to NOT_STARTED,n/a and run with -preclean -lock : lock run specified by target and runname -unlock : unlock run specified by target and runname -set-run-status status : sets status for run to status, requires -target and -runname -get-run-status : gets status for run specified by target and runname -run-wait : wait on run specified by target and runname -preclean : remove the existing test directory before running the test -clean-cache : remove the cached megatest.config and runconfigs.config files -no-cache : do not use the cached config files. -one-pass : launch as many tests as you can but do not wait for more to be ready -remove-keep N : remove all but N most recent runs per target; use '-actions, -age, -precmd' -age <age> : 120d,3h,20m to apply only to runs older than the specified age. NB// M=month, m=minute -actions <action>[,...] : actions to take; print,remove-runs,archive,kill-runs -precmd : insert a wrapper command in front of the commands run Selectors (e.g. use for -runtests, -remove-runs, -set-state-status, -list-runs etc.) -target key1/key2/... : run for key1, key2, etc. -reqtarg key1/key2/... : run for key1, key2, etc. but key1/key2 must be in runconfigs -testpatt patt1/patt2,patt3/... : % is wildcard -runname : required, name for this particular test run |
︙ | ︙ | |||
355 356 357 358 359 360 361 362 363 364 365 366 367 368 | "-prefix-target" "-src-target" "-src-runname" "-diff-email" "-sync-to" "-pgsync" "-diff-html" ) (list "-h" "-help" "--help" "-manual" "-version" "-force" "-xterm" | > | 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 | "-prefix-target" "-src-target" "-src-runname" "-diff-email" "-sync-to" "-pgsync" "-kill-wait" ;; wait this long before removing test (default is 10 sec) "-diff-html" ) (list "-h" "-help" "--help" "-manual" "-version" "-force" "-xterm" |
︙ | ︙ | |||
378 379 380 381 382 383 384 385 386 387 388 389 390 391 | "-rerun-clean" "-rerun-all" "-clean-cache" "-no-cache" "-cache-db" "-use-db-cache" "-prepend-contour" ;; misc "-repl" "-lock" "-unlock" "-list-servers" "-kill-servers" "-run-wait" ;; wait on a run to complete (i.e. no RUNNING) | > | | | > | 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 | "-rerun-clean" "-rerun-all" "-clean-cache" "-no-cache" "-cache-db" "-use-db-cache" "-prepend-contour" ;; misc "-repl" "-lock" "-unlock" "-list-servers" "-kill-servers" "-run-wait" ;; wait on a run to complete (i.e. no RUNNING) "-one-pass" ;; "-local" ;; run some commands using local db access "-generate-html" "-generate-html-structure" "-list-run-time" "-list-test-time" ;; misc queries "-list-disks" "-list-targets" "-list-db-targets" "-show-runconfig" "-show-config" "-show-cmdinfo" |
︙ | ︙ |
Modified mt-pg.sql from [1d0cdb4bfc] to [a25c58b313].
︙ | ︙ | |||
80 81 82 83 84 85 86 | CREATE TABLE IF NOT EXISTS area_tags ( id SERIAL PRIMARY KEY, tag_id INTEGER DEFAULT 0, area_id INTEGER DEFAULT 0, CONSTRAINT areatagconstraint UNIQUE (tag_id, area_id)); CREATE VIEW area_tag_view as | | | 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | CREATE TABLE IF NOT EXISTS area_tags ( id SERIAL PRIMARY KEY, tag_id INTEGER DEFAULT 0, area_id INTEGER DEFAULT 0, CONSTRAINT areatagconstraint UNIQUE (tag_id, area_id)); CREATE VIEW area_tag_view as select a.id as aid, t.id as tid,area_name,tag_name,area_path from areas as a inner join area_tags as at on at.area_id = a.id inner join tags as t on t.id = at.tag_id ; INSERT INTO areas (id,area_name,area_path) VALUES (0,'local','.'); CREATE TABLE IF NOT EXISTS ttype ( id SERIAL PRIMARY KEY, target_spec TEXT DEFAULT ''); |
︙ | ︙ | |||
307 308 309 310 311 312 313 | deleted INTEGER default 0 ); CREATE TABLE IF NOT EXISTS users_webviews( id SERIAL PRIMARY KEY , | | | | | | > | 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 | deleted INTEGER default 0 ); CREATE TABLE IF NOT EXISTS users_webviews( id SERIAL PRIMARY KEY , user_id INTEGER NOT NULL, webview_id INTEGER NOT NULL, deleted INTEGER default 0, searchpattern TEXT Default '', web_page TEXT Default '', is_default boolean default 'f', other_search_data TEXT Default '' ); CREATE TABLE IF NOT EXISTS cctrl_info( id SERIAL PRIMARY KEY , user_id INTEGER NOT NULL, input TEXT Default '', |
︙ | ︙ |
Modified mtut.scm from [5cc50f038e] to [d7026e5f70].
︙ | ︙ | |||
826 827 828 829 830 831 832 | ;; (use trace)(trace create-run-pkt) (define (contains list x) (cond ((null? list) #f) ((eq? (car list) x) #t) (else (contains (cdr list) x)))) ;; collect all needed data and create run pkts for contours with changed inputs ;; (define (generate-run-pkts mtconf toppath) | | > | 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 | ;; (use trace)(trace create-run-pkt) (define (contains list x) (cond ((null? list) #f) ((eq? (car list) x) #t) (else (contains (cdr list) x)))) ;; collect all needed data and create run pkts for contours with changed inputs ;; (define (generate-run-pkts mtconf toppath) (let ((std-runname (conc "sched" (time->string (seconds->local-time (current-seconds)) "%M%H%d"))) (packets-generated 0)) (common:with-queue-db mtconf (lambda (pktsdirs pktsdir pdb) (let* ((rgconfdat (find-and-read-config (conc toppath "/runconfigs.config"))) (rgconf (car rgconfdat)) (all-areas (map car (configf:get-section mtconf "areas"))) (contours (configf:get-section mtconf "contours")) |
︙ | ︙ | |||
1009 1010 1011 1012 1013 1014 1015 | (lambda (cmd) ;;(print "cmd: " cmd) ;;(print "Areas: " all-areas) (for-each (lambda (area) ;Add code to check whether area is valid (if | > | | | 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 | (lambda (cmd) ;;(print "cmd: " cmd) ;;(print "Areas: " all-areas) (for-each (lambda (area) ;Add code to check whether area is valid (if ;; This code checks whether the target has been passed in via argument, and only runs the specified target (and (< packets-generated 4) (if (args:get-arg "-target") (if (string= (args:get-arg "-target") runkey) (area-allowed? area "area-needs-to-be-run" runkey contour #f) #f) (area-allowed? area "area-needs-to-be-run" runkey contour #f))) (let* ((script (car cmd)) (params (cdr cmd)) (cmd (conc script " " contour " " area " " runkey " " std-runname " " action " " params)) (res (handle-exceptions exn #f |
︙ | ︙ | |||
1094 1095 1096 1097 1098 1099 1100 | ;;(target . ,(list new-target)) ;; overriding with result from runing the script )) (aval (or (configf:lookup mtconf "areas" area) "")) (aval-alist (common:val->alist aval)) (targets (map-targets mtconf aval-alist runkey area contour))) (pp targets) | > | | > > | 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 | ;;(target . ,(list new-target)) ;; overriding with result from runing the script )) (aval (or (configf:lookup mtconf "areas" area) "")) (aval-alist (common:val->alist aval)) (targets (map-targets mtconf aval-alist runkey area contour))) (pp targets) (for-each (lambda (target) (create-run-pkt mtconf action area runkey target new-runname mode-patt tag-expr pktsdir reason contour sched dbdest append runtrans) (set! packets-generated (+ packets-generated 1)) ) targets) ;; Add filter for targets ;;(create-run-pkt mtconf action area runkey target runname ;; pktsdir reason contour dbdest append ;; runtrans) (print "key-msg: " key-msg) ;;(push-run-spec torun contour |
︙ | ︙ |
Modified rmt.scm from [f3cb7b6857] to [4dc97a8297].
︙ | ︙ | |||
53 54 55 56 57 58 59 60 61 62 63 64 65 66 | (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected ;;DOT digraph megatest_state_status { ;;DOT ranksep=0; ;;DOT // rankdir=LR; ;;DOT node [shape="box"]; ;;DOT "rmt:send-receive" -> MUTEXLOCK; ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } ;; do all the prep locked under the rmt-mutex | > > > > > | 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected (common:telemetry-log (conc "rmt:"(->string cmd)) payload: `((rid . ,rid) (params . ,params))) ;;DOT digraph megatest_state_status { ;;DOT ranksep=0; ;;DOT // rankdir=LR; ;;DOT node [shape="box"]; ;;DOT "rmt:send-receive" -> MUTEXLOCK; ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } ;; do all the prep locked under the rmt-mutex |
︙ | ︙ | |||
690 691 692 693 694 695 696 | (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup))) ;; state and status are extra hints not usually used in the calculation ;; (define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment) (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment))) | | | | 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 | (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup))) ;; state and status are extra hints not usually used in the calculation ;; (define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment) (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment))) (define (rmt:set-state-status-and-roll-up-run run-id state status) (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status))) (define (rmt:update-pass-fail-counts run-id test-name) (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name)) (define (rmt:top-test-set-per-pf-counts run-id test-name) (rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name))) |
︙ | ︙ | |||
753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 | (define (rmt:lock/unlock-run run-id lock unlock user) (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) ;; set/get status (define (rmt:get-run-status run-id) (rmt:send-receive 'get-run-status #f (list run-id))) (define (rmt:set-run-status run-id run-status #!key (msg #f)) (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) (define (rmt:update-run-event_time run-id) (rmt:send-receive 'update-run-event_time #f (list run-id))) (define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order))) | > > > > > > > > | 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 | (define (rmt:lock/unlock-run run-id lock unlock user) (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) ;; set/get status (define (rmt:get-run-status run-id) (rmt:send-receive 'get-run-status #f (list run-id))) (define (rmt:get-run-state run-id) (rmt:send-receive 'get-run-state #f (list run-id))) (define (rmt:set-run-status run-id run-status #!key (msg #f)) (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) (define (rmt:set-run-state-status run-id state status ) (rmt:send-receive 'set-run-state-status #f (list run-id state status))) (define (rmt:update-run-event_time run-id) (rmt:send-receive 'update-run-event_time #f (list run-id))) (define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order))) |
︙ | ︙ |
Modified runs.scm from [f0db5bffd4] to [c9b27c71eb].
︙ | ︙ | |||
448 449 450 451 452 453 454 455 456 457 458 459 460 461 | (debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") ;; Now generate all the tests lists (set! all-tests-registry (tests:get-all)) ;; hash of testname => path-to-test (set! all-test-names (hash-table-keys all-tests-registry)) ;; filter first for allowed-tests (from -tagexpr) then for test-patts. (set! test-names (tests:filter-test-names (if allowed-tests (tests:filter-test-names all-test-names allowed-tests) | > > > > > > > | 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 | (debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") (common:telemetry-log "run-tests" payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) ;; Now generate all the tests lists (set! all-tests-registry (tests:get-all)) ;; hash of testname => path-to-test (set! all-test-names (hash-table-keys all-tests-registry)) ;; filter first for allowed-tests (from -tagexpr) then for test-patts. (set! test-names (tests:filter-test-names (if allowed-tests (tests:filter-test-names all-test-names allowed-tests) |
︙ | ︙ | |||
513 514 515 516 517 518 519 520 521 522 523 524 525 526 | (runs:update-all-test_meta #f) ;; run the run prehook if there are no tests yet run for this run: ;; (runs:run-pre-hook run-id) ;; mark all test launced flag as false in the meta table (rmt:set-var (conc "lunch-complete-" run-id) "no") ;; now add non-directly referenced dependencies (i.e. waiton) ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; ;; What happended, this code is now duplicated in tests!? ;; ;;====================================================================== | > | 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 | (runs:update-all-test_meta #f) ;; run the run prehook if there are no tests yet run for this run: ;; (runs:run-pre-hook run-id) ;; mark all test launced flag as false in the meta table (rmt:set-var (conc "lunch-complete-" run-id) "no") (rmt:set-run-state-status run-id "new" "n/a") ;; now add non-directly referenced dependencies (i.e. waiton) ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; ;; What happended, this code is now duplicated in tests!? ;; ;;====================================================================== |
︙ | ︙ | |||
2001 2002 2003 2004 2005 2006 2007 | (lambda (action) (case action ((print) (print " " (simple-run-runname run) " " (time->string (seconds->local-time (simple-run-event_time run)) "WW%V.%u %H:%M:%S") " " (if remove "REMOVE" ""))) ((remove-runs) | | > > > | > > > | 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 | (lambda (action) (case action ((print) (print " " (simple-run-runname run) " " (time->string (seconds->local-time (simple-run-event_time run)) "WW%V.%u %H:%M:%S") " " (if remove "REMOVE" ""))) ((remove-runs) (if remove (system (conc precmd " megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %" (if (member 'kill-runs actions) ;; if kill-runs is specified then set -kill-wait to 0 " -kill-wait 0" ""))))) ((archive) (if remove (system (conc precmd " megatest -archive save-remove -target " target " -runname " (simple-run-runname run) " -testpatt %")))) ((kill-runs) (if remove (system (conc precmd " megatest -kill-runs -target " target " -runname " (simple-run-runname run) " -testpatt %")))) )) actions)))) sorted))) ;; (print "Sorted: " (map simple-run-event_time sorted)) ;; (print "Remove: " (map simple-run-event_time to-remove)))) (hash-table-keys runs-ht)) runs-ht)) |
︙ | ︙ | |||
2135 2136 2137 2138 2139 2140 2141 | (> (string-length dira)(string-length dirb)) #f)))))) (toplevel-retries (make-hash-table)) ;; try three times to loop through and remove top level tests (test-retry-time (make-hash-table)) (backgrounded-remove-status (make-hash-table)) (backgrounded-remove-last-visit (make-hash-table)) (backgrounded-remove-result (make-hash-table)) | | > > | 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 | (> (string-length dira)(string-length dirb)) #f)))))) (toplevel-retries (make-hash-table)) ;; try three times to loop through and remove top level tests (test-retry-time (make-hash-table)) (backgrounded-remove-status (make-hash-table)) (backgrounded-remove-last-visit (make-hash-table)) (backgrounded-remove-result (make-hash-table)) (allow-run-time (string->number (or (args:get-arg "-kill-wait") "10")))) ;; seconds to allow for killing tests before just brutally killing 'em (let loop ((test (car sorted-tests)) (tal (cdr sorted-tests))) (let* ((test-id (db:test-get-id test)) (new-test-dat (rmt:get-test-info-by-id run-id test-id))) (if (not new-test-dat) (begin (debug:print-error 0 *default-log-port* "We have a test-id of " test-id " but no record was found. NOTE: No locking of records is done between processes, do not simultaneously remove the same run from two processes!") (if (not (null? tal)) (loop (car tal)(cdr tal)))) (let* ((item-path (db:test-get-item-path new-test-dat)) (test-name (db:test-get-testname new-test-dat)) (run-dir ;;(filedb:get-path *fdb* ;; (rmt:sdb-qry 'getid (db:test-get-rundir new-test-dat)) ;; ) ;; run dir is from the link tree (has-subrun (and (subrun:subrun-test-initialized? run-dir) (not (subrun:subrun-removed? run-dir)))) (test-state (db:test-get-state new-test-dat)) (test-status (db:test-get-status new-test-dat)) (test-fulln (db:test-get-fullname new-test-dat)) (uname (db:test-get-uname new-test-dat)) (toplevel-with-children (and (db:test-get-is-toplevel test) (> (rmt:test-toplevel-num-items run-id test-name) 0)))) (case action ((remove-runs) ;; if the test is a toplevel-with-children issue an error and do not remove (cond (toplevel-with-children (debug:print 0 *default-log-port* "WARNING: skipping removal of " test-fulln " with run-id " run-id " as it has sub tests") (hash-table-set! toplevel-retries test-fulln (+ (hash-table-ref/default toplevel-retries test-fulln 0) 1)) |
︙ | ︙ | |||
2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 | (loop (car tal)(cdr tal))) ) ((member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) (debug:print 1 *default-log-port* "INFO: issuing killreq to test "test-fulln) (mt:test-set-state-status-by-id run-id (db:test-get-id test) "KILLREQ" "n/a" #f) (if (not (null? tal)) (loop (car tal)(cdr tal)))) (else (if (not (null? tal)) (loop (car tal)(cdr tal))) ))) ((set-state-status) (let* ((new-state (car state-status)) (new-status (cadr state-status)) | > > > > > > | 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 | (loop (car tal)(cdr tal))) ) ((member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) (debug:print 1 *default-log-port* "INFO: issuing killreq to test "test-fulln) (mt:test-set-state-status-by-id run-id (db:test-get-id test) "KILLREQ" "n/a" #f) (if (not (null? tal)) (loop (car tal)(cdr tal)))) ((and (member test-status '("PREQ_FAIL" "PREQ_DISCARDED" "BLOCKED" "ZERO_ITEMS" "KEEP_TRYING" "TEN_STRIKES" "TIMED_OUT"))) (rmt:set-state-status-and-roll-up-items run-id (db:test-get-id test) 'foo "NOT_STARTED" "n/a" (conc "kill-run moved from "test-state":"test-status" to NOT_STARTED:n/a")) ;;(mt:test-set-state-status-by-id run-id (db:test-get-id test) "NOT_STARTED" "n/a" (conc "kill-run moved from "test-state":"test-status" to NOT_STARTED:n/a")) (if (not (null? tal)) (loop (car tal)(cdr tal))) ) (else (if (not (null? tal)) (loop (car tal)(cdr tal))) ))) ((set-state-status) (let* ((new-state (car state-status)) (new-status (cadr state-status)) |
︙ | ︙ |
Modified server.scm from [b72b3224b4] to [8ce184eea5].
︙ | ︙ | |||
526 527 528 529 530 531 532 533 534 535 536 537 538 539 | (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd))) (cond ((eq? 0 res) (delete-file* (conc mtdbfile ".backup")) (system (conc "/bin/mv " staging-file " " mtdbfile)) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "(/ (- (current-milliseconds) start-time) 1000)" sec") #t) (else (system (conc "/bin/cp "sync-log" "sync-log".fail")) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") (if (file-exists? (conc mtdbfile ".backup")) | > > | 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 | (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd))) (cond ((eq? 0 res) (delete-file* (conc mtdbfile ".backup")) (if (eq? 0 (file-size sync-log)) (delete-file sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "(/ (- (current-milliseconds) start-time) 1000)" sec") #t) (else (system (conc "/bin/cp "sync-log" "sync-log".fail")) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") (if (file-exists? (conc mtdbfile ".backup")) |
︙ | ︙ |
Modified tasks.scm from [9a414ee32a] to [6e633165ba].
︙ | ︙ | |||
991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 | ;; get runs changed since last sync ;; (define (tasks:sync-test-data dbh cached-info area-info) ;; (let* (( (define (tasks:sync-to-postgres configdat dest) (let* ((dbh (pgdb:open configdat dbname: dest)) (area-info (pgdb:get-area-by-path dbh *toppath*)) (cached-info (make-hash-table)) (start (current-seconds)) (test-patt (if (args:get-arg "-testpatt") (args:get-arg "-testpatt") "%")) | > | 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 | ;; get runs changed since last sync ;; (define (tasks:sync-test-data dbh cached-info area-info) ;; (let* (( (define (tasks:sync-to-postgres configdat dest) (print "In sync") (let* ((dbh (pgdb:open configdat dbname: dest)) (area-info (pgdb:get-area-by-path dbh *toppath*)) (cached-info (make-hash-table)) (start (current-seconds)) (test-patt (if (args:get-arg "-testpatt") (args:get-arg "-testpatt") "%")) |
︙ | ︙ | |||
1012 1013 1014 1015 1016 1017 1018 | (begin (print "Error: Provide runname") (exit 1))) (if (and (not target) run-name) (begin (print "Error: Provide target") (exit 1))) | | > > > | 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 | (begin (print "Error: Provide runname") (exit 1))) (if (and (not target) run-name) (begin (print "Error: Provide target") (exit 1))) (print "123") (exit 1) (for-each (lambda (dtype) (hash-table-set! cached-info dtype (make-hash-table))) '(runs targets tests steps data)) (hash-table-set! cached-info 'start start) ;; when done we'll set sync times to this (print "here") (if area-info (let* ((last-sync-time (vector-ref area-info 3)) (smallest-last-update-time (make-hash-table)) (changed (if (and target run-name) (rmt:get-run-record-ids target run-name (rmt:get-keys) test-patt) (rmt:get-changed-record-ids last-sync-time))) (run-ids (alist-ref 'runs changed)) (test-ids (alist-ref 'tests changed)) (test-step-ids (alist-ref 'test_steps changed)) (test-data-ids (alist-ref 'test_data changed)) (run-stat-ids (alist-ref 'run_stats changed)) (area-tag (if (args:get-arg "-area-tag") (args:get-arg "-area-tag") (if (args:get-arg "-area") (args:get-arg "-area") "")))) (print "here2") (if (and (equal? area-tag "") (not (pgdb:is-area-taged dbh (vector-ref area-info 0)))) (set! area-tag *default-area-tag*)) (if (not (equal? area-tag "")) (task:add-area-tag dbh area-info area-tag)) (if (or (not (null? test-ids)) (not (null? run-ids))) (begin (debug:print-info 0 *default-log-port* "syncing runs") |
︙ | ︙ |
Added telemetry-daemon version [a2b1d26b8f].
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 | #!/usr/bin/env python # -*- Mode: Python; -*- ## Tiny Syslog Server in Python. ## ## This is a tiny syslog server that is able to receive UDP based syslog ## entries on a specified port and save them to a file. ## That's it... it does nothing else... import os import sys, os, time, atexit from signal import SIGTERM import logging import logging.handlers import SocketServer import datetime from subprocess import call import argparse import os import socket ## code to determine this host's IP on non-loopback interface if os.name != "nt": import fcntl import struct def get_interface_ip(ifname): s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) return socket.inet_ntoa(fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s', ifname[:15]))[20:24]) def get_lan_ip(): ip = socket.gethostbyname(socket.gethostname()) if ip.startswith("127.") and os.name != "nt": interfaces = [ "eth0", "eth1", "eth2", "wlan0", "wlan1", "wifi0", "ath0", "ath1", "ppp0", ] for ifname in interfaces: try: ip = get_interface_ip(ifname) break except IOError: pass return ip class Daemon(object): """ A generic daemon class. Usage: subclass the Daemon class and override the run() method """ def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'): self.stdin = stdin self.stdout = stdout self.stderr = stderr self.pidfile = pidfile def daemonize(self): """ do the UNIX double-fork magic, see Stevens' "Advanced Programming in the UNIX Environment" for details (ISBN 0201563177) http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16 """ try: pid = os.fork() if pid > 0: # exit first parent sys.exit(0) except OSError, e: sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror)) sys.exit(1) # decouple from parent environment os.chdir("/") os.setsid() os.umask(0) # do second fork try: pid = os.fork() if pid > 0: # exit from second parent sys.exit(0) except OSError, e: sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror)) sys.exit(1) # redirect standard file descriptors sys.stdout.flush() sys.stderr.flush() si = file(self.stdin, 'r') so = file(self.stdout, 'a+') se = file(self.stderr, 'a+', 0) os.dup2(si.fileno(), sys.stdin.fileno()) os.dup2(so.fileno(), sys.stdout.fileno()) os.dup2(se.fileno(), sys.stderr.fileno()) # write pidfile atexit.register(self.delpid) pid = str(os.getpid()) file(self.pidfile,'w+').write("%s\n" % pid) def delpid(self): os.remove(self.pidfile) def start(self): """ Start the daemon """ # Check for a pidfile to see if the daemon already runs try: pf = file(self.pidfile,'r') pid = int(pf.read().strip()) pf.close() except IOError: pid = None if pid: message = "pidfile %s already exist. Daemon already running?\n" sys.stderr.write(message % self.pidfile) sys.exit(1) # Start the daemon self.daemonize() self.run() def stop(self): """ Stop the daemon """ # Get the pid from the pidfile try: pf = file(self.pidfile,'r') pid = int(pf.read().strip()) pf.close() except IOError: pid = None if not pid: message = "pidfile %s does not exist. Daemon not running?\n" sys.stderr.write(message % self.pidfile) return # not an error in a restart # Try killing the daemon process try: while 1: os.kill(pid, SIGTERM) time.sleep(0.1) except OSError, err: err = str(err) if err.find("No such process") > 0: if os.path.exists(self.pidfile): os.remove(self.pidfile) else: print str(err) sys.exit(1) def restart(self): """ Restart the daemon """ self.stop() self.start() def run(self): """ You should override this method when you subclass Daemon. It will be called after the process has been daemonized by start() or restart(). """ # setup logging module so that the log can be moved aside and will reopen for append def log_setup(logfile): log_handler = logging.handlers.WatchedFileHandler(logfile) formatter = logging.Formatter( '%(message)s','') log_handler.setFormatter(formatter) logger = logging.getLogger() logger.addHandler(log_handler) logger.setLevel(logging.INFO) class SyslogUDPHandler(SocketServer.BaseRequestHandler): def handle(self): data = bytes.decode(self.request[0].strip()) socket = self.request[1] print( "%s : " % self.client_address[0], str(data)) timestamp = datetime.datetime.now().isoformat() logline = timestamp + ":"+self.client_address[0] + ":" + str(data) logging.info(str(logline)) class TelemetryLogDaemon(Daemon): def __init__(self, pidfile, logfile, server_ip, server_port): self.logfile = logfile self.server_ip = server_ip self.server_port = server_port super(TelemetryLogDaemon, self).__init__(pidfile) def run(self): log_setup(self.logfile) server = SocketServer.UDPServer((self.server_ip,int(self.server_port)), SyslogUDPHandler) server.serve_forever(poll_interval=0.5) def main(): default_log_file = os.environ['PWD'] + "/telemetry.log" parser = argparse.ArgumentParser(description = 'telemetry-daemon') actions="start,restart,stop,nodaemon".split(",") parser.add_argument("-a", "--action", required=True, choices=actions, help="manage daemon: start stop or restart") parser.add_argument("-p", "--server-port", default="5929", help="specify alternate udp port number, default is 5929") parser.add_argument("-i", "--server-ip", default=get_lan_ip(), help="specify IP if heuristics to get local host lan ip fails") parser.add_argument("-l", "--log-file", default=default_log_file, help="specify log file to write") parser.add_argument("-z", "--pid-file", default=default_log_file + ".pidfile", help="specify pidfile") opts = parser.parse_args() tld = TelemetryLogDaemon(opts.pid_file, opts.log_file, opts.server_ip, opts.server_port) if opts.action == "start": print "Info: Starting server" print """Example addition to megatest.config to enable telemetry: [telemetry] host %s port %s want-events ALL """ % (opts.server_ip, opts.server_port) tld.start() elif opts.action == "stop": tld.stop() elif opts.action == "restart": print "Info: Restarting server" print """Example addition to megatest.config to enable telemetry: [telemetry] host %s port %s want-events ALL """ % (opts.server_ip, opts.server_port) tld.restart() elif opts.action == "nodaemon": log_setup(opts.log_file) server = SocketServer.UDPServer((opts.server_ip,int(opts.server_port)), SyslogUDPHandler) server.serve_forever(poll_interval=0.5) if __name__ == '__main__': main() |
Modified tests.scm from [b8a74e9d3b] to [c7490d0df8].
︙ | ︙ | |||
1515 1516 1517 1518 1519 1520 1521 | #f)) ;; if .testconfig exists in test directory read and return it ;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata" ;; else read the testconfig file ;; if have path to test directory save the config as .testconfig and return it ;; | | | 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 | #f)) ;; if .testconfig exists in test directory read and return it ;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata" ;; else read the testconfig file ;; if have path to test directory save the config as .testconfig and return it ;; (define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t)(wait-a-minute #f)) (let* ((use-cache (common:use-cache?)) (cache-path (tests:get-test-path-from-environment)) (cache-file (and cache-path (conc cache-path "/.testconfig"))) (cache-exists (and cache-file (not force-create) ;; if force-create then pretend there is no cache to read (common:file-exists? cache-file))) (cached-dat (if (and (not force-create) |
︙ | ︙ | |||
1543 1544 1545 1546 1547 1548 1549 1550 1551 | (if (and dat ;; have a locally cached version (hash-table-ref/default dat "have fulldata" #f)) ;; marked as good data? dat ;; no cached data available (let* ((treg (or test-registry (tests:get-all))) (test-path (or (hash-table-ref/default treg test-name #f) (conc *toppath* "/tests/" test-name))) (test-configf (conc test-path "/testconfig")) | > > > > > > > > > > > | > > > > > > > > > > > > > | | 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 | (if (and dat ;; have a locally cached version (hash-table-ref/default dat "have fulldata" #f)) ;; marked as good data? dat ;; no cached data available (let* ((treg (or test-registry (tests:get-all))) (test-path (or (hash-table-ref/default treg test-name #f) (let* ((local-tcdir (conc (getenv "MT_LINKTREE") "/" (getenv "MT_TARGET") "/" (getenv "MT_RUNNAME") "/" test-name "/" item-path)) (local-tcfg (conc local-tcdir "/testconfig"))) (if (common:file-exists? local-tcfg) local-tcdir #f)) (conc *toppath* "/tests/" test-name))) (test-configf (conc test-path "/testconfig")) (testexists (let loopa ((tries-left 30)) (cond ( (and (common:file-exists? test-configf)(file-read-access? test-configf)) #t) ( (common:file-exists? test-configf) (debug:print 0 *default-log-port* "WARNING: Cannot read testconfig file: "test-configf) #f) ( (and wait-a-minute (> tries-left 0)) (thread-sleep! 10) (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf" will retry in 10 seconds. Tries left: "tries-left) ;; BB: this fires (loopa (sub1 tries-left))) (else (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf) ;; BB: this fires #f)))) (tcfg (if testexists (read-config test-configf #f system-allowed environ-patt: (if system-allowed "pre-launch-env-vars" #f)) #f))) (if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data (if tcfg (hash-table-set! *testconfigs* test-full-name tcfg)) (if (and testexists cache-file (file-write-access? cache-path) allow-write-cache) (let ((tpath (conc cache-path "/.testconfig"))) (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath) (if (and tcfg (not (common:in-running-test?))) (configf:write-alist tcfg tpath)))) tcfg)))))) ;; sort tests by priority and waiton ;; Move test specific stuff to a test unit FIXME one of these days (define (tests:sort-by-priority-and-waiton test-records) (if (eq? (hash-table-size test-records) 0) |
︙ | ︙ |