Changes In Branch v1.65-telemetry Through [074c8889ef] Excluding Merge-Ins
This is equivalent to a diff from 73cb4bf58e to 074c8889ef
2019-02-27
| ||
18:15 | fixed some unbalanced parens check-in: c13adaee88 user: bjbarcla tags: v1.65-telemetry | |
18:09 | wip check-in: 074c8889ef user: bjbarcla tags: v1.65-telemetry | |
17:17 | added (incomplete) docs/manual/devnotes.txt for telemetry usage check-in: ce9c3e41f2 user: bjbarcla tags: v1.65-telemetry | |
2019-02-16
| ||
13:58 | Merged in missing changes from intra-waiton check-in: 7b1e045169 user: matt tags: v1.65 | |
2019-02-14
| ||
15:04 | pulled in docs/manual changes check-in: 3797a8bab4 user: bjbarcla tags: v1.65-telemetry | |
15:02 | merged in trunk to get docs/manual updates check-in: 73cb4bf58e user: bjbarcla tags: v1.65 | |
2019-02-11
| ||
11:35 | merged brute force syncer check-in: 2aaccbd409 user: bjbarcla tags: v1.65, v1.6524 | |
2018-12-05
| ||
15:31 | updates to manual check-in: 1b4e30d106 user: mrwellan tags: trunk | |
Modified api.scm from [1541791de9] to [cf3fabb928].
︙ | |||
155 156 157 158 159 160 161 162 163 164 165 166 167 168 | 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | + + + + + | cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) (foo (begin (common:telemetry-log (conc "api-in:"(->string cmd)) payload: `((params . ,params))) #t)) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== |
︙ | |||
325 326 327 328 329 330 331 332 333 334 335 336 337 338 | 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 | + + + + + - - + + + + + + | ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))))) ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin (common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #t))) |
︙ |
Modified common.scm from [b6c40dc319] to [be82152a65].
︙ | |||
15 16 17 18 19 20 21 | 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | - + | ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;;====================================================================== (use srfi-1 data-structures posix regex-case (prefix base64 base64:) |
︙ | |||
79 80 81 82 83 84 85 86 87 88 89 90 91 92 | 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 | + | (define (get-file-descriptor-count #!key (pid (current-process-id ))) (list (length (glob (conc "/proc/" pid "/fd/*"))) (length (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*"))))) ) ) ;; GLOBALS ;; CONTEXTS (defstruct cxt (taskdb #f) (cmutex (make-mutex))) |
︙ | |||
885 886 887 888 889 890 891 892 893 894 895 896 897 898 | 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 | + | (debug:print-info 13 *default-log-port* "loading writable-watchdog.") (server:writable-watchdog dbstruct))) (debug:print-info 13 *default-log-port* "watchdog done.")) (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))) (define (std-exit-procedure) ;;(common:telemetry-log-close) (on-exit (lambda () 0)) ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) |
︙ | |||
3047 3048 3049 3050 3051 3052 3053 | 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | (if thread (handle-exceptions exn #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) (define *common:telemetry-log-state* 'startup) (define *common:telemetry-log-socket* #f) (define (common:telemetry-log-open) (if (eq? *common:telemetry-log-state* 'startup) (let* ((serverhost (configf:lookup *configdat* "telemetry" "host")) (serverport (configf:lookup-number *configdat* "telemetry" "port")) (user (or (get-environment-variable "USER") "unknown")) (host (or (get-environment-variable "HOST") "unknown"))) (set! *common:telemetry-log-state* (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "common-telemetry-log open udp port failure") 'broken) (if (and serverhost serverport user host) (let* ((s (udp-open-socket))) ;;(udp-bind! s #f 0) (udp-connect! s serverhost serverport) (set! *common:telemetry-log-socket* s) 'open) 'not-needed)))))) (define (common:telemetry-log event #!key (payload '())) (if (eq? *common:telemetry-log-state* 'startup) (common:telemetry-log-open)) (if (eq? 'open *common:telemetry-log-state*) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "common-telemetry-log comms failure ; disabled (no server?)") ;;(define *common:telemetry-log-state* 'broken-or-no-server-preclose) ;;(common:telemetry-log-close) (define *common:telemetry-log-state* 'broken-or-no-server) (set! *common:telemetry-log-socket* #f) ) (if (and *common:telemetry-log-socket* event) ;; TODO - filter on event against telemetry.want-events (let* ((user (or (get-environment-variable "USER") "unknown")) (host (or (get-environment-variable "HOST") "unknown")) (start (conc "[megatest "event"]")) (toppath (or *toppath* "/dev/null")) (payload-serialized (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () (pp payload)))))) (msg (conc user":"host":"start":"(current-process-id)":"(car (argv))":" toppath":"payload-serialized))) (udp-send *common:telemetry-log-socket* msg)))))) (define (common:telemetry-log-close) (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*) (handle-exceptions exn (begin (define *common:telemetry-log-state* 'closed-fail) (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure") ) (begin (define *common:telemetry-log-state* 'closed) (udp-close-socket *common:telemetry-log-socket*) (set! *common:telemetry-log-socket* #f))))) |
Modified configf.scm from [77100eae92] to [c596e07f23].
︙ | |||
772 773 774 775 776 777 778 | 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 | - - - - + + + + | (hash-table-set! ht (car section)(cdr section))) adat) ht)) ;; if (define (configf:read-alist fname) (handle-exceptions |
︙ |
Modified db.scm from [da7bac24de] to [cffb18caf7].
︙ | |||
404 405 406 407 408 409 410 411 412 413 414 415 416 417 | 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 | + + + | ;; (debug:print 0 *default-log-port* "ERROR: attempt to open database when not on homehost. Exiting. Homehost: " (common:get-homehost)) ;; (exit 1)))) ;; Open the classic megatest.db file (defaults to open in toppath) ;; ;; NOTE: returns a dbdat not a dbstruct! ;; ;;(define (db:reopen-megatest-db (define (db:open-megatest-db #!key (path #f)(name #f)) (let* ((dbdir (or path *toppath*)) (dbpath (conc dbdir "/" (or name "megatest.db"))) (dbexists (common:file-exists? dbpath)) (db (db:lock-create-open dbpath (lambda (db) (db:initialize-main-db db) |
︙ | |||
1640 1641 1642 1643 1644 1645 1646 | 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 | - - - - - + + + + + + + + + + + - - - + + + + + + + + + + + + + + + + + + + - + - - + + + + - + - + - - + + | ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) |
︙ | |||
3678 3679 3680 3681 3682 3683 3684 | 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 | - + | (db:test-get-testname testdat) test-name)) (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) (tl-test-id (if tl-testdat (db:test-get-id tl-testdat) #f))) |
︙ |
Added docs/manual/devnotes.txt version [e6b6b73f5f].
|
Modified docs/manual/megatest_manual.html from [453b3aeb82] to [eadc4c2938].
︙ | |||
898 899 900 901 902 903 904 | 898 899 900 901 902 903 904 905 906 907 908 909 910 911 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | which can launch jobs on local and remote Linux hosts. Currently megatest uses the network filesystem to call home to your master sqlite3 database. Megatest has been used with the Intel Netbatch and lsf (also known as openlava) batch systems and it should be straightforward to use it with other similar systems.</p></div> </div> </div> |
︙ | |||
3031 3032 3033 3034 3035 3036 3037 | 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 | - + | </div> </div> </div> <div id="footnotes"><hr></div> <div id="footer"> <div id="footer-text"> Version 1.0<br> |
Modified launch.scm from [6dd1993f7c] to [510de63cde].
︙ | |||
375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 | 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 | + + - - - - + + + + + + + + + + + + + - - - - - - - - + + + + + + + + + + + + + + + + | (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds (get-df (current-directory)) disk-free)) (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) |
︙ | |||
440 441 442 443 444 445 446 | 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 | + - + - + | (handle-exceptions exn #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) pids) ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel? If not, should it? |
︙ | |||
571 572 573 574 575 576 577 | 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 | - + + - + + - - + | (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) (set! *time-to-exit* #t) |
︙ | |||
603 604 605 606 607 608 609 | 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 | + + - + + - + | (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) |
︙ | |||
1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 | 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 | + + + + + + + + + + + + + + + + + + + + + + | (list lnkpathf lnkpath )) (if (and test-src-path (> remtries 0)) (begin (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) (list #f #f))))) (define (launch:handle-zombie-tests run-id) (let* ((key (conc "zombiescan-runid-"run-id)) (now (current-seconds)) (threshold (- (current-seconds) (* 2 (or (configf:lookup-number *configdat* "setup" "deadtime") 120)))) (val (rmt:get-var key)) (do-scan? (cond ((not val) #t) ((< val threshold) #t) (else #f)))) (when do-scan? (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch |
︙ |
Modified rmt.scm from [0a05f35135] to [bc89e0120c].
︙ | |||
53 54 55 56 57 58 59 60 61 62 63 64 65 66 | 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | + + + + + | (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected (common:telemetry-log (conc "rmt:"(->string cmd)) payload: `((rid . ,rid) (params . ,params))) ;;DOT digraph megatest_state_status { ;;DOT ranksep=0; ;;DOT // rankdir=LR; ;;DOT node [shape="box"]; ;;DOT "rmt:send-receive" -> MUTEXLOCK; ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } ;; do all the prep locked under the rmt-mutex |
︙ |
Modified runs.scm from [4560e73753] to [8403ca3ea3].
︙ | |||
449 450 451 452 453 454 455 456 457 458 459 460 461 462 | 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 | + + + + + + + | (debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") (common:telemetry-log "run-tests" payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) ;; Now generate all the tests lists (set! all-tests-registry (tests:get-all)) ;; hash of testname => path-to-test (set! all-test-names (hash-table-keys all-tests-registry)) ;; filter first for allowed-tests (from -tagexpr) then for test-patts. (set! test-names (tests:filter-test-names (if allowed-tests (tests:filter-test-names all-test-names allowed-tests) |
︙ | |||
671 672 673 674 675 676 677 | 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 | - + | (begin (debug:print-info 0 *default-log-port* "Calling Post Hook") (debug:print-info 2 *default-log-port* " run-count " run-count) (runs:run-post-hook run-id)) (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count )) (rmt:tasks-set-state-given-param-key task-key "done") ;; (sqlite3:finalize! tasks-db) |
︙ | |||
2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 | 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 | + + | (loop (car tal)(cdr tal))) ) ((member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) (debug:print 1 *default-log-port* "INFO: issuing killreq to test "test-fulln) (mt:test-set-state-status-by-id run-id (db:test-get-id test) "KILLREQ" "n/a" #f) (if (not (null? tal)) (loop (car tal)(cdr tal)))) ; ((and (member test-state (list "COMPLETED" "NOT_STARTED")) (member test-status "PREQ_FAIL" "PREQ_DISCARDED" "BLOCKED" "ZERO_ITEMS" "KEEP_TRYING" "TEN_STRIKES" "TIMED_OUT")) ; (mt:test-set-state-status-by-id run-id (db:test-get-id test) "NOT_STARTED" "n/a" #f)) (else (if (not (null? tal)) (loop (car tal)(cdr tal))) ))) ((set-state-status) (let* ((new-state (car state-status)) (new-status (cadr state-status)) |
︙ |
Modified server.scm from [b72b3224b4] to [8ce184eea5].
︙ | |||
526 527 528 529 530 531 532 533 534 535 536 537 538 539 | 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 | + + | (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd))) (cond ((eq? 0 res) (delete-file* (conc mtdbfile ".backup")) (if (eq? 0 (file-size sync-log)) (delete-file sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "(/ (- (current-milliseconds) start-time) 1000)" sec") #t) (else (system (conc "/bin/cp "sync-log" "sync-log".fail")) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") (if (file-exists? (conc mtdbfile ".backup")) |
︙ |
Added telemetry-daemon version [a2b1d26b8f].