Megatest

Check-in [fe83540f6a]
Login
Overview
Comment:catch up with v1.65
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-end-of-run
Files: files | file ages | folders
SHA1: fe83540f6ad304de3f03fda268112557485781de
User & Date: pjhatwal on 2019-03-18 10:57:44
Other Links: branch diff | manifest | tags
Context
2019-03-22
14:27
merged v1.65-end-of-run check-in: 2be9e62191 user: pjhatwal tags: v1.65, v1.6527
2019-03-18
10:57
catch up with v1.65 Closed-Leaf check-in: fe83540f6a user: pjhatwal tags: v1.65-end-of-run
2019-03-14
10:39
Made changes to only create and process 4 packets per mtutil process run check-in: 4b37eea7d0 user: jmoon18 tags: v1.65
2019-03-11
17:56
end of run detection check-in: e719f22355 user: pjhatwal tags: v1.65-end-of-run
Changes

Modified api.scm from [4abe8743ea] to [1958ec5960].

47
48
49
50
51
52
53

54
55
56
57
58
59
60
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61







+







    get-testinfo-state-status
    test-get-top-process-pid
    test-get-paths-matching-keynames-target-new
    get-prereqs-not-met
    get-count-tests-running-for-run-id
    get-run-info
    get-run-status
    get-run-state
    get-run-stats
    get-run-times
    get-targets
    get-target
    ;; register-run
    get-tests-tags
    get-test-times
156
157
158
159
160
161
162





163
164
165
166
167
168
169
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175







+
+
+
+
+







				   cmd-in
				   (string->symbol cmd-in)))
            (params            (vector-ref dat 1))
            (start-t           (current-milliseconds))
            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
            (readonly-command  (member cmd api:read-only-queries))
            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
            (foo               (begin
                                 (common:telemetry-log (conc "api-in:"(->string cmd))
                                                       payload: `((params . ,params)))
                                 
                                 #t))
            (res    
             (if writecmd-in-readonly-mode
                 (conc "attempt to run write command "cmd" on a read-only database")
                 (case cmd
                   ;;===============================================
                   ;; READ/WRITE QUERIES
                   ;;===============================================
279
280
281
282
283
284
285

286

287
288
289
290
291
292
293
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301







+

+







                   ((synchash-get)                    (apply synchash:server-get dbstruct params))
                   ((get-raw-run-stats)               (apply db:get-raw-run-stats dbstruct params))
		   ((get-test-times)                  (apply db:get-test-times dbstruct params))

                   ;; RUNS
                   ((get-run-info)                 (apply db:get-run-info dbstruct params))
                   ((get-run-status)               (apply db:get-run-status dbstruct params))
                   ((get-run-state)                (apply db:get-run-state dbstruct params))
                   ((set-run-status)               (apply db:set-run-status dbstruct params))
                   ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
                   ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
                   ((get-test-id)                  (apply db:get-test-id dbstruct params))
                   ((get-tests-for-run-mindata)    (apply db:get-tests-for-run-mindata dbstruct params))
                   ((get-tests-for-runs-mindata)   (apply db:get-tests-for-runs-mindata dbstruct params))
                   ((get-runs)                     (apply db:get-runs dbstruct params))
                   ((simple-get-runs)              (apply db:simple-get-runs dbstruct params))
                   ((get-num-runs)                 (apply db:get-num-runs dbstruct params))
328
329
330
331
332
333
334

335
336
337
338
339
340
341




342
343






344
345
346
347
348
349
350
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354


355
356
357
358
359
360
361
362
363
364
365
366
367







+







+
+
+
+
-
-
+
+
+
+
+
+







                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
		   (else
		    (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
		    (conc "ERROR: BAD api call " cmd))))))

       
       ;; save all stats
       (let ((delta-t (- (current-milliseconds)
			 start-t)))
	 (hash-table-set! *db-api-call-time* cmd
			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
       (if writecmd-in-readonly-mode
           (begin
             (common:telemetry-log (conc "api-out:"(->string cmd))
                                   payload: `((params . ,params)
                                              (ok-res . #t)))
	   (vector #f res)
           (vector #t res)))))))
	     (vector #f res))
           (begin
             (common:telemetry-log (conc "api-out:"(->string cmd))
                                   payload: `((params . ,params)
                                              (ok-res . #f)))
             (vector #t res))))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop
;;

Modified common.scm from [b6c40dc319] to [c41ac723cd].

15
16
17
18
19
20
21
22

23
24
25
26
27
28
29
15
16
17
18
19
20
21

22
23
24
25
26
27
28
29







-
+







;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;;======================================================================

(use srfi-1 data-structures posix regex-case (prefix base64 base64:)
     format dot-locking csv-xml z3 ;; sql-de-lite
     format dot-locking csv-xml z3 udp ;; sql-de-lite
     hostinfo md5 message-digest typed-records directory-utils stack
     matchable regex posix (srfi 18) extras ;; tcp 
     (prefix nanomsg nmsg:)
     (prefix sqlite3 sqlite3:)
     pkts (prefix dbi dbi:)
     )

79
80
81
82
83
84
85

86
87
88
89
90
91
92
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93







+







(define (get-file-descriptor-count #!key  (pid (current-process-id )))
  (list
    (length (glob (conc "/proc/" pid "/fd/*")))
    (length  (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*")))))
  )
)

  

;; GLOBALS

;; CONTEXTS
(defstruct cxt
  (taskdb #f)
  (cmutex (make-mutex)))
885
886
887
888
889
890
891

892
893
894
895
896
897
898
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900







+







	      (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
	      (server:writable-watchdog dbstruct)))
	    (debug:print-info 13 *default-log-port* "watchdog done."))
	  (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))


(define (std-exit-procedure)
  ;;(common:telemetry-log-close)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)
			 #t))))
1589
1590
1591
1592
1593
1594
1595



1596
1597
1598
1599
1600
1601
1602







1603
1604
1605
1606
1607




1608
1609
1610
1611
1612
1613
1614
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600







1601
1602
1603
1604
1605
1606
1607
1608




1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619







+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+

-
-
-
-
+
+
+
+







     exn
     #f
     (with-output-to-file fullpath (lambda ()(pp dat))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (handle-exceptions
   exn
   '(99 99 99)
  (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
    (or (common:get-cached-info actual-hostname "cpu-load")
	(let ((result (if remote-host
			  (map (lambda (res)
				 (if (eof-object? res) 9e99 res))
			       (with-input-from-pipe 
				   (conc "ssh " remote-host " cat /proc/loadavg")
   (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
     (or (common:get-cached-info actual-hostname "cpu-load")
	 (let ((result (if remote-host
			   (map (lambda (res)
				  (if (eof-object? res) 9e99 res))
			        (with-input-from-pipe 
				 (conc "ssh " remote-host " cat /proc/loadavg")
				 (lambda ()(list (read)(read)(read)))))
			  (with-input-from-file "/proc/loadavg" 
			    (lambda ()(list (read)(read)(read)))))))
	  (common:write-cached-info actual-hostname "cpu-load" result)
	  result))))
			   (with-input-from-file "/proc/loadavg" 
			     (lambda ()(list (read)(read)(read)))))))
	   (common:write-cached-info actual-hostname "cpu-load" result)
	                     result)))))

;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res (common:get-normalized-cpu-load-raw remote-host))
3047
3048
3049
3050
3051
3052
3053
































































3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
       (if thread
           (handle-exceptions
           exn
           #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception
           (thread-join! thread))
           )))
   (hash-table-keys *common:thread-punchlist*)))

(define *common:telemetry-log-state* 'startup)
(define *common:telemetry-log-socket* #f)

(define (common:telemetry-log-open)
  (if (eq? *common:telemetry-log-state* 'startup)
      (let* ((serverhost (configf:lookup *configdat* "telemetry" "host"))
             (serverport (configf:lookup-number *configdat* "telemetry" "port"))
             (user (or (get-environment-variable "USER") "unknown"))
             (host (or (get-environment-variable "HOST") "unknown")))
        (set! *common:telemetry-log-state*
              (handle-exceptions
               exn
               (begin
                 (debug:print-info 0 *default-log-port* "common-telemetry-log open udp port failure")
                 'broken)
               (if (and serverhost serverport user host)
                   (let* ((s (udp-open-socket)))
                     ;;(udp-bind! s #f 0)
                     (udp-connect! s serverhost serverport)
                     (set! *common:telemetry-log-socket* s)
                     'open)
                   'not-needed))))))
  
(define (common:telemetry-log event #!key (payload '()))
  (if (eq? *common:telemetry-log-state* 'startup)
      (common:telemetry-log-open))

  (if (eq? 'open *common:telemetry-log-state*)
      (handle-exceptions
       exn
       (begin
         (debug:print-info 0 *default-log-port* "common-telemetry-log comms failure ; disabled (no server?)")
         ;;(define *common:telemetry-log-state* 'broken-or-no-server-preclose)
         ;;(common:telemetry-log-close)
         (define *common:telemetry-log-state* 'broken-or-no-server)
         (set! *common:telemetry-log-socket* #f)
         )
       (if (and *common:telemetry-log-socket* event) ;; TODO - filter on event against telemetry.want-events
           (let* ((user (or (get-environment-variable "USER") "unknown"))
                  (host (or (get-environment-variable "HOST") "unknown"))
                  (start (conc "[megatest "event"]"))
                  (toppath (or *toppath* "/dev/null"))
                  (payload-serialized
                   (base64:base64-encode
                    (z3:encode-buffer
                     (with-output-to-string (lambda () (pp payload))))))
                  (msg     (conc user":"host":"start":"(current-process-id)":"(car (argv))":"
                                 toppath":"payload-serialized)))
             (udp-send *common:telemetry-log-socket* msg))))))
  
(define (common:telemetry-log-close)
  (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*)
    (handle-exceptions
     exn
     (begin
       (define *common:telemetry-log-state* 'closed-fail)
       (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure")
       )
     (begin
       (define *common:telemetry-log-state* 'closed)
       (udp-close-socket *common:telemetry-log-socket*)
       (set! *common:telemetry-log-socket* #f)))))

Modified configf.scm from [77100eae92] to [c596e07f23].

772
773
774
775
776
777
778
779
780
781
782




783
784
785
786
787
788
789
772
773
774
775
776
777
778




779
780
781
782
783
784
785
786
787
788
789







-
-
-
-
+
+
+
+







       (hash-table-set! ht (car section)(cdr section)))
     adat)
    ht))

;; if 
(define (configf:read-alist fname)
  (handle-exceptions
      exn
      #f
    (configf:alist->config
     (with-input-from-file fname read))))
   exn
   #f
   (configf:alist->config
    (with-input-from-file fname read))))

(define (configf:write-alist cdat fname)
  (if (not (common:faux-lock fname))
      (debug:print 0 *default-log-port* "INFO: Could not get lock on " fname))
  (let* ((dat  (configf:config->alist cdat))
         (res
          (begin

Modified db.scm from [6a0d8206a6] to [73e0450409].

404
405
406
407
408
409
410



411
412
413
414
415
416
417
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420







+
+
+







   ;;  (debug:print 0 *default-log-port* "ERROR: attempt to open database when not on homehost. Exiting. Homehost: " (common:get-homehost))
   ;;  (exit 1))))

;; Open the classic megatest.db file (defaults to open in toppath)
;;
;;   NOTE: returns a dbdat not a dbstruct!
;;

;;(define (db:reopen-megatest-db

(define (db:open-megatest-db #!key (path #f)(name #f))
  (let* ((dbdir        (or path *toppath*))
         (dbpath       (conc  dbdir "/" (or name "megatest.db")))
	 (dbexists     (common:file-exists? dbpath))
	 (db           (db:lock-create-open dbpath
					    (lambda (db)
                                              (db:initialize-main-db db)
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651











1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665

















1666
1667
1668
1669
1670
1671


1672

1673
1674
1675


1676
1677
1678
1679
1680
1681
1682
1683
1684
1685


1686

1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707

1708
1709
1710
1711


1712
1713
1714
1715
1716
1717
1718
1643
1644
1645
1646
1647
1648
1649





1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663


1664
1665
1666
1667
1668
1669
1670
1671

1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696

1697
1698


1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712

1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733

1734
1735
1736


1737
1738
1739
1740
1741
1742
1743
1744
1745







-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+



-
-








-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+






+
+
-
+

-
-
+
+










+
+
-
+




















-
+


-
-
+
+







;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));

(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) ;; FIXME suspect test run time & deadtime are not well matched; resulting in COMPLETED/DEAD status of an a-ok running test
	 (deadtime     (if (and deadtime-str
				(string->number deadtime-str))
			   (string->number deadtime-str)
			   7200))) ;; two hours
         (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
         (server-start-allowance 200)
         (server-overloaded-budget 200)
         (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
         (launch-monitor-on-time-budget 30)
         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
         (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
         )
    (db:with-db 
     dbstruct #f #f
     (lambda (db)
       (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
       
       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
       ;;                     (db:test-get-run_duration testdat)))
       ;;                    600) 
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path)
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
                (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
        db
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
        run-id running-deadtime)

       
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)
              (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
        db
        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');"
        run-id deadtime)
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
        run-id remotehoststart-deadtime)

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row
        (lambda (test-id run-dir uname testname item-path)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked")
              (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
                (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
        db
        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
        run-id)
       
       (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")

       ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
       ;;
       ;; (db:delay-if-busy dbdat)
       (let* (;; (min-incompleted (filter (lambda (x)
              ;;      		      (let* ((testpath (cadr x))
              ;;      			     (tdatpath (conc testpath "/testdat.db"))
              ;;      			     (dbexists (common:file-exists? tdatpath)))
              ;;      			(or (not dbexists) ;; if no file then something wrong - mark as incomplete
              ;;      			    (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
              ;;      		    incompleted))
              (min-incompleted-ids (map car incompleted)) ;; do 'em all
              (all-ids             (append min-incompleted-ids (map car oldlaunched))))
         (if (> (length all-ids) 0)
             (begin
               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE")
               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")
               (for-each
                (lambda (test-id)
                  (db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test failed to complete")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS.  ref ticket 220546828

                  (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))
                  ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS.  ref ticket 220546828
                all-ids))))))))

;; ALL REPLACED BY THE BLOCK ABOVE
;;
;; 	    (sqlite3:execute 
;; 	     db
;; 	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" 
2623
2624
2625
2626
2627
2628
2629














2630
2631
2632
2633
2634
2635
2636
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677







+
+
+
+
+
+
+
+
+
+
+
+
+
+







       (sqlite3:for-each-row 
	(lambda (status)
	  (set! res status))
	db
	"SELECT status FROM runs WHERE id=?;" 
	run-id)
       res))))

(define (db:get-run-state dbstruct run-id)
  (let ((res "n/a"))
    (db:with-db
     dbstruct #f #f
     (lambda (db)
       (sqlite3:for-each-row 
	(lambda (status)
	  (set! res status))
	db
	"SELECT state FROM runs WHERE id=?;" 
	run-id)
       res))))


;;======================================================================
;; K E Y S
;;======================================================================

;; get key val pairs for a given run-id
;; ( (FIELDNAME1 keyval1) (FIELDNAME2 keyval2) ... )
3006
3007
3008
3009
3010
3011
3012
3013

3014
3015
3016
3017
3018
3019
3020
3047
3048
3049
3050
3051
3052
3053

3054
3055
3056
3057
3058
3059
3060
3061







-
+








(define (db:get-not-completed-cnt dbstruct run-id)
(db:with-db
   dbstruct
   run-id
   #f
   (lambda (db)
      (print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id)  
      ;(print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id)  
     (sqlite3:first-result
      db
      "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;" run-id))))

(define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup)
  (if (not jobgroup)
      0 ;; 
3697
3698
3699
3700
3701
3702
3703
3704

3705
3706
3707
3708
3709
3710
3711
3738
3739
3740
3741
3742
3743
3744

3745
3746
3747
3748
3749
3750
3751
3752







-
+







			   (db:test-get-testname testdat)
			   test-name))
	 (item-path    (db:test-get-item-path testdat))
         (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
         (tl-test-id   (if tl-testdat
			   (db:test-get-id tl-testdat)
			   #f)))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) 
	(db:general-call dbstruct 'set-test-start-time (list test-id)))
    (mutex-lock! *db-transaction-mutex*)
    (db:with-db
     dbstruct #f #f
     (lambda (db)
       (let ((tr-res
              (sqlite3:with-transaction
3785
3786
3787
3788
3789
3790
3791
3792

3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806


3807
3808
3809
3810
3811
3812
3813
3826
3827
3828
3829
3830
3831
3832

3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845


3846
3847
3848
3849
3850
3851
3852
3853
3854







-
+












-
-
+
+







                                         "\n--> newstate              "newstate
                                         "\n--> newstatus            "newstatus
                                         "\n\n")

                        ;; NB// Pass the db so it is part of the transaction
         (list newstate newstatus)))

(define (db:set-state-status-and-roll-up-run dbstruct run-id)
(define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status)
    (mutex-lock! *db-transaction-mutex*)
    (db:with-db
     dbstruct #f #f
     (lambda (db)
       (let ((tr-res
              (sqlite3:with-transaction
               db
               (lambda ()
                   (let* ((state-status-counts  (db:get-all-state-status-counts-for-run dbstruct run-id))
													(state-stauses (db:roll-up-rules state-status-counts #f #f ))
                          (newstate (car state-stauses))
                          (newstatus (cadr state-stauses))) 
                    
                   (db:set-run-state-status dbstruct run-id newstate newstatus ))))))
                    (if (or (not (eq? newstate curr-state)) (not (eq?  newstatus curr-status)))
                   (db:set-run-state-status dbstruct run-id newstate newstatus )))))))
         (mutex-unlock! *db-transaction-mutex*)
         tr-res))))


(define (db:get-all-state-status-counts-for-run dbstruct run-id)
 (let* ((test-count-recs (db:with-db
                                  dbstruct #f #f

Added docs/manual/devnotes.txt version [e6b6b73f5f].






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Developer Notes
---------------

Collected here are some topics that may interest the megatest developer.

telemetry
~~~~~~~~~

A new feature introduced in v1.6525 allows a centralized debug messaging system.  Debugging client-server issues
is greatly aided by a centralized, time coherent log of events across test execution, server, and runner.  This
is provided by the telemetry feature


source code call example


[source,ini]
 [telemetry]
 host <IP where telemetry daemon will run>
 port <UDP port to listen -- we recommend 5929>
 want-events <comma separated list of telemetry-log keywords

[source,ini]
 [telemetry]
 host 10.38.32.91
 port 5929
 want-events ALL

[source,scheme]
(common:telemetry-log <keyword string>


Usage:
1. Add telemetry section to megatest.config
2. Start telemetry daemon telemetry-daemon -a start -l /tmp/my-telemetry.log
3. Run megatest
4. examine / parse telemetry log

Modified docs/manual/megatest_manual.html from [453b3aeb82] to [eadc4c2938].

898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
898
899
900
901
902
903
904



























































905
906
907
908
909
910
911







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







which can launch jobs on local and remote Linux hosts. Currently
megatest uses the network filesystem to call home to your master
sqlite3 database. Megatest has been used with the Intel Netbatch and
lsf (also known as openlava) batch systems and it should be
straightforward to use it with other similar systems.</p></div>
</div>
</div>
</div>
<div class="sect1">
<h2 id="_overview">Overview</h2>
<div class="sectionbody">
<div class="sect2">
<h3 id="_stand_alone_megatest_area">Stand-alone Megatest Area</h3>
<div class="paragraph"><p>A single, stand-alone, Megatest based testsuite or "area" is
sufficient for most validation, automation and build problems.</p></div>
<div class="imageblock">
<div class="content">
<img src="megatest-stand-alone-area.png" alt="Static">
</div>
</div>
<div class="paragraph"><p>Megatest is designed as a distributed or decoupled system. This means
you can run the areas stand-alone with no additional
infrastructure. I.e. there are no databases, web servers or other
centralized resources needed. However as your needs grow you can
integrate multiple areas into a bigger system.</p></div>
<div class="sect3">
<h4 id="_component_descriptions">Component Descriptions</h4>
<div class="olist arabic"><ol class="arabic">
<li>
<p>
Multi-area dashboard and xterm. A gui (the dashboard) is usually the
  best option for controlling and launching runs but all operations
  can also be done from the commandline. Note: The not yet released
  multi-area dashboard replaces the old dashboard for browsing and
  controlling runs but for managing a single area the old dashboard
  works very well.
</p>
</li>
<li>
<p>
Area/testsuite. This is your testsuite or automation definition and
  consists of the information in megatest.config, runconfigs.config
  and your testconfigs along with any custom scripting that can&#8217;t be
  done with the native Megatest features.
</p>
</li>
<li>
<p>
If your testsuite or build automation is too large to run on a
  single instance you can distribute your jobs into a compute server
  pool. The only current requirements are password-less ssh access and
  a network filesystem.
</p>
</li>
</ol></div>
</div>
</div>
<div class="sect2">
<h3 id="_full_system_architecture">Full System Architecture</h3>
<div class="imageblock">
<div class="content">
<img src="megatest-system-architecture.png" alt="Static">
</div>
</div>
</div>
</div>
</div>
<div class="sect1">
<h2 id="_installation">Installation</h2>
<div class="sectionbody">
<div class="sect2">
<h3 id="_dependencies">Dependencies</h3>
<div class="paragraph"><p>Chicken scheme and a number of "eggs" are required for building
3031
3032
3033
3034
3035
3036
3037
3038

3039
3040
3041
3042
2972
2973
2974
2975
2976
2977
2978

2979
2980
2981
2982
2983







-
+




</div>
</div>
</div>
<div id="footnotes"><hr></div>
<div id="footer">
<div id="footer-text">
Version 1.0<br>
Last updated 2018-11-29 09:32:52 PST
Last updated 2019-02-19 11:13:54 PST
</div>
</div>
</body>
</html>

Modified docs/megatest-training.odp from [ba7ab2ab9e] to [f923ec026f].

cannot compute difference between binary files

Modified launch.scm from [b89d82dc9d] to [bd83b25556].

375
376
377
378
379
380
381

382
383
384
385

386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403













404
405
406
407
408
409
410
411

















412
413

414
415
416
417
418
419
420
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401




402
403
404
405
406
407
408
409
410
411
412
413
414








415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441







+




+














-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


+







			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
                                              (get-df (current-directory))
                                              disk-free))
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time)))
        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
	(set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
			    (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-info-by-id run-id test-id))
             (state       (db:test-get-state test-info))
             (status      (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
						(time-exceeded (> run-seconds runtlim)))
					   (if time-exceeded
					       (begin
						 (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
						 #t)
					       #f)))))
        (if do-sync
            (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
	      ;;       between tries?
	      (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0))
		     (pid2 (rmt:test-get-top-process-pid run-id test-id))
		     (pids (delete-duplicates (filter number? (list pid1 pid2)))))
440
441
442
443
444
445
446

447

448
449
450

451
452
453


454
455
456
457
458
459
460
461
462
463
464
465
466
467
468

469
470
471

472
473
474

475
476
477
478
479
480
481
482
483







+
-
+


-
+


-
+
+







				 (handle-exceptions
				  exn
				  #f
				  (process-signal pid-num signal/kill)))
			       (process:get-sub-pids pid))))
		       ;;    (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive"))))
		       pids)
                      ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel?  If not, should it?
		      (tests:test-set-status! run-id test-id "KILLED"  "KILLED" (args:get-arg "-m") #f))
		      (tests:test-set-status! run-id test-id "KILLED"  "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt
		    (begin
		      (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2)
		      (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (args:get-arg "-m") #f)
		      (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt
		      )))
	      (mutex-unlock! m)
	      ;; no point in sticking around. Exit now.
	      ;; no point in sticking around. Exit now. But run end of run before exiting?
        (launch:end-of-run-check run-id)
	      (exit)))
	(if (hash-table-ref/default misc-flags 'keep-going #f)
	    (begin
	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
571
572
573
574
575
576
577
578

579

580


581
582
583
584
585

586
587
588
589
590
591
592
594
595
596
597
598
599
600

601
602
603

604
605
606

607
608

609
610
611
612
613
614
615
616







-
+

+
-
+
+

-


-
+







          
	  (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path
	  (let ((sighand (lambda (signum)
			   ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
			   (if (eq? signum signal/stop)
			       (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting."))
			   (set! *time-to-exit* #t)
			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
			   (print "Received signal " signum ", cleaning up before exit (set this test to COMPLETED/ABORT) . Please wait...")
			   (let ((th1 (make-thread (lambda ()
                                                     (print "set test to COMPLETED/ABORT begin.")
						     (rmt:test-set-state-status run-id test-id "INCOMPLETE" "KILLED" #f)
						     (rmt:test-set-state-status run-id test-id "COMPLETED" "ABORT" "received kill signal")
                                                     (print "set test to COMPLETED/ABORT complete.")
						     (print "Killed by signal " signum ". Exiting")
						     (thread-sleep! 1)
						     (exit 1))))
				 (th2 (make-thread (lambda ()
						     (thread-sleep! 2)
						     (thread-sleep! 20)
						     (debug:print 0 *default-log-port* "Done")
						     (exit 4)))))
			     (thread-start! th2)
			     (thread-start! th1)
			     (thread-join! th2)))))
	    (set-signal-handler! signal/int sighand)
	    (set-signal-handler! signal/term sighand)
603
604
605
606
607
608
609


610

611
612
613
614
615
616
617

618
619
620
621
622
623
624

625
626
627
628
629
630
631
627
628
629
630
631
632
633
634
635

636
637
638
639
640
641
642
643
644
645
646
647
648
649
650

651
652
653
654
655
656
657
658







+
+
-
+







+






-
+







				  (exit))))
		 (test-pid  (db:test-get-process_id  test-info)))
	    (cond
             ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")

              (rmt:general-call 'set-test-start-time #f test-id)
	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
              (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      ) ;; prime it for running
	     ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART"))
	      (if (process:alive-on-host? test-host test-pid)
		  (debug:print-error 0 *default-log-port* "test state is "  (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed")
		  (exit)))
	     ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
              (rmt:general-call 'set-test-start-time #f test-id)
	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      )
	     (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))
	      (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed")
	      (exit))))
	  
	  (debug:print 2 *default-log-port* "Exectuing " test-name " (id: " test-id ") on " (get-host-name))
	  (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name))
	  (set! keys       (rmt:get-keys))
	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
	  ;; one of these is defunct/redundant ...
	  (if (not (launch:setup force-reread: #t))
	      (begin
		(debug:print 0 *default-log-port* "Failed to setup, exiting") 
		;; (sqlite3:finalize! db)
833
834
835
836
837
838
839
840
841







842
843
844
845
846

847
848
849
850
851
852
853
854
855
856
857
858




859
860
861

862
863

864
865
866
867
868
869
870
860
861
862
863
864
865
866


867
868
869
870
871
872
873
874
875



876
877
878
879
880
881
882
883
884
885



886
887
888
889
890
891

892
893

894
895
896
897
898
899
900
901







-
-
+
+
+
+
+
+
+


-
-
-
+









-
-
-
+
+
+
+


-
+

-
+







;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here

(define (launch:end-of-run-check run-id )
    (let*	((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
           (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
           (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))))
     ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing  
           (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
           (current-state (rmt:get-run-state run-id))
           (current-status (rmt:get-run-status run-id)))
     ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
     (debug:print 0 *default-log-port* "rollup run state/status")                      
     (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
 
     (cond 
       ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
           	(debug:print 0 *default-log-port* "rollup run state/status")                      
          	(rmt:set-state-status-and-roll-up-run  run-id)
          	(debug:print 0 *default-log-port* "look for  post hook.")
           	(debug:print 0 *default-log-port* "look for  post hook.")
          	(runs:run-post-hook run-id))
        ((> running-cnt 3) 
        	  (debug:print 0 *default-log-port* "There are " running-cnt " tests running." ))
        ((> running-cnt 0)
            (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" )
   				  (let ((kill-cnt (launch:kill-tests-if-dead run-id)))
           			(if (and all-test-launched  (equal? all-test-launched "yes") (eq? kill-cnt running-cnt))
           					(launch:end-of-run-check run-id)))) ;;todo
        (else  (debug:print 0 *default-log-port* "Should it get here?? May be everything is not launched yet. Running test cnt:" running-cnt " Not completed test cnt:" not-completed-cnt)
         (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f)))
       (let loop ((running-test (car running-tests))
			     (tal    (cdr running-tests)))
         (let* ((not-completed-tests (rmt:get-tests-for-run run-id "%" `("NOT_STARTED" "RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f)))
       (if (> (length not-completed-tests) 0) 
           (let loop ((running-test (car not-completed-tests))
			     (tal    (cdr not-completed-tests)))
		       (let* ((test-name (vector-ref running-test 2))
                 (item-path (vector-ref running-test 11)))
			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed")
              (if (not (null? tal))
				  (loop (car tal) (cdr tal))))))))))        
				  (loop (car tal) (cdr tal)))))))))))        
 
(define (launch:is-test-alive host pid)
(if (and host pid (not (equal? host "n/a")))
(let* ((cmd (conc "ssh " host " pstree -A " pid))
      (output (with-input-from-pipe cmd read-lines)))
  (print "cmd: " cmd "\n op: " output )
  (if(eq? (length output) 0)
1403
1404
1405
1406
1407
1408
1409






















1410
1411
1412
1413
1414
1415
1416
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







	  (list lnkpathf lnkpath ))
	(if (and test-src-path (> remtries 0))
	    (begin
	      (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries)
	      ;; 
	      (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1)))
	    (list #f #f)))))


(define (launch:handle-zombie-tests run-id)
  (let* ((key (conc "zombiescan-runid-"run-id))
         (now (current-seconds))
         (threshold (- (current-seconds)  (* 2 (or (configf:lookup-number *configdat* "setup" "deadtime") 120))))
         (val (rmt:get-var key))
         (do-scan?
          (cond
           ((not val)
            #t)
           ((< val threshold)
            #t)
           (else #f))))
    (when do-scan?
      (debug:print 1 *default-log-port* "INFO: search and mark zombie tests")
      (rmt:set-var key (current-seconds))
      (rmt:find-and-mark-incomplete run-id #f))))





;; 1. look though disks list for disk with most space
;; 2. create run dir on disk, path name is meaningful
;; 3. create link from run dir to megatest runs area 
;; 4. remotely run the test on allocated host
;;    - could be ssh to host from hosts table (update regularly with load)
;;    - could be netbatch

Modified megatest-version.scm from [f713858d59] to [cc55965633].

16
17
18
19
20
21
22
23

16
17
18
19
20
21
22

23







-
+
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.6524)
(define megatest-version 1.6525)

Modified megatest.scm from [bd9e9c775d] to [a7dc9766fe].

107
108
109
110
111
112
113
114
115
116




117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

135
136
137
138
139
140
141
107
108
109
110
111
112
113



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

135
136
137
138
139
140
141
142







-
-
-
+
+
+
+

















-
+







  -manual                 : show the Megatest user manual
  -version                : print megatest version (currently " megatest-version ")

Launching and managing runs
  -run                    : run all tests or as specified by -testpatt
  -remove-runs            : remove the data for a run, requires -runname and -testpatt
                            Optionally use :state and :status, use -keep-records to remove only
                            the run data.
  -kill-runs               : kill existing run(s) (all incomplete tests killed)
  -kill-rerun              : kill an existing run (all incomplete tests killed and run is rerun)
                            the run data. Use -kill-wait to override the 10 second
                            per test wait after kill delay. 
  -kill-runs              : kill existing run(s) (all incomplete tests killed)
  -kill-rerun             : kill an existing run (all incomplete tests killed and run is rerun)
  -set-state-status X,Y   : set state to X and status to Y, requires controls per -remove-runs
  -rerun FAIL,WARN...     : force re-run for tests with specificed status(s)
  -rerun-clean            : set all tests not COMPLETED+PASS,WARN,WAIVED to NOT_STARTED,n/a
                            and then run the specified testpatt with -preclean
  -rerun-all              : set all tests to NOT_STARTED,n/a and run with -preclean
  -lock                   : lock run specified by target and runname
  -unlock                 : unlock run specified by target and runname
  -set-run-status status  : sets status for run to status, requires -target and -runname
  -get-run-status         : gets status for run specified by target and runname
  -run-wait               : wait on run specified by target and runname
  -preclean               : remove the existing test directory before running the test
  -clean-cache            : remove the cached megatest.config and runconfigs.config files
  -no-cache               : do not use the cached config files. 
  -one-pass               : launch as many tests as you can but do not wait for more to be ready
  -remove-keep N          : remove all but N most recent runs per target; use '-actions, -age, -precmd'
  -age <age>              : 120d,3h,20m to apply only to runs older than the 
                                 specified age. NB// M=month, m=minute
  -actions <action>       : print,remove-runs,archive to specify action to take
  -actions <action>[,...] : actions to take; print,remove-runs,archive,kill-runs
  -precmd                 : insert a wrapper command in front of the commands run

Selectors (e.g. use for -runtests, -remove-runs, -set-state-status, -list-runs etc.)
  -target key1/key2/...   : run for key1, key2, etc.
  -reqtarg key1/key2/...  : run for key1, key2, etc. but key1/key2 must be in runconfigs
  -testpatt patt1/patt2,patt3/...  : % is wildcard
  -runname                : required, name for this particular test run
355
356
357
358
359
360
361

362
363
364
365
366
367
368
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370







+







			"-prefix-target"

                        "-src-target"
                        "-src-runname"
                        "-diff-email"
			"-sync-to"			
			"-pgsync"
			"-kill-wait"    ;; wait this long before removing test (default is 10 sec)
                        "-diff-html"
			)
 		 (list  "-h" "-help" "--help"
			"-manual"
			"-version"
		        "-force"
		        "-xterm"
378
379
380
381
382
383
384

385
386
387
388
389
390
391
392

393
394
395


396
397

398
399
400
401
402
403
404
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394

395
396


397
398
399
400
401
402
403
404
405
406
407
408







+







-
+

-
-
+
+


+







			"-rerun-clean"
			"-rerun-all"
			"-clean-cache"
			"-no-cache"
			"-cache-db"
                        "-use-db-cache"
                        "-prepend-contour"

			;; misc
			"-repl"
			"-lock"
			"-unlock"
			"-list-servers"
			"-kill-servers"
                        "-run-wait"      ;; wait on a run to complete (i.e. no RUNNING)
			"-one-pass"       ;;
			"-one-pass"      ;;
			"-local"         ;; run some commands using local db access
      "-generate-html"
      "-generate-html-structure" 
			"-generate-html"
			"-generate-html-structure" 
			"-list-run-time"
                        "-list-test-time"
			
			;; misc queries
			"-list-disks"
			"-list-targets"
			"-list-db-targets"
			"-show-runconfig"
			"-show-config"
			"-show-cmdinfo"

Modified mt-pg.sql from [1d0cdb4bfc] to [a25c58b313].

80
81
82
83
84
85
86
87

88
89
90
91
92
93
94
80
81
82
83
84
85
86

87
88
89
90
91
92
93
94







-
+







CREATE TABLE IF NOT EXISTS area_tags (
       id SERIAL PRIMARY KEY,
       tag_id   INTEGER DEFAULT 0,
       area_id  INTEGER DEFAULT 0,
       CONSTRAINT areatagconstraint UNIQUE (tag_id, area_id));

CREATE VIEW area_tag_view as 
select a.id as aid, t.id as tid,area_name,tag_name from areas as a inner join area_tags as at on at.area_id = a.id
select a.id as aid, t.id as tid,area_name,tag_name,area_path from areas as a inner join area_tags as at on at.area_id = a.id
inner join tags as t on t.id = at.tag_id  ;

INSERT INTO areas (id,area_name,area_path) VALUES (0,'local','.');

CREATE TABLE IF NOT EXISTS ttype (
       id SERIAL PRIMARY KEY,
       target_spec TEXT DEFAULT '');
307
308
309
310
311
312
313
314
315
316
317
318






319
320
321
322
323
324
325
307
308
309
310
311
312
313





314
315
316
317
318
319
320
321
322
323
324
325
326







-
-
-
-
-
+
+
+
+
+
+







   deleted           INTEGER     default 0
);



CREATE TABLE IF NOT EXISTS users_webviews(
 id      SERIAL  PRIMARY KEY   ,
 user_id   INTEGER NOT NULL,
 webview_id  INTEGER NOT NULL,
 deleted     INTEGER default 0,
 searchpattern TEXT Default '',
 web_page TEXT Default '',
 user_id         INTEGER NOT NULL,
 webview_id  		 INTEGER NOT NULL,
 deleted     		 INTEGER default 0,
 searchpattern 	 TEXT Default '',
 web_page   		 TEXT Default '',
 is_default      boolean default 'f',
 other_search_data TEXT Default ''
);

CREATE TABLE IF NOT EXISTS cctrl_info(
 id      SERIAL  PRIMARY KEY   ,
 user_id   INTEGER NOT NULL,
 input TEXT Default '',

Modified mtut.scm from [5cc50f038e] to [d7026e5f70].

826
827
828
829
830
831
832
833


834
835
836
837
838
839
840
826
827
828
829
830
831
832

833
834
835
836
837
838
839
840
841







-
+
+








;; (use trace)(trace create-run-pkt)
(define (contains list x) (cond ((null? list) #f) ((eq? (car list) x) #t) (else (contains (cdr list) x))))

;; collect all needed data and create run pkts for contours with changed inputs
;;
(define (generate-run-pkts mtconf toppath)
  (let ((std-runname (conc "sched"  (time->string (seconds->local-time (current-seconds)) "%M%H%d"))))
  (let ((std-runname (conc "sched"  (time->string (seconds->local-time (current-seconds)) "%M%H%d")))
        (packets-generated 0))
    (common:with-queue-db
     mtconf
     (lambda (pktsdirs pktsdir pdb)
       (let* ((rgconfdat (find-and-read-config (conc toppath "/runconfigs.config")))
	      (rgconf    (car rgconfdat))
	      (all-areas (map car (configf:get-section mtconf "areas")))
	      (contours  (configf:get-section mtconf "contours"))
1009
1010
1011
1012
1013
1014
1015

1016

1017
1018

1019
1020
1021
1022
1023
1024
1025
1010
1011
1012
1013
1014
1015
1016
1017

1018
1019

1020
1021
1022
1023
1024
1025
1026
1027







+
-
+

-
+







		       (lambda (cmd)
			 ;;(print "cmd: " cmd)
                         ;;(print "Areas: " all-areas)
                         (for-each 
                           (lambda (area) 
;Add code to check whether area is valid
			     (if 
                   ;; This code checks whether the target has been passed in via argument, and only runs the specified target
                   (if (args:get-arg "-target") 
                   (and (< packets-generated 4)  (if (args:get-arg "-target") 
                     (if (string= (args:get-arg "-target") runkey) (area-allowed? area "area-needs-to-be-run" runkey contour #f) #f) 
                     (area-allowed? area "area-needs-to-be-run" runkey contour #f))
                     (area-allowed? area "area-needs-to-be-run" runkey contour #f)))
       
			     (let* ((script (car cmd))
				(params (cdr cmd))
				(cmd    (conc script " " contour " " area " " runkey " " std-runname " " action " " params))
				(res    (handle-exceptions
					    exn
					    #f
1094
1095
1096
1097
1098
1099
1100

1101

1102
1103



1104
1105
1106
1107
1108
1109
1110
1096
1097
1098
1099
1100
1101
1102
1103

1104
1105

1106
1107
1108
1109
1110
1111
1112
1113
1114
1115







+
-
+

-
+
+
+







							  ;;(target   . ,(list new-target)) ;; overriding with result from runing the script
                                                          ))
							(aval       (or (configf:lookup mtconf "areas" area) ""))
                                    			(aval-alist (common:val->alist aval))

							(targets (map-targets mtconf aval-alist runkey area contour)))
                                        (pp targets)
				        (for-each (lambda (target) 
				        (for-each (lambda (target) (create-run-pkt mtconf action area runkey target new-runname mode-patt
                                                     (create-run-pkt mtconf action area runkey target new-runname mode-patt
                                                      tag-expr pktsdir reason contour sched dbdest append
                                                      runtrans)) targets)
                                                      runtrans)
                                                     (set! packets-generated (+ packets-generated 1))
                                                  ) targets)
					;; Add filter for targets

                                       ;;(create-run-pkt mtconf action area runkey target runname
                                       ;;               pktsdir reason contour dbdest append
                                       ;;               runtrans)
				       (print "key-msg: " key-msg)
				       ;;(push-run-spec torun contour

Modified rmt.scm from [f3cb7b6857] to [4dc97a8297].

53
54
55
56
57
58
59





60
61
62
63
64
65
66
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71







+
+
+
+
+








(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected

  (common:telemetry-log (conc "rmt:"(->string cmd))
                        payload: `((rid . ,rid)
                                   (params . ,params)))
                          
  
  ;;DOT digraph megatest_state_status {
  ;;DOT   ranksep=0;
  ;;DOT   // rankdir=LR;
  ;;DOT   node [shape="box"];
  ;;DOT "rmt:send-receive" -> MUTEXLOCK;
  ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; }
  ;; do all the prep locked under the rmt-mutex
690
691
692
693
694
695
696
697
698


699
700
701
702
703
704
705
695
696
697
698
699
700
701


702
703
704
705
706
707
708
709
710







-
-
+
+







  (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup)))

;; state and status are extra hints not usually used in the calculation
;;
(define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment)
  (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment)))

(define (rmt:set-state-status-and-roll-up-run run-id)
  (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id)))
(define (rmt:set-state-status-and-roll-up-run run-id state status)
  (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status)))


(define (rmt:update-pass-fail-counts run-id test-name)
  (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name))

(define (rmt:top-test-set-per-pf-counts run-id test-name)
  (rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name)))
753
754
755
756
757
758
759




760
761
762




763
764
765
766
767
768
769
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782







+
+
+
+



+
+
+
+








(define (rmt:lock/unlock-run run-id lock unlock user)
  (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user)))

;; set/get status
(define (rmt:get-run-status run-id)
  (rmt:send-receive 'get-run-status #f (list run-id)))

(define (rmt:get-run-state run-id)
  (rmt:send-receive 'get-run-state #f (list run-id)))


(define (rmt:set-run-status run-id run-status #!key (msg #f))
  (rmt:send-receive 'set-run-status #f (list run-id run-status msg)))

(define (rmt:set-run-state-status run-id state status )
  (rmt:send-receive 'set-run-state-status #f (list run-id state status)))


(define (rmt:update-run-event_time run-id)
  (rmt:send-receive 'update-run-event_time #f (list run-id)))

(define (rmt:get-runs-by-patt  keys runnamepatt targpatt offset limit fields last-runs-update  #!key  (sort-order "asc")) ;; fields of #f uses default
  (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order)))

Modified runs.scm from [f0db5bffd4] to [c9b27c71eb].

448
449
450
451
452
453
454







455
456
457
458
459
460
461
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468







+
+
+
+
+
+
+







	  	  (debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests)
		  ));; tests will be ANDed with this list

    ;; register this run in monitor.db
    (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
    (rmt:tasks-set-state-given-param-key task-key "running")

    (common:telemetry-log "run-tests"
                          payload:
                          `( (target . ,target)
                             (run-name . ,runname)
                             (test-patts . ,test-patts) ) )

    
    ;; Now generate all the tests lists
    (set! all-tests-registry (tests:get-all))   ;; hash of testname => path-to-test
    (set! all-test-names     (hash-table-keys all-tests-registry))
    ;; filter first for allowed-tests (from -tagexpr) then for test-patts.
    (set! test-names         (tests:filter-test-names
			      (if allowed-tests
				  (tests:filter-test-names all-test-names allowed-tests)
513
514
515
516
517
518
519

520
521
522
523
524
525
526
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534







+







    (runs:update-all-test_meta #f)

    ;; run the run prehook if there are no tests yet run for this run:
    ;;
    (runs:run-pre-hook run-id)
    ;; mark all test launced flag as false in the meta table 
    (rmt:set-var (conc "lunch-complete-" run-id) "no")
    (rmt:set-run-state-status run-id "new" "n/a")
    ;; now add non-directly referenced dependencies (i.e. waiton)
    ;;======================================================================
    ;; refactoring this block into tests:get-full-data
    ;;
    ;; What happended, this code is now duplicated in tests!?
    ;;
    ;;======================================================================
2001
2002
2003
2004
2005
2006
2007
2008




2009
2010




2011
2012
2013
2014
2015
2016
2017
2009
2010
2011
2012
2013
2014
2015

2016
2017
2018
2019
2020

2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031







-
+
+
+
+

-
+
+
+
+







		   (lambda (action)
		     (case action
		       ((print)
			(print " " (simple-run-runname run)
			       " " (time->string (seconds->local-time (simple-run-event_time run)) "WW%V.%u %H:%M:%S")
			       " " (if remove "REMOVE" "")))
		       ((remove-runs)
			(if remove (system (conc precmd " megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %"))))
			(if remove (system (conc precmd " megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %"
						 (if (member 'kill-runs actions) ;; if kill-runs is specified then set -kill-wait to 0
						     " -kill-wait 0"
						     "")))))
		       ((archive)
			(if remove (system (conc precmd " megatest -archive save-remove -target " target " -runname " (simple-run-runname run) " -testpatt %"))))))
			(if remove (system (conc precmd " megatest -archive save-remove -target " target " -runname " (simple-run-runname run) " -testpatt %"))))
		       ((kill-runs)
			(if remove (system (conc precmd " megatest -kill-runs -target " target " -runname " (simple-run-runname run) " -testpatt %"))))
			))
		   actions))))
          sorted)))
     ;; (print "Sorted: " (map simple-run-event_time sorted))
     ;; (print "Remove: " (map simple-run-event_time to-remove))))
     (hash-table-keys runs-ht))
    runs-ht))

2135
2136
2137
2138
2139
2140
2141
2142

2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159

2160
2161
2162
2163

2164
2165
2166
2167
2168
2169
2170
2149
2150
2151
2152
2153
2154
2155

2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186







-
+

















+




+







									 (> (string-length dira)(string-length dirb))
									 #f))))))
		       (toplevel-retries (make-hash-table)) ;; try three times to loop through and remove top level tests
		       (test-retry-time  (make-hash-table))
                       (backgrounded-remove-status     (make-hash-table))
                       (backgrounded-remove-last-visit (make-hash-table))
                       (backgrounded-remove-result     (make-hash-table))
		       (allow-run-time   10)) ;; seconds to allow for killing tests before just brutally killing 'em
		       (allow-run-time   (string->number (or (args:get-arg "-kill-wait") "10")))) ;; seconds to allow for killing tests before just brutally killing 'em
		   (let loop ((test (car sorted-tests))
			      (tal  (cdr sorted-tests)))
		     (let* ((test-id       (db:test-get-id test))
			    (new-test-dat  (rmt:get-test-info-by-id run-id test-id)))
		       (if (not new-test-dat)
			   (begin
			     (debug:print-error 0 *default-log-port* "We have a test-id of " test-id " but no record was found. NOTE: No locking of records is done between processes, do not simultaneously remove the same run from two processes!")
			     (if (not (null? tal))
				 (loop (car tal)(cdr tal))))
			   (let* ((item-path     (db:test-get-item-path new-test-dat))
				  (test-name     (db:test-get-testname new-test-dat))
				  (run-dir       ;;(filedb:get-path *fdb*
				   ;; (rmt:sdb-qry 'getid 
				   (db:test-get-rundir new-test-dat)) ;; )    ;; run dir is from the link tree
                                  (has-subrun    (and (subrun:subrun-test-initialized? run-dir)
                                                      (not (subrun:subrun-removed? run-dir))))
				  (test-state    (db:test-get-state new-test-dat))
                                  (test-status    (db:test-get-status new-test-dat))
				  (test-fulln    (db:test-get-fullname new-test-dat))
				  (uname         (db:test-get-uname    new-test-dat))
				  (toplevel-with-children (and (db:test-get-is-toplevel test)
							       (> (rmt:test-toplevel-num-items run-id test-name) 0))))
                             
			     (case action
			       ((remove-runs)
				;; if the test is a toplevel-with-children issue an error and do not remove
				(cond
                                 (toplevel-with-children
                                  (debug:print 0 *default-log-port* "WARNING: skipping removal of " test-fulln " with run-id " run-id " as it has sub tests")
                                  (hash-table-set! toplevel-retries test-fulln (+ (hash-table-ref/default toplevel-retries test-fulln 0) 1))
2273
2274
2275
2276
2277
2278
2279






2280
2281
2282
2283
2284
2285
2286
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308







+
+
+
+
+
+







				    (loop (car tal)(cdr tal)))
                                  )
                                 ((member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                  (debug:print 1 *default-log-port* "INFO: issuing killreq to test "test-fulln)
                                  (mt:test-set-state-status-by-id run-id (db:test-get-id test) "KILLREQ" "n/a" #f)
                                  (if (not (null? tal))
				      (loop (car tal)(cdr tal))))
                                 ((and (member test-status '("PREQ_FAIL" "PREQ_DISCARDED" "BLOCKED" "ZERO_ITEMS" "KEEP_TRYING" "TEN_STRIKES" "TIMED_OUT")))
                                  (rmt:set-state-status-and-roll-up-items run-id (db:test-get-id test) 'foo "NOT_STARTED" "n/a" (conc "kill-run moved from "test-state":"test-status" to NOT_STARTED:n/a"))
                                  ;;(mt:test-set-state-status-by-id run-id (db:test-get-id test) "NOT_STARTED" "n/a" (conc "kill-run moved from "test-state":"test-status" to NOT_STARTED:n/a"))
                                  (if (not (null? tal))
				      (loop (car tal)(cdr tal)))
                                  )
                                 (else
                                  (if (not (null? tal))
				      (loop (car tal)(cdr tal)))
                                  )))
			       ((set-state-status)
                                (let* ((new-state (car state-status))
                                       (new-status (cadr state-status))

Modified server.scm from [b72b3224b4] to [8ce184eea5].

526
527
528
529
530
531
532


533
534
535
536
537
538
539
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541







+
+







                    (common:snapshot-file mtdbfile subdir: ".db-snapshot"))
		(delete-file* staging-file)
		(let* ((start-time (current-milliseconds))
                       (res (system sync-cmd)))
                  (cond
                   ((eq? 0 res)
		    (delete-file* (conc mtdbfile ".backup"))
                    (if (eq? 0 (file-size sync-log))
                        (delete-file sync-log))
		    (system (conc "/bin/mv " staging-file " " mtdbfile))
                    (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "(/ (- (current-milliseconds) start-time) 1000)" sec")
                    #t)
                   (else
                    (system (conc "/bin/cp "sync-log" "sync-log".fail"))
                    (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail")
                    (if (file-exists? (conc mtdbfile ".backup"))

Modified tasks.scm from [9a414ee32a] to [6e633165ba].

991
992
993
994
995
996
997

998
999
1000
1001
1002
1003
1004
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005







+









;; get runs changed since last sync
;; (define (tasks:sync-test-data dbh cached-info area-info)
;;   (let* ((

(define (tasks:sync-to-postgres configdat dest)
  (print "In sync")
  (let* ((dbh         (pgdb:open configdat dbname: dest))
	 (area-info   (pgdb:get-area-by-path dbh *toppath*))
	 (cached-info (make-hash-table))
	 (start       (current-seconds))
   (test-patt   (if (args:get-arg "-testpatt")
											(args:get-arg "-testpatt")
                      "%"))
1012
1013
1014
1015
1016
1017
1018
1019


1020
1021
1022
1023

1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039

1040
1041
1042
1043
1044
1045
1046
1013
1014
1015
1016
1017
1018
1019

1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050







-
+
+




+
















+







       (begin
					(print "Error: Provide runname")
          (exit 1)))
     (if (and (not target)  run-name)
       (begin
					(print "Error: Provide target")
          (exit 1)))

    (print "123")
    (exit 1) 
    (for-each (lambda (dtype)
		(hash-table-set! cached-info dtype (make-hash-table)))
	      '(runs targets tests steps data))
    (hash-table-set! cached-info 'start start) ;; when done we'll set sync times to this
    (print "here")
    (if area-info
	(let* ((last-sync-time (vector-ref area-info 3))
	       (smallest-last-update-time  (make-hash-table))
         (changed      (if (and target run-name)
                            (rmt:get-run-record-ids target run-name (rmt:get-keys) test-patt)
                            (rmt:get-changed-record-ids last-sync-time)))
	       (run-ids        (alist-ref 'runs       changed))
	       (test-ids       (alist-ref 'tests      changed))
	       (test-step-ids  (alist-ref 'test_steps changed))
	       (test-data-ids  (alist-ref 'test_data  changed))
	       (run-stat-ids   (alist-ref 'run_stats  changed))
         (area-tag    (if (args:get-arg "-area-tag") 
                                 (args:get-arg "-area-tag")
                                 (if (args:get-arg "-area") 
                                   (args:get-arg "-area") 
                                   ""))))
          (print "here2")
           (if (and (equal? area-tag "") (not (pgdb:is-area-taged dbh (vector-ref area-info 0))))
            (set! area-tag *default-area-tag*)) 
           (if (not (equal? area-tag "")) 
             (task:add-area-tag dbh area-info area-tag)) 
	  (if (or (not (null? test-ids)) (not (null? run-ids)))
	      (begin
                (debug:print-info 0 *default-log-port*  "syncing runs")   

Added telemetry-daemon version [a2b1d26b8f].










































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
#!/usr/bin/env python
# -*- Mode: Python; -*- 
## Tiny Syslog Server in Python.
##
## This is a tiny syslog server that is able to receive UDP based syslog
## entries on a specified port and save them to a file.
## That's it... it does nothing else...

 
import os
import sys, os, time, atexit
from signal import SIGTERM
import logging
import logging.handlers
import SocketServer
import datetime
from subprocess import call
import argparse
import os
import socket

## code to determine this host's IP on non-loopback interface
if os.name != "nt":
    import fcntl
    import struct

    def get_interface_ip(ifname):
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        return socket.inet_ntoa(fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s',
                                ifname[:15]))[20:24])

def get_lan_ip():
    ip = socket.gethostbyname(socket.gethostname())
    if ip.startswith("127.") and os.name != "nt":
        interfaces = [
            "eth0",
            "eth1",
            "eth2",
            "wlan0",
            "wlan1",
            "wifi0",
            "ath0",
            "ath1",
            "ppp0",
            ]
        for ifname in interfaces:
            try:
                ip = get_interface_ip(ifname)
                break
            except IOError:
                pass
    return ip

class Daemon(object):
        """
        A generic daemon class.
       
        Usage: subclass the Daemon class and override the run() method
        """
        def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
                self.stdin = stdin
                self.stdout = stdout
                self.stderr = stderr
                self.pidfile = pidfile
       
        def daemonize(self):
                """
                do the UNIX double-fork magic, see Stevens' "Advanced
                Programming in the UNIX Environment" for details (ISBN 0201563177)
                http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
                """
                try:
                        pid = os.fork()
                        if pid > 0:
                                # exit first parent
                                sys.exit(0)
                except OSError, e:
                        sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
                        sys.exit(1)
       
                # decouple from parent environment
                os.chdir("/")
                os.setsid()
                os.umask(0)
       
                # do second fork
                try:
                        pid = os.fork()
                        if pid > 0:
                                # exit from second parent
                                sys.exit(0)
                except OSError, e:
                        sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
                        sys.exit(1)
       
                # redirect standard file descriptors
                sys.stdout.flush()
                sys.stderr.flush()
                si = file(self.stdin, 'r')
                so = file(self.stdout, 'a+')
                se = file(self.stderr, 'a+', 0)
                os.dup2(si.fileno(), sys.stdin.fileno())
                os.dup2(so.fileno(), sys.stdout.fileno())
                os.dup2(se.fileno(), sys.stderr.fileno())
       
                # write pidfile
                atexit.register(self.delpid)
                pid = str(os.getpid())
                file(self.pidfile,'w+').write("%s\n" % pid)
       
        def delpid(self):
                os.remove(self.pidfile)
 
        def start(self):
                """
                Start the daemon
                """
                # Check for a pidfile to see if the daemon already runs
                try:
                        pf = file(self.pidfile,'r')
                        pid = int(pf.read().strip())
                        pf.close()
                except IOError:
                        pid = None
       
                if pid:
                        message = "pidfile %s already exist. Daemon already running?\n"
                        sys.stderr.write(message % self.pidfile)
                        sys.exit(1)
               
                # Start the daemon
                self.daemonize()
                self.run()
 
        def stop(self):
                """
                Stop the daemon
                """
                # Get the pid from the pidfile
                try:
                        pf = file(self.pidfile,'r')
                        pid = int(pf.read().strip())
                        pf.close()
                except IOError:
                        pid = None
       
                if not pid:
                        message = "pidfile %s does not exist. Daemon not running?\n"
                        sys.stderr.write(message % self.pidfile)
                        return # not an error in a restart
 
                # Try killing the daemon process       
                try:
                        while 1:
                                os.kill(pid, SIGTERM)
                                time.sleep(0.1)
                except OSError, err:
                        err = str(err)
                        if err.find("No such process") > 0:
                                if os.path.exists(self.pidfile):
                                        os.remove(self.pidfile)
                        else:
                                print str(err)
                                sys.exit(1)
 
        def restart(self):
                """
                Restart the daemon
                """
                self.stop()
                self.start()
 
        def run(self):
                """
                You should override this method when you subclass Daemon. It will be called after the process has been
                daemonized by start() or restart().
                """

# setup logging module so that the log can be moved aside and will reopen for append
def log_setup(logfile):
    log_handler = logging.handlers.WatchedFileHandler(logfile)
    formatter = logging.Formatter(
        '%(message)s','')
    log_handler.setFormatter(formatter)
    logger = logging.getLogger()
    logger.addHandler(log_handler)
    logger.setLevel(logging.INFO)


class SyslogUDPHandler(SocketServer.BaseRequestHandler):
	def handle(self):
		data = bytes.decode(self.request[0].strip())
		socket = self.request[1]
		print( "%s : " % self.client_address[0], str(data))
		timestamp = datetime.datetime.now().isoformat()
		logline = timestamp + ":"+self.client_address[0] + ":" + str(data)
		logging.info(str(logline))



class TelemetryLogDaemon(Daemon):
  def __init__(self, pidfile, logfile, server_ip, server_port):
    self.logfile = logfile
    self.server_ip = server_ip
    self.server_port = server_port
    super(TelemetryLogDaemon, self).__init__(pidfile)
    
  def run(self):
    log_setup(self.logfile)
    server = SocketServer.UDPServer((self.server_ip,int(self.server_port)), SyslogUDPHandler)
    server.serve_forever(poll_interval=0.5)


def main():
  default_log_file = os.environ['PWD'] + "/telemetry.log"

  parser = argparse.ArgumentParser(description = 'telemetry-daemon')
  actions="start,restart,stop,nodaemon".split(",")
  
  parser.add_argument("-a", "--action", required=True, choices=actions, help="manage daemon: start stop or restart")
  parser.add_argument("-p", "--server-port", default="5929", help="specify alternate udp port number, default is 5929")
  parser.add_argument("-i", "--server-ip", default=get_lan_ip(), help="specify IP if heuristics to get local host lan ip fails")
  parser.add_argument("-l", "--log-file", default=default_log_file, help="specify log file to write")
  parser.add_argument("-z", "--pid-file", default=default_log_file + ".pidfile", help="specify pidfile")
  opts = parser.parse_args()

  tld = TelemetryLogDaemon(opts.pid_file, opts.log_file, opts.server_ip, opts.server_port)

  if opts.action == "start":
    print "Info: Starting server"
    print """Example addition to megatest.config to enable telemetry:

[telemetry]
host %s
port %s
want-events ALL

    """ % (opts.server_ip, opts.server_port)
    tld.start()
    
  elif opts.action == "stop":
    tld.stop()
  elif opts.action == "restart":

    print "Info: Restarting server"
    print """Example addition to megatest.config to enable telemetry:

[telemetry]
host %s
port %s
want-events ALL

    """ % (opts.server_ip, opts.server_port)
    tld.restart()
  elif opts.action == "nodaemon":
    log_setup(opts.log_file)
    server = SocketServer.UDPServer((opts.server_ip,int(opts.server_port)), SyslogUDPHandler)
    server.serve_forever(poll_interval=0.5)
  
if __name__ == '__main__':
  main()




Modified tests.scm from [b8a74e9d3b] to [c7490d0df8].

1515
1516
1517
1518
1519
1520
1521
1522

1523
1524
1525
1526
1527
1528
1529
1515
1516
1517
1518
1519
1520
1521

1522
1523
1524
1525
1526
1527
1528
1529







-
+







      #f))

;; if .testconfig exists in test directory read and return it
;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata"
;; else read the testconfig file
;;   if have path to test directory save the config as .testconfig and return it
;;
(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t))
(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t)(wait-a-minute #f))
  (let* ((use-cache    (common:use-cache?))
	 (cache-path   (tests:get-test-path-from-environment))
	 (cache-file   (and cache-path (conc cache-path "/.testconfig")))
	 (cache-exists (and cache-file
			    (not force-create)  ;; if force-create then pretend there is no cache to read
			    (common:file-exists? cache-file)))
	 (cached-dat   (if (and (not force-create)
1543
1544
1545
1546
1547
1548
1549








1550
1551



1552














1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567

1568
1569
1570
1571
1572
1573
1574
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562

1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590

1591
1592
1593
1594
1595
1596
1597
1598







+
+
+
+
+
+
+
+


+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+














-
+







	  (if (and  dat ;; have a locally cached version
		    (hash-table-ref/default dat "have fulldata" #f)) ;; marked as good data?
	      dat
	      ;; no cached data available
	      (let* ((treg         (or test-registry
				       (tests:get-all)))
		     (test-path    (or (hash-table-ref/default treg test-name #f)
                                       (let* ((local-tcdir (conc (getenv "MT_LINKTREE") "/"
                                                                 (getenv "MT_TARGET") "/"
                                                                 (getenv "MT_RUNNAME") "/"
                                                                 test-name "/" item-path))
                                              (local-tcfg (conc local-tcdir "/testconfig")))
                                         (if (common:file-exists? local-tcfg)
                                             local-tcdir
                                             #f))
				       (conc *toppath* "/tests/" test-name)))
		     (test-configf (conc test-path "/testconfig"))
		     (testexists   (let loopa ((tries-left 30))
                                     (cond
                                      (
		     (testexists   (and (common:file-exists? test-configf)(file-read-access? test-configf)))
                                       (and (common:file-exists? test-configf)(file-read-access? test-configf))
                                       #t)
                                      (
                                       (common:file-exists? test-configf)
                                       (debug:print 0 *default-log-port* "WARNING: Cannot read testconfig file: "test-configf)
                                       #f)
                                      (
                                       (and wait-a-minute (> tries-left 0))
                                       (thread-sleep! 10)
                                       (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf" will retry in 10 seconds.  Tries left: "tries-left) ;; BB: this fires
                                       (loopa (sub1 tries-left)))
                                      (else
                                       (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf) ;; BB: this fires
                                       #f))))
		     (tcfg         (if testexists
				       (read-config test-configf #f system-allowed
						    environ-patt: (if system-allowed
								      "pre-launch-env-vars"
								      #f))
				       #f)))
		(if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data
		(if tcfg (hash-table-set! *testconfigs* test-full-name tcfg))
		(if (and testexists
			 cache-file
			 (file-write-access? cache-path)
			 allow-write-cache)
		    (let ((tpath (conc cache-path "/.testconfig")))
		      (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath)
                      (if (not (common:in-running-test?))
                      (if (and tcfg (not (common:in-running-test?)))
                          (configf:write-alist tcfg tpath))))
		tcfg))))))
  
;; sort tests by priority and waiton
;; Move test specific stuff to a test unit FIXME one of these days
(define (tests:sort-by-priority-and-waiton test-records)
  (if (eq? (hash-table-size test-records) 0)