Megatest

Check-in [99a884c695]
Login
Overview
Comment:Cherry picked fixes from 1.70 and 1.80
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.71
Files: files | file ages | folders
SHA1: 99a884c6956dc9a8ed930f8f969764c358bbc208
User & Date: mmgraham on 2024-04-09 18:13:45
Other Links: branch diff | manifest | tags
Context
2024-04-10
11:39
moved a message to level 2 check-in: 1bc5f2dab3 user: mmgraham tags: v1.71
2024-04-09
18:13
Cherry picked fixes from 1.70 and 1.80 check-in: 99a884c695 user: mmgraham tags: v1.71
2024-04-08
11:05
Changed version to v1.7102 check-in: c719bc2748 user: mmgraham tags: v1.71, v1.7102
Changes

Modified common.scm from [95cd7a5e85] to [7944e605d1].

29
30
31
32
33
34
35
36
37


38
39

40
41
42
43
44
45
46
47
48
49
50
51
52
53

54
55
56
57
58
59
60
29
30
31
32
33
34
35


36
37


38






39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54







-
-
+
+
-
-
+
-
-
-
-
-
-








+








(declare (unit common))
(declare (uses commonmod))
(import commonmod)

(include "common_records.scm")


;; (require-library margs)
(define (remove-server-files directory-path)
  (let ((files (glob (string-append directory-path "/server*"))))
;; (include "margs.scm")

    (for-each delete-file* files)))
;; (define old-exit exit)
;; 
;; (define (exit . code)
;;   (if (null? code)
;;       (old-exit)
;;       (old-exit code)))

(define (stop-the-train)
  (thread-start! (make-thread (lambda ()
				(let loop ()
				  (if (and *toppath*
					   (file-exists? (conc *toppath*"/stop-the-train")))
				      (begin
					(debug:print 0 *default-log-port* "ERROR: found file "*toppath*"/stop-the-train, exiting immediately")
                                        (remove-server-files (conc *toppath* "/logs"))
					(exit 1)))
				  (thread-sleep! 5)
				  (loop))))))

;; execute thunk, return value.  If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* .
;; arguments - thunk, message
(define (common:fail-safe thunk warning-message-on-exception)

Modified dbfile.scm from [3caef9c8a2] to [14d0a1b210].

373
374
375
376
377
378
379
380


381
382
383
384
385
386
387
388

389
390
391
392
393
394
395
373
374
375
376
377
378
379

380
381
382
383
384
385
386
387
388

389
390
391
392
393
394
395
396







-
+
+







-
+







                                 (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000))
				 (if sync-mode
				     (sqlite3:execute db (conc "PRAGMA synchronous = "sync-mode";")))
				 (if journal-mode
				     (sqlite3:execute db (conc "PRAGMA journal_mode = "journal-mode";")))
				 (if (and init-proc (not db-exists))
				     (init-proc db))
				 db)))
				 db))
			     expire-time: 30)
                            (begin
			      (if (file-exists? fname )
                                  (let ((db (sqlite3:open-database fname)))
				    ;; pragmas synchronous not needed because this db is used read-only
				    ;; (sqlite3:execute db (conc "PRAGMA synchronous = "mode";")
				    (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000)) ;; read-only but still need timeout
				    db )
                                  (print "file doesn't exist: " fname))))
                                  (print "cautious-open-database: file doesn't exist: " fname))))
			(exn (io-error)
			     (dbfile:print-err exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.")
			     (retry))
			(exn (corrupt)
			     (dbfile:print-err exn "ERROR: database " fname " is corrupt. Repair it to proceed.")
			     (retry))
			(exn (busy)
1159
1160
1161
1162
1163
1164
1165


1166



1167
1168
1169
1170
1171
1172
1173
1160
1161
1162
1163
1164
1165
1166
1167
1168

1169
1170
1171
1172
1173
1174
1175
1176
1177
1178







+
+
-
+
+
+







	  (thread-sleep! 0.25)
	  (if (file-exists? fname)
	      (handle-exceptions exn
                #f 
                (with-input-from-file fname
	  	  (lambda ()
		    (equal? key-string (read-line)))))
              (begin
                (dbfile:print-err "dbfile:simple-file-lock created " fname " but it was gone 0.25 seconds later")
	      #f)
	      #f
              )
          )
       )
    )
  )
)

(define (dbfile:simple-file-lock-and-wait fname #!key (expire-time 300))
  (let ((end-time (+ expire-time (current-seconds))))

Modified launch.scm from [60d380c61b] to [6c7fb3bbfc].

591
592
593
594
595
596
597



598

599
600
601
602
603
604
605
591
592
593
594
595
596
597
598
599
600

601
602
603
604
605
606
607
608







+
+
+
-
+







	      (list  "MT_RUNNAME"   runname)
	      (list  "MT_MEGATEST"  megatest)
	      (list  "MT_TARGET"    target)
	      (list  "MT_LINKTREE"  (common:get-linktree)) ;; (configf:lookup *configdat* "setup" "linktree"))
	      (list  "MT_TESTSUITENAME" (common:get-testsuite-name))))
          ;;(bb-check-path msg: "launch:execute post block 3")

	  (let ((tmppath (getenv "PATH")))
	    (if (string-search tmppath " ")
		(debug:print 0 *default-log-port* "WARNING: spaces in PATH are not supported."))
	  (if mt-bindir-path (setenv "PATH" (conc "\""(getenv "PATH")":"mt-bindir-path"\"")))
	    (if mt-bindir-path (setenv "PATH" (conc tmppath":"mt-bindir-path))))
          ;;(bb-check-path msg: "launch:execute post block 4")
	  ;; (change-directory top-path)
	  ;; Can setup as client for server mode now
	  ;; (client:setup)

	  
	  ;; environment overrides are done *before* the remaining critical envars.
889
890
891
892
893
894
895
896

897
898
899
900
901
902
903
892
893
894
895
896
897
898

899
900
901
902
903
904
905
906







-
+







		       runname
		       (common:file-exists? fulldir))
		  (let ((tmpfile  (conc fulldir "/.megatest.cfg." (current-seconds)))
			(targfile (conc fulldir "/.megatest.cfg-"  megatest-version "-" megatest-fossil-hash))
			(rconfig  (conc fulldir "/.runconfig." megatest-version "-" megatest-fossil-hash)))
		    (if (common:file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached
			(begin
			  (debug:print-info 0 *default-log-port* "Caching megatest.config in " tmpfile)
			  (debug:print-info 2 *default-log-port* "Caching megatest.config in " tmpfile)
                          (if (not (common:in-running-test?))
                              (configf:write-alist *configdat* tmpfile))
			  (system (conc "ln -sf " tmpfile " " targfile))))
		    )))
	    (debug:print-info 1 *default-log-port* "No linktree yet, no caching configs.")))))


Modified megatest.scm from [0a3a4d7312] to [5e043f1c71].

84
85
86
87
88
89
90



91




92
93
94
95
96
97
98
84
85
86
87
88
89
90
91
92
93

94
95
96
97
98
99
100
101
102
103
104







+
+
+
-
+
+
+
+








(dbfile:db-init-proc db:initialize-main-db)

;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file
;;
(let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc")))
  (if (common:file-exists? debugcontrolf)
    (begin
      ;; for some reason, debug:print does not work here. Had to use print.
      (print (conc "WARNING: loading " debugcontrolf))
      (load debugcontrolf)))
      (load debugcontrolf)
    )
  )
)

;; usage logging, careful with this, it is not designed to deal with all real world challenges!
;;
(if (and *usage-log-file*
         (file-write-access? *usage-log-file*))
    (with-output-to-file
        *usage-log-file*

Modified portlogger.scm from [36a4964f50] to [6fdf0de81e].

24
25
26
27
28
29
30
31


32
33
34
35
36
37
38
24
25
26
27
28
29
30

31
32
33
34
35
36
37
38
39







-
+
+








(declare (unit portlogger))
(declare (uses db))

;; lsof -i

(define (portlogger:open-db fname)
  (let* ((avail    (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away
  (let* (;; (avail    (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away
         (avail #t)
	 (exists   (common:file-exists? fname))
	 (db       (if avail 
		       (sqlite3:open-database fname)
		       (begin
			 (system (conc "rm -f " fname))
			 (sqlite3:open-database fname))))
	 (handler  (sqlite3:make-busy-timeout 136000))
54
55
56
57
58
59
60
61
62


63
64
65
66
67
68
69
55
56
57
58
59
60
61


62
63
64
65
66
67
68
69
70







-
-
+
+







            port INTEGER PRIMARY KEY,
            state TEXT DEFAULT 'not-used',
            fail_count INTEGER DEFAULT 0,
            update_time TIMESTAMP DEFAULT (strftime('%s','now')) );")
    db))

(define (portlogger:open-run-close proc . params)
  (let* ((fname  (conc "/tmp/." (current-user-name) "-portlogger.db"))
	 (avail  (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away
  (let* ((fname  (conc "/tmp/." (current-user-name) "-portlogger.db")))
	 ;; (avail  (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away
    (handle-exceptions
     exn
     (begin
       ;; (release-dot-lock fname)
       (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params)
       (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 5 *default-log-port* "exn=" (condition->list exn))

Modified rmt.scm from [df8aa04bdb] to [f953bf98e7].

333
334
335
336
337
338
339
340

341
342
343
344
345
346
347
333
334
335
336
337
338
339

340
341
342
343
344
345
346
347







-
+







    (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
    (mutex-unlock! *rmt-mutex*)
    (if success ;; success only tells us that the transport was
	;; successful, have to examine the data to see if
	;; there was a detected issue at the other end
	(extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
	(begin
           (debug:print-error 0 *default-log-port* " dat=" dat) 
           (debug:print-info 0 *default-log-port* "Bad return data from Megatest server: dat=" dat) 
           (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params))
	)))

(define (rmt:print-db-stats)
  (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
    (debug:print 18 *default-log-port* "DB Stats\n========")
    (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
1127
1128
1129
1130
1131
1132
1133

1134
1135
1136
1137
1138
1139
1140
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141







+







	 (all-dat    (cdr run-dat))
	 (tests-data (alist-ref "data" all-dat equal?))
	 (run-meta   (alist-ref "meta" all-dat equal?))
         (run-id     (string->number (alist-ref "id"   run-meta equal?))))
    (rmt:insert-run run-id target runname run-meta)
    (if (list? tests-data)
      (begin
        (debug:print 0 *default-log-port* "Inserting " (length tests-data) " tests in run " runname)
        (for-each
          (lambda (test-dat)
            (let* ((test-id  (car test-dat))
	      (test-rec (cdr test-dat)))
	      (rmt:insert-test run-id test-rec)))
         tests-data)
      )
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1165
1166
1167
1168
1169
1170
1171

1172
1173
1174
1175
1176
1177
1178
1179







-








	 (item-path (alist-ref "item_path" test-rec equal?))
         (test-id (rmt:get-test-id run-id testname item-path))
         )

    (if test-id
       (debug:print 0 *default-log-port* "test "testname"/"item-path " already exists in run-id " run-id)
       (begin
         (debug:print 0 *default-log-port* " Insert test in run "run-id": "testname"/"item-path)
         (rmt:send-receive 'insert-test run-id test-rec)
       )
    )
  )
)



Modified runs.scm from [9381a36070] to [9b84cba39d].

797
798
799
800
801
802
803
804

805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843


844
845
846
847
848
849
850
797
798
799
800
801
802
803

804
























805
806
807


808
809
810
811
812
813
814
815


816
817
818
819
820
821
822
823
824







-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-



-
-








-
-
+
+








    (if (not (null? required-tests))
	(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ((keep-going        #t)
	  (let* ()
		 (run-queue-retries 5)
		;; (th1        (make-thread (lambda ()
		;; 			    (handle-exceptions
		;; 				exn
		;; 				(begin
		;; 				  (print-call-chain)
		;; 				  (print " message: " ((condition-property-accessor 'exn 'message) exn)))
		;; 			      (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
		;; 						    (any->number reglen) all-tests-registry)))
		;; 			  "runs:run-tests-queue"))
		 #;(th2        (make-thread (lambda ()			 ;; BBQ: why are we visiting ALL runs here?	    
					    ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
					    (let ((run-ids (rmt:get-all-run-ids)))
					      (for-each (lambda (run-id)
							  (if keep-going
							      (handle-exceptions
							       exn
							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
							run-ids)))
					  "runs: mark-incompletes")))
	    ;; (thread-start! th1)
	    ;; (thread-start! th2)
	    ;; (thread-join! th1)
	    ;; just do the main stuff in the main thread
	    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
								    (any->number reglen) all-tests-registry)
	    (set! keep-going #f)
	    #;(thread-join! th2)
	    ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD
	    (if (> run-count 0) ;; handle reruns
		(begin
		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "ABORT,STUCK/DEAD,n/a,ZERO_ITEMS"))
		  ;; recursive call to self
      (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
                  (launch:end-of-run-check run-id)))
                  (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
                 (launch:end-of-run-check run-id)))
	  (debug:print-info 0 *default-log-port* "No tests to run")))
    (debug:print-info 4 *default-log-port* "All done by here")
    ;; TODO: try putting post hook call here
      
    ;  (debug:print-info 2 *default-log-port* " run-count " run-count)
    ;  (runs:run-post-hook run-id))
    ;  (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count ))   
2904
2905
2906
2907
2908
2909
2910
2911

2912
2913
2914
2915
2916
2917
2918
2878
2879
2880
2881
2882
2883
2884

2885
2886
2887
2888
2889
2890
2891
2892







-
+







     (lambda (key)
       (let* ((idx (cadr key))
	      (fld (car  key))
	      (val (configf:lookup test-conf "test_meta" fld)))
	 ;; (debug:print 5 *default-log-port* "idx: " idx " fld: " fld " val: " val)
	 (if (and val (not (equal? (vector-ref currrecord idx) val)))
	     (begin
	       (debug:print 0 *default-log-port* "Updating " test-name " " fld " to " val)
	       (debug:print 2 *default-log-port* "Updating " test-name " " fld " to " val)
	       (rmt:testmeta-update-field test-name fld val)))))
     '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10)))))

;; find tests with matching tags, tagpatt is a string "tagpatt1,tagpatt2%, ..."
;;
(define (runs:get-tests-matching-tags tagpatt)
  (let* ((tagdata (rmt:get-tests-tags))

Modified tests.scm from [5c2006972a] to [fd3f543001].

1637
1638
1639
1640
1641
1642
1643
1644

1645
1646
1647
1648
1649
1650
1651
1637
1638
1639
1640
1641
1642
1643

1644
1645
1646
1647
1648
1649
1650
1651







-
+







		(if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data
		(if tcfg (hash-table-set! *testconfigs* test-full-name tcfg))
		(if (and testexists
			 cache-file
			 (file-write-access? cache-path)
			 allow-write-cache)
		    (let ((tpath (conc cache-path "/.testconfig")))
		      (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath)
		      (debug:print-info 2 *default-log-port* "Caching testconfig for " test-name " in " tpath)
                      (if (and tcfg (not (common:in-running-test?)))
                          (configf:write-alist tcfg tpath))))
		tcfg))))))
  
;; sort tests by priority and waiton
;; Move test specific stuff to a test unit FIXME one of these days
(define (tests:sort-by-priority-and-waiton test-records)