Megatest

Diff
Login

Differences From Artifact [dbe1379c23]:

To Artifact [adfae1025a]:


27
28
29
30
31
32
33

34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

53
54
55
56
57
58
59
(declare (uses tests))
(declare (uses server))
(declare (uses mt))
(declare (uses archive))
(declare (uses mtargs))
(declare (uses rmtmod))
(declare (uses dbfile))


(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer
     sxml-modifications matchable)



(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "run_records.scm")
(include "test_records.scm")

;; (include "debugger.scm")

(import commonmod
	debugprint
	rmtmod
	dbfile

	(prefix mtargs args:))

;; use this struct to facilitate refactoring
;;

(defstruct runs:dat
  reglen regfull







>



















>







27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
(declare (uses tests))
(declare (uses server))
(declare (uses mt))
(declare (uses archive))
(declare (uses mtargs))
(declare (uses rmtmod))
(declare (uses dbfile))
(declare (uses tcp-transportmod))

(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer
     sxml-modifications matchable)



(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "run_records.scm")
(include "test_records.scm")

;; (include "debugger.scm")

(import commonmod
	debugprint
	rmtmod
	dbfile
	tcp-transportmod
	(prefix mtargs args:))

;; use this struct to facilitate refactoring
;;

(defstruct runs:dat
  reglen regfull
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843

    (if (not (null? required-tests))
	(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ((keep-going        #t)
		 (run-queue-retries 5)
		 (run-ids (rmt:get-all-run-ids)))
	    #;(for-each (lambda (run-id)
			(if keep-going
			    (handle-exceptions
				exn
			      (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
			      (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
		      run-ids)
	    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
				  (any->number reglen) all-tests-registry)
	    (set! keep-going #f)
	    (if (> run-count 0) ;; handle reruns
		(begin
		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "ABORT,STUCK/DEAD,n/a,ZERO_ITEMS"))
      (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
                (launch:end-of-run-check run-id)))
	  (debug:print-info 0 *default-log-port* "No tests to run")))
    (debug:print-info 4 *default-log-port* "All done by here")
    ;; TODO: try putting post hook call here
      
    ;  (debug:print-info 2 *default-log-port* " run-count " run-count)
    ;  (runs:run-post-hook run-id))
    ;  (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count ))   







|
<
<
<
<
<
<
<
<
<


<






|
|







811
812
813
814
815
816
817
818









819
820

821
822
823
824
825
826
827
828
829
830
831
832
833
834
835

    (if (not (null? required-tests))
	(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ()









	    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
				  (any->number reglen) all-tests-registry)

	    (if (> run-count 0) ;; handle reruns
		(begin
		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "ABORT,STUCK/DEAD,n/a,ZERO_ITEMS"))
                  (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
                 (launch:end-of-run-check run-id)))
	  (debug:print-info 0 *default-log-port* "No tests to run")))
    (debug:print-info 4 *default-log-port* "All done by here")
    ;; TODO: try putting post hook call here
      
    ;  (debug:print-info 2 *default-log-port* " run-count " run-count)
    ;  (runs:run-post-hook run-id))
    ;  (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count ))   
1153
1154
1155
1156
1157
1158
1159
1160













1161
1162
1163
1164
1165
1166
1167
	 (registry-mutex         (runs:dat-registry-mutex runsdat))
	 (flags                  (runs:dat-flags runsdat))
	 (keyvals                (runs:dat-keyvals runsdat))
	 (run-info               (runs:dat-run-info runsdat))
	 (all-tests-registry     (runs:dat-all-tests-registry runsdat))
	 (run-limits-info        (runs:dat-can-run-more-tests runsdat))
	 ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
	 (have-resources         (car run-limits-info))













	 (num-running            (list-ref run-limits-info 1))
	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
	 (max-concurrent-jobs    (list-ref run-limits-info 3))
	 (job-group-limit        (list-ref run-limits-info 4))
	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
	 (fails                  (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs







|
>
>
>
>
>
>
>
>
>
>
>
>
>







1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
	 (registry-mutex         (runs:dat-registry-mutex runsdat))
	 (flags                  (runs:dat-flags runsdat))
	 (keyvals                (runs:dat-keyvals runsdat))
	 (run-info               (runs:dat-run-info runsdat))
	 (all-tests-registry     (runs:dat-all-tests-registry runsdat))
	 (run-limits-info        (runs:dat-can-run-more-tests runsdat))
	 ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
	 (have-resources         (and (if *journal-stats*
					  (let* ((dbfname (conc
							   (dbfile:run-id->dbnum run-id)
							   ".db"))
						 (load (tt:get-journal-stats dbfname)))
					    (if (> load 0.1) ;; dbs too busy to start more tests
						(begin
						  (debug:print-info 0 *default-log-port* "Gating launch due to db load "load" based on journal file observations for "dbfname)
						 #f)
						#t))
					  (begin
					    (debug:print-info 0 *default-log-port* "Journal gating not started for "run-id)
					    #t)) ;; if journal monitoring not started do not gate
				      (car run-limits-info)))
	 (num-running            (list-ref run-limits-info 1))
	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
	 (max-concurrent-jobs    (list-ref run-limits-info 3))
	 (job-group-limit        (list-ref run-limits-info 4))
	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
	 (fails                  (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs
1199
1200
1201
1202
1203
1204
1205
1206
1207


1208


1209
1210
1211
1212
1213
1214
1215
	   (if (and (not (common:on-homehost?))
		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
	   
	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
	   (if maxhomehostload
	       (common:wait-for-homehost-load maxhomehostload
					      (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))
    


 


    
    (if (and (not (null? prereqs-not-met))
	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
	(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))

    ;; Don't know at this time if the test have been launched at some time in the past
    ;; i.e. is this a re-launch?







|
|
>
>
|
>
>







1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
	   (if (and (not (common:on-homehost?))
		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
	   
	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
	   (if maxhomehostload
	       (common:wait-for-homehost-load maxhomehostload
					      (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))

	   ;; lastly lets check the servers are not overloaded by looking at threads
	   (tt:wait-on-server-load run-id *ttdat*)
	   
	   )))
    
    
    (if (and (not (null? prereqs-not-met))
	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
	(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))

    ;; Don't know at this time if the test have been launched at some time in the past
    ;; i.e. is this a re-launch?
1379
1380
1381
1382
1383
1384
1385



1386
1387

1388
1389
1390
1391
1392
1393
1394
		      (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW
			  (if (null? tal)
			      #f ;; yes, really
			      (list (car tal)(cdr tal) reg reruns))
			  (begin
			    (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60)
				(debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))



			    ;; was: (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)
                            (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f)

			    (hash-table-set! test-registry hed 'removed) ;; was 0
                            (if (not (and (null? reg) (null? tal)))
                                (runs:loop-values tal reg reglen regfull reruns)
                                #f))))
		     (else
		      (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60)
			  (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now."))







>
>
>
|
|
>







1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
		      (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW
			  (if (null? tal)
			      #f ;; yes, really
			      (list (car tal)(cdr tal) reg reruns))
			  (begin
			    (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60)
				(debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))
			    (let* ((test-id      (rmt:get-test-id run-id hed item-path))
				   (test-info    (rmt:get-testinfo-state-status run-id test-id)) ;; we need *current* info
				   (status       (db:test-status test-info)))
			      (if (equal? status "KEEP_TRYING")
				  (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f)
				  (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)))
			    (hash-table-set! test-registry hed 'removed) ;; was 0
                            (if (not (and (null? reg) (null? tal)))
                                (runs:loop-values tal reg reglen regfull reruns)
                                #f))))
		     (else
		      (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60)
			  (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now."))