Megatest

Check-in [1fbf00b8f7]
Login
Overview
Comment:Gate tests running on load and max jobs right before launching
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: 1fbf00b8f79339c251d54d1acca99e7166c0748c
User & Date: matt on 2020-08-30 09:54:03
Other Links: branch diff | manifest | tags
Context
2020-08-30
15:44
updated megatest version to 1.6566 check-in: d66b6aae6b user: mmgraham tags: v1.65-cleanup, v1.6566
09:54
Gate tests running on load and max jobs right before launching check-in: 1fbf00b8f7 user: matt tags: v1.65-cleanup
00:14
Gate when first detect that all job slots are in use. No point in looping until slots are available. ==7.1/1.6/WARN/1201/mars== check-in: cd1100885b user: matt tags: v1.65-cleanup
Changes

Modified runs.scm from [8b4951a591] to [6de703a6c2].

49
50
51
52
53
54
55

56
57
58
59
60
61
62
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63







+







  test-patts required-tests test-registry
  registry-mutex flags keyvals run-info all-tests-registry
  can-run-more-tests
  ((can-run-more-tests-count 0) : fixnum)
  (last-fuel-check         0)  ;; time when we last checked fuel
  (beginning-of-time       (current-seconds))
  (load-mgmt-function      #f)
  (wait-for-jobs-function  #f)
  )

;; runs:testdat - record holding the per-test state that is passed (as `testdat`)
;; to runs:process-expanded-tests alongside the runs:dat record.
;;
;; The prereqs-not-met field is written through runs:testdat-prereqs-not-met-set!
;; with the result of rmt:get-prereqs-not-met.
;;
;; NOTE(review): hed/tal/reg/reruns presumably mirror the arguments of the
;; run-queue `loop` (current test, remaining tests, registry list, reruns) -
;; confirm against the caller that fills this record.
(defstruct runs:testdat
  hed tal reg reruns  test-record
  test-name item-path jobgroup
  waitons testmode  newtal itemmaps prereqs-not-met)
  
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690




1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710






























1711
1712
1713
1714
1715
1716
1717
1675
1676
1677
1678
1679
1680
1681



1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692




















1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729







-
-
-







+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







	 ;; 
	 ((not items)
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-2")
	  (debug:print-info 4 *default-log-port* "OUTER COND: (not items)")
	  (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
		   (not (null? tal)))
	      (loop (car tal)(cdr tal) reg reruns))
	  (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))

	  ;; This would be a good place to block on homehost load

	  ;; gonna try a strategy change here.
	  ;;
	  ;; check if can run more tests. if yes, continue, if no, rest for 10 seconds, check again
	  ;; repeat until can run more tests
	  ;;
	  ;; look at the test jobgroup and tot jobs running
	  (if (not (runs:dat-wait-for-jobs-funcion runsdat))
	      (runs:dat-wait-for-jobs-function-set!
	       runsdat 
	       (lambda ()
	  (let loop-can-run-more
	      ((res      (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
	       (remtries 60))
	    (match res
              ((run-more num-running . rem)
	       (if (or run-more
		       (< remtries 1))
		   (begin
		     (if (runs:lownoise "num-running" 30)
			 (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs))
		     (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through
		   (begin
		     (if (runs:lownoise "num-running" 10)
			 (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of "
					   max-concurrent-jobs " allowed."))
		     (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable
		     (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
					(- remtries 1)))))))
	      
	  (let ((loop-list (runs:process-expanded-tests runsdat testdat)))
		 (let loop-can-run-more
		     ((res      (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
		      (remtries 1440)) ;; we can wait for up to two hours for jobs to get done
		   (match res
			  ((run-more num-running . rem)
			   (if (or run-more
				   (< remtries 1))
			       (begin
				 (if (runs:lownoise "num-running" 30)
				     (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs))
				 (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through
			       (begin
				 (if (runs:lownoise "num-running" 10)
				     (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of "
						       max-concurrent-jobs " allowed."))
				 (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable
				 
				 ;; wait for load here
				 (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
				 (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
						    (- remtries 1)))))))
		 )))

	  ;; I'm not clear on why prereqs are gathered here TODO: verify this is needed
	  (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))

	  ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
	  (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))

	  (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
            (if loop-list (apply loop loop-list))))

	 ;; items processed into a list but not came in as a list been processed
	 ;;
	 ((and (list? items)     ;; thus we know our items are already calculated
	       (not   itemdat))  ;; and not yet expanded into the list of things to be done
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-3")
2083
2084
2085
2086
2087
2088
2089





2090
2091
2092
2093
2094








2095
2096
2097
2098
2099
2100
2101
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106





2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121







+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+







		 (if skip-test
		     (begin
		       (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test)
		       (debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test))
		     ;;
		     ;; Here the test is handed off to launch.scm for launch-test to complete the launch process
		     ;;
		     (begin
		       ;; wait for less than max jobs here
		       (if (runs:dat-wait-for-jobs-function runsdat)
			   ((runs:dat-wait-for-jobs-function runsdat)))
		       
		     (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
			 (begin
			   (print "ERROR: Failed to launch the test. Exiting as soon as possible")
			   (set! *globalexitstatus* 1) ;; 
			   (process-signal (current-process-id) signal/kill))))))))
		       (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
			   (begin
			     (print "ERROR: Failed to launch the test. Exiting as soon as possible")
			     (set! *globalexitstatus* 1) ;; 
			     (process-signal (current-process-id) signal/kill))
			   )
		       ;; wait again here?
		       ))))))
	((KILLED) 
	 (debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.")
	 (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED))
	((LAUNCHED REMOTEHOSTSTART RUNNING)  
	 (debug:print 2 *default-log-port* "NOTE: " test-name " is already running"))
	;; (if (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	;; 			       (db:test-get-run_duration testdat)))