Megatest

Diff
Login

Differences From Artifact [26d43d79a0]:

To Artifact [03e715c08b]:


661
662
663
664
665
666
667
668

669
670
671
672
673
674
675
	  ;; (db:test-remove-steps db run-id testname itemdat)
	  ;; now is also a good time to write the .testconfig file
	  (let* ((tconfig-fname   (conc work-area "/.testconfig"))
		 (tconfig-tmpfile (conc tconfig-fname ".tmp"))
		 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
		 (scripts (configf:get-section tconfig "scripts")))
	    ;; create .testconfig file
	    (configf:write-alist tconfig tconfig-tmpfile)

	    (move-file tconfig-tmpfile tconfig-fname #t)
	    (delete-file* ".final-status")

	    ;; extract scripts from testconfig and write them to files in test run dir
	    (for-each
	     (lambda (scriptdat)
	       (match scriptdat







|
>







661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
	  ;; (db:test-remove-steps db run-id testname itemdat)
	  ;; now is also a good time to write the .testconfig file
	  (let* ((tconfig-fname   (conc work-area "/.testconfig"))
		 (tconfig-tmpfile (conc tconfig-fname ".tmp"))
		 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
		 (scripts (configf:get-section tconfig "scripts")))
	    ;; create .testconfig file
	    (configf:write-alist tconfig tconfig-tmpfile #t) ;; the #t forces a check of the written data
	    (assert (file-exists? tconfig-tmpfile) "FATAL: We just wrote the dang file, how can it not exist?")
	    (move-file tconfig-tmpfile tconfig-fname #t)
	    (delete-file* ".final-status")

	    ;; extract scripts from testconfig and write them to files in test run dir
	    (for-each
	     (lambda (scriptdat)
	       (match scriptdat
1468
1469
1470
1471
1472
1473
1474


1475
1476
1477
1478
1479
1480
1481
					(list 'target    mt_target)
					(list 'contour   contour)
					(list 'runtlim   (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
					(list 'env-ovrd  (hash-table-ref/default *configdat* "env-override" '())) 
					(list 'set-vars  (if params (hash-table-ref/default params "-setvars" #f)))
					(list 'runname   runname)
					(list 'mt-bindir-path mt-bindir-path))))))))


      
      ;; clean out step records from previous run if they exist
      ;; (rmt:delete-test-step-records run-id test-id)
      ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway
      (if (common:file-exists? work-area)
	  (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir
      (cond







>
>







1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
					(list 'target    mt_target)
					(list 'contour   contour)
					(list 'runtlim   (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
					(list 'env-ovrd  (hash-table-ref/default *configdat* "env-override" '())) 
					(list 'set-vars  (if params (hash-table-ref/default params "-setvars" #f)))
					(list 'runname   runname)
					(list 'mt-bindir-path mt-bindir-path))))))))

      (setenv "MT_CMDINFO" cmdparms)  ;; setting this for use in nblauncher
      
      ;; clean out step records from previous run if they exist
      ;; (rmt:delete-test-step-records run-id test-id)
      ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway
      (if (common:file-exists? work-area)
	  (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir
      (cond
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here

(define (runs:end-of-run-check run-id )
  (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
	 (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
	 (all-test-launched (rmt:get-var run-id (conc "lunch-complete-" run-id)))
	 (current-state (rmt:get-run-state run-id))
	 (current-status (rmt:get-run-status run-id)))
    ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
    (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      
    (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
    (runs:update-junit-test-reporter-xml run-id) 
    (cond 
     ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
      (if (and (equal? (rmt:get-var run-id (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
	  (begin
	    (debug:print 4 *default-log-port* "look for  post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var run-id (conc "end-of-run-" run-id)))
	    (debug:print 0 *default-log-port* "End of Run Detected.")
	    (rmt:set-var run-id (conc "end-of-run-" run-id) "yes")
					;(thread-sleep! 10)
	    (runs:run-post-hook run-id)
	    (debug:print 4 *default-log-port* "currseconds: " (current-seconds)" eor: " (rmt:get-var run-id (conc "end-of-run-" run-id)))
	    (common:simple-unlock (conc "endOfRun" run-id)))
	  (debug:print 0 *default-log-port* "End of Run Detected but not running post hook. This should happen when eor is set to yes. This will happen only when 2 tests exit at smae time. eor= " (rmt:get-var run-id (conc "end-of-run-" run-id)))))
     ((> running-cnt 3) 
      (debug:print 0 *default-log-port* "There are " running-cnt " tests running." ))
     ((> running-cnt 0)
      (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" )
      (let ((kill-cnt (launch:kill-tests-if-dead run-id)))
	(if (and all-test-launched  (equal? all-test-launched "yes") (eq? kill-cnt running-cnt))
	    (runs:end-of-run-check run-id)))) ;;todo







|

















|







1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
;; > 3 RUNNING with no test_dead: do nothing (run should already be RUNNING/n/a)
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here

(define (runs:end-of-run-check run-id )
  (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
	 (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
	 (all-test-launched (rmt:get-var run-id (conc "launch-complete-" run-id)))
	 (current-state (rmt:get-run-state run-id))
	 (current-status (rmt:get-run-status run-id)))
    ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
    (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      
    (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
    (runs:update-junit-test-reporter-xml run-id) 
    (cond 
     ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
      (if (and (equal? (rmt:get-var run-id (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
	  (begin
	    (debug:print 4 *default-log-port* "look for  post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var run-id (conc "end-of-run-" run-id)))
	    (debug:print 0 *default-log-port* "End of Run Detected.")
	    (rmt:set-var run-id (conc "end-of-run-" run-id) "yes")
					;(thread-sleep! 10)
	    (runs:run-post-hook run-id)
	    (debug:print 4 *default-log-port* "currseconds: " (current-seconds)" eor: " (rmt:get-var run-id (conc "end-of-run-" run-id)))
	    (common:simple-unlock (conc "endOfRun" run-id)))
	  (debug:print 0 *default-log-port* "End of Run Detected but not running post hook. This should happen when eor is set to yes. This will happen only when 2 tests exit at same time. eor= " (rmt:get-var run-id (conc "end-of-run-" run-id)))))
     ((> running-cnt 3) 
      (debug:print 0 *default-log-port* "There are " running-cnt " tests running." ))
     ((> running-cnt 0)
      (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" )
      (let ((kill-cnt (launch:kill-tests-if-dead run-id)))
	(if (and all-test-launched  (equal? all-test-launched "yes") (eq? kill-cnt running-cnt))
	    (runs:end-of-run-check run-id)))) ;;todo
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949

1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
		      (if (not (null? tal))
			  (loop (car tal) (cdr tal)))))))))))

;; Calls rmt:find-and-mark-incomplete for run-id (passing ovr-deadtime through
;; unchanged), then calls runs:end-of-run-check for the same run and returns
;; whatever that check returns.
(define (runs:find-and-mark-incomplete-and-check-end-of-run run-id ovr-deadtime)
  (rmt:find-and-mark-incomplete run-id ovr-deadtime)
  (runs:end-of-run-check run-id))



;; Walk the tests of run-id that are in state RUNNING, LAUNCHED or
;; REMOTEHOSTSTART and request a kill (state KILLREQ, status n/a) for each one
;; that looks dead. Returns the number of tests for which KILLREQ was set.
;;
;; NOTE(review): the named let takes (car running-tests) unconditionally, so
;; an empty result list from rmt:get-tests-for-run raises an error here.
(define (launch:kill-tests-if-dead run-id)
  (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f)))
       ;; loop state: current test record, remaining records, running kill count
       (let loop ((running-test (car running-tests))
			     (tal    (cdr running-tests))
			     (kill-cnt 0))
		       ;; fields are read from the test record vector by position
		       (let* ((test-name (vector-ref running-test 2))
                 (item-path (vector-ref running-test 11))
								 (test-id (vector-ref running-test 0))
                 (host (vector-ref running-test 6))
                 (pid  (rmt:test-get-top-process-pid run-id test-id))   
                 (event-time (vector-ref running-test 5))
                 (duration (vector-ref running-test 12))
                 ;; flag is set to 1 below when this test gets a kill request
                 (flag 0)   
                 (curr-time (current-seconds)))

       ;; stale = event-time + duration is more than 600 seconds in the past
       (if (and (< (+ event-time duration 600) curr-time) (not (launch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed
           (begin    
			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
              (set! flag 1) 
              (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f)))
               ;; continue with the next record, or return the final count
               (if (not (null? tal))
				  (loop (car tal) (cdr tal) (+ kill-cnt flag))
                 (+ kill-cnt flag))))))


(define (runs:run-post-hook run-id)
    (let* ((run-post-hook   (configf:lookup *configdat* "runs" "post-hook"))
           (existing-tests (if run-post-hook
                               (rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses
                                                      #f #f ;; offset limit







|
<


|
|
|
|
|
|
|
|
|
|
|
|
>
|
|
|


|
|
|







1930
1931
1932
1933
1934
1935
1936
1937

1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
		      (if (not (null? tal))
			  (loop (car tal) (cdr tal)))))))))))

;; Two-step helper for run-id:
;;   1. call rmt:find-and-mark-incomplete with run-id and ovr-deadtime
;;   2. call runs:end-of-run-check for run-id
;; The value of the end-of-run check is the value of this procedure.
(define runs:find-and-mark-incomplete-and-check-end-of-run
  (lambda (run-id ovr-deadtime)
    (rmt:find-and-mark-incomplete run-id ovr-deadtime)
    (runs:end-of-run-check run-id)))

;; only called if there are more than zero running tests

;; Request a kill for every test of run-id that appears to be dead.
;;
;; Queries the tests of run-id in state RUNNING, LAUNCHED or REMOTEHOSTSTART.
;; A test is treated as dead when both of the following hold:
;;   - event-time + duration is more than 600 seconds (10 minutes) in the
;;     past, i.e. the test has not updated its duration recently, and
;;   - launch:is-test-alive reports that its top process on its host is not
;;     alive.
;; Each dead test is set to state KILLREQ with status n/a.
;;
;; Returns the number of tests for which KILLREQ was set. Returns 0 when the
;; query yields no tests; the caller's running count comes from a separate,
;; earlier query, so the list may legitimately be empty by the time we look.
(define (launch:kill-tests-if-dead run-id)
  (let ((running-tests (rmt:get-tests-for-run run-id "%" '("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") '() #f #f #f #f #f #f #f #f)))
    (let loop ((remaining running-tests)
               (kill-cnt  0))
      (if (null? remaining)
          kill-cnt
          (let* ((running-test (car remaining))
                 ;; fields are read from the test record vector by position
                 (test-name  (vector-ref running-test 2))
                 (item-path  (vector-ref running-test 11))
                 (test-id    (vector-ref running-test 0))
                 (host       (vector-ref running-test 6))
                 (pid        (rmt:test-get-top-process-pid run-id test-id))
                 (event-time (vector-ref running-test 5))
                 (duration   (vector-ref running-test 12))
                 (curr-time  (current-seconds))
                 ;; the staleness test runs first so the liveness probe is only
                 ;; made for tests that have been silent for over 10 minutes
                 (dead?      (and (< (+ event-time duration 600) curr-time)
                                  (not (launch:is-test-alive host pid)))))
            (if dead?
                (begin
                  (debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
                  (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f)))
            (loop (cdr remaining)
                  (if dead? (+ kill-cnt 1) kill-cnt)))))))


(define (runs:run-post-hook run-id)
    (let* ((run-post-hook   (configf:lookup *configdat* "runs" "post-hook"))
           (existing-tests (if run-post-hook
                               (rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses
                                                      #f #f ;; offset limit