Megatest

Diff
Login

Differences From Artifact [d0067277fa]:

To Artifact [b35a784abe]:


166
167
168
169
170
171
172

173



174
175
176
177
178
179
180
181
182
183
184


185
186
187
188
189
190
191
166
167
168
169
170
171
172
173

174
175
176
177
178
179
180
181
182
183
184
185
186

187
188
189
190
191
192
193
194
195







+
-
+
+
+










-
+
+







              (set! ezsteps #t) ;; set the needed flag
	      (set! ezstepslst
                    (append (or ezstepslst '())
                            (list (list "subrun" (conc "{subrun=true} " mt-cmd)))))))

	;; process the ezsteps
	(if ezsteps
	    (let* ((envdbf        (conc "/tmp/."(current-user-name)"-"(current-process-id)"-"run-id"-"test-id".db"))
	    (let* ((all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; where  'params is the params list (add other stuff as needed)
		   (all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist;
	                                              ;;; where  'params is the params list (add other
	                                              ;;; stuff as needed)
	      (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps"))
	      ;; if ezsteps was defined then we are sure to have at least one step but check anyway
	      (if (not (> (length ezstepslst) 0))
		  (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length")
		  (let loop ((ezstep (car ezstepslst))
			     (tal    (cdr ezstepslst))
			     (prevstep #f))
                    (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"")
		    ;; check exit-info (vector-ref exit-info 1)
		    (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1)
			(let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat))
			(let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m
							    tal testconfig all-steps-dat prevstep envdbf))
			       (stepname    (car ezstep))
			       (stepparms   (hash-table-ref all-steps-dat stepname)))
			  (setenv "MT_STEP_NAME" stepname)
			  (pp (hash-table->alist all-steps-dat))
			  ;; if logpro-used read in the stepname.dat file
			  (if (and logpro-used (common:file-exists? (conc stepname ".dat")))
			      (launch:load-logpro-dat run-id test-id stepname))
203
204
205
206
207
208
209
210

211
212
213
214
215
216



217
218
219
220
221
222
223
207
208
209
210
211
212
213

214
215
216
217
218
219

220
221
222
223
224
225
226
227
228
229







-
+





-
+
+
+







			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
      ;; top of loop encountered at "(current-seconds)" with
      ;; last-sync="last-sync))
      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
231
232
233
234
235
236
237
238

239
240
241
242
243
244
245
246
247

248
249
250
251
252
253
254

255
256
257
258

259
260
261
262
263
264
265
266
267
237
238
239
240
241
242
243

244
245
246
247
248
249
250
251
252

253
254
255
256
257
258
259

260




261


262
263
264
265
266
267
268







-
+








-
+






-
+
-
-
-
-
+
-
-







             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-info-by-id run-id test-id))
             (state       (db:test-get-state test-info))
             (status      (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
        (if do-sync ;; save meta data about the running of this test
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
	    (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
	      ;;       between tries?
310
311
312
313
314
315
316
317

318
319
320
321
322
323
324
311
312
313
314
315
316
317

318
319
320
321
322
323
324
325







-
+







	    (begin
	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
                        (or new-disk-free disk-free)
                        (if do-sync (current-seconds) last-sync)))))))
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional


(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
    ;;(bb-check-path msg: "launch:execute incoming")
463
464
465
466
467
468
469



470
471
472
473
474
475
476
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480







+
+
+







				(db:test-get-host        test-info)
				(begin
				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
				  (exit))))
		 (test-pid  (db:test-get-process_id  test-info)))
	    (cond
             ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
	     ;;((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	     ;;	  (and (equal? (db:test-get-state test-info) "COMPLETED")                           ;; completed/abort => rerun if asked
	     ;;	       (member (db:test-get-status test-info) '("ABORT"))))
	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")

              (rmt:general-call 'set-test-start-time #f test-id)
              (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      ) ;; prime it for running
729
730
731
732
733
734
735
736

737
738
739
740
741
742
743
733
734
735
736
737
738
739

740
741
742
743
744
745
746
747







-
+







;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here

(define (launch:end-of-run-check run-id )
    (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
           (running-cnt (rmt:get-count-tests-running-for-run-id run-id #f)) ;; fastmode=no
           (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
           (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
           (current-state (rmt:get-run-state run-id))
           (current-status (rmt:get-run-status run-id)))
     ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
     (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      
     (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
     (runs:update-junit-test-reporter-xml run-id) 
769
770
771
772
773
774
775


776

777
778
779
780
781
782
783
773
774
775
776
777
778
779
780
781

782
783
784
785
786
787
788
789







+
+
-
+







                 (item-path (vector-ref running-test 11)))
			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed")
              (if (not (null? tal))
				  (loop (car tal) (cdr tal)))))))))))        
 
(define (launch:is-test-alive host pid)
  (if (and host pid (not (equal? host "n/a")))
      (let* ((is-local (equal? host (get-host-name)))
	     (ssh-cmd   (if is-local " " (conc "ssh " host " ")))
      (let* ((cmd (conc "ssh " host " pstree -A " pid))
	     (cmd (conc ssh-cmd "pstree -A " pid))
	     (output (with-input-from-pipe cmd read-lines)))
	(debug:print 2 *default-log-port* "Running " cmd " received " output)
	(if (eq? (length output) 0)
	   #f
	   #t))
      #t))
 
1540
1541
1542
1543
1544
1545
1546
1547

1548
1549
1550
1551
1552
1553
1554
1555
1546
1547
1548
1549
1550
1551
1552

1553

1554
1555
1556
1557
1558
1559
1560







-
+
-







			      (hash-table-ref/default *configdat* "env-override" '())))
	     (miscprevvals   (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
			      (append (list (list "MT_TEST_RUN_DIR" work-area)
					    (list "MT_TEST_NAME" test-name)
					    (list "MT_ITEM_INFO" (conc itemdat)) 
					    (list "MT_RUNNAME"   runname)
					    (list "MT_TARGET"    mt_target)
					    (list "MT_ITEMPATH"  item-path)
					    (list "MT_ITEMPATH"  item-path))
					    )
				      itemdat)))
	     (testprevvals   (alist->env-vars
			      (hash-table-ref/default tconfig "pre-launch-env-overrides" '())))
	     ;; Launchwait defaults to true, must override it to turn off wait
	     (launchwait     (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t))
	     (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed.
					     process:cmd-run-with-stderr-and-exitcode->list