Megatest

Check-in [545878c107]
Login
Overview
Comment:dechatter steps
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.81-dechatter | v1.81-fix-extract-scripts
Files: files | file ages | folders
SHA1: 545878c107dcf12a23e0fcd1e5586988b11211b6
User & Date: matt on 2024-08-19 20:17:15
Other Links: branch diff | manifest | tags
Context
2024-08-23
01:25
wip check-in: 5a118f1a51 user: matt tags: v1.81-dechatter, v1.81-fix-extract-scripts
2024-08-19
20:17
dechatter steps check-in: 545878c107 user: matt tags: v1.81-dechatter, v1.81-fix-extract-scripts
2024-08-13
12:55
removed extra copy of launch:extract-scripts-logpro and corrected it to add .logpro to the logpro filenames check-in: e829926867 user: mmgraham tags: v1.81-fix-extract-scripts
Changes

Modified launch.scm from [44ed5734bd] to [aa2fedc7f8].

250
251
252
253
254
255
256




257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277



278
279

280
281







282
283
284
285
286
287
288
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time))





             (test-info   (rmt:get-test-state-status-by-id run-id test-id))
             (state       (car test-info));; (db:test-get-state test-info))
             (status      (cdr test-info));; (db:test-get-status test-info))
	     (killreq     (equal? state "KILLREQ"))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         (killreq
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)



        (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
	    (launch:handle-zombie-tests run-id))

        (when do-sync
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))







        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second







>
>
>
>
|
|
|
|










|






>
>
>


>

|
>
>
>
>
>
>
>







250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time))

	     ;;
	     ;; MOVE THIS TO A FILE FLAG BASED APPROACH (FOR NOW)
	     ;;
	     
             ;; (test-info   (rmt:get-test-state-status-by-id run-id test-id))
             ;; (state       (car test-info));; (db:test-get-state test-info))
             ;; (status      (cdr test-info));; (db:test-get-status test-info))
	     (killreq     (file-exists? (conc work-area"/kill-test"))) ;; (equal? state "KILLREQ"))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         (killreq
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         #;((equal? status "DEAD") ;; NEED ALTERNATIVE MECHANISM FOR THIS.
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)

	;; revisit logic in zombie handling.
	;;
        (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
	    (launch:handle-zombie-tests run-id))
	
        (when do-sync
          ;; (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)

	  (let ((oup (open-output-file (conc work-area"/.run-logging-stats.csv" #:append))))
	    (with-output-to-port oup
	      (lambda ()
		(print run-id","test-id","new-cpu-load","new-disk-free","(calc-minutes))))
	    (close-output-port oup))	  
	  )
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second