Megatest

Check-in [b60108422e]
Login
Overview
Comment: Check disk usage every 30 seconds instead of every 3 seconds
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: b60108422e9d9d99ada061488db9515308500586
User & Date: mrwellan on 2017-05-24 14:48:04
Other Links: branch diff | manifest | tags
Context
2017-05-24
14:50
Bumped version to v1.6415 check-in: 18a4e536c2 user: mrwellan tags: v1.64
14:48
Check disk usage every 30 seconds instead of every 3 seconds check-in: b60108422e user: mrwellan tags: v1.64
10:48
Sync from test to server only when cpu changed, disk changed by more than 10%, or more than 30 seconds have passed. check-in: 8ca2b352b3 user: mrwellan tags: v1.64
Changes

Modified launch.scm from [4b0e8bb2f3] to [e2b2d5cae9].

315
316
317
318
319
320
321

322
323
324
325
326
327
328
329
330
331
332
333
334
335
336

337
338
339
340
341
342


343
344
345
346
347
348
349
350
351
352
353
354
355
		    (if (steprun-good? logpro-used (launch:einf-exit-code exit-info))
			(if (not (null? tal))
			    (loop (car tal) (cdr tal) stepname))
			(debug:print 4 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping")))
		  (debug:print 4 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))))))

(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)

  (let* ((start-seconds (current-seconds))
	 (calc-minutes  (lambda ()
			  (inexact->exact 
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))

      (let* ((new-cpu-load (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                  (delta (abs (- load cpu-load))))
                             (if (> delta 0.1) ;; don't bother updating with small changes
                                 load
                                 #f)))
             (new-disk-free (let* ((df    (get-df (current-directory)))


                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free (> (current-seconds) (+ last-sync 30)))))
        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
	(set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
			    (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
						(time-exceeded (> run-seconds runtlim)))
					   (if time-exceeded
					       (begin
						 (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)







>
|














>
|
|
|
|
|
|
>
>





|







315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
		    (if (steprun-good? logpro-used (launch:einf-exit-code exit-info))
			(if (not (null? tal))
			    (loop (car tal) (cdr tal) stepname))
			(debug:print 4 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping")))
		  (debug:print 4 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))))))

(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")))
         (start-seconds (current-seconds))
	 (calc-minutes  (lambda ()
			  (inexact->exact 
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
                                              (get-df (current-directory))
                                              disk-free))
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time)))
        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
	(set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
			    (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
						(time-exceeded (> run-seconds runtlim)))
					   (if time-exceeded
					       (begin
						 (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)

Modified tests/unittests/all-rmt.scm from [a8b587a3b0] to [47417391a0].

24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43

(test #f #t (vector? (rmt:get-connection-info toppath))) ;; TODO: push areapath down.
(test #f #t (string? (server:check-if-running ".")))
;; DEF (test #f #f (rmt:send-receive-no-auto-client-setup *runremote* 'get-keys #f '()))
;; DEF (rmt:kill-server run-id)
;; DEF (rmt:start-server run-id)
(test #f '(#t "successful login")(rmt:login #f))

(test-batch rmt:login
            "rmt:login"
            (list (list "good" (list #t "successful login") #f)
                  (list "bad"  (list #f "login failed")     #t)))

;; DEF (rmt:login-no-auto-client-setup connection-info)
(test #f #t (pair? (rmt:get-latest-host-load (get-host-name))))

;; get-latest-host-load does a lookup in the db, it won't return a useful value unless
;; a test ran recently on host
(test-batch rmt:get-latest-host-load
            "rmt:get-latest-host-load"







<
<
<
<
<
<







24
25
26
27
28
29
30






31
32
33
34
35
36
37

(test #f #t (vector? (rmt:get-connection-info toppath))) ;; TODO: push areapath down.
(test #f #t (string? (server:check-if-running ".")))
;; DEF (test #f #f (rmt:send-receive-no-auto-client-setup *runremote* 'get-keys #f '()))
;; DEF (rmt:kill-server run-id)
;; DEF (rmt:start-server run-id)
(test #f '(#t "successful login")(rmt:login #f))






;; DEF (rmt:login-no-auto-client-setup connection-info)
(test #f #t (pair? (rmt:get-latest-host-load (get-host-name))))

;; get-latest-host-load does a lookup in the db, it won't return a useful value unless
;; a test ran recently on host
(test-batch rmt:get-latest-host-load
            "rmt:get-latest-host-load"
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
      (tpt  "%/%")) ;; target patt
  (test-batch rmt:get-runs-by-patt
              "rmt:get-runs-by-patt"
              (list (list "t=0" #t keys rnp tpt #f #f #f 0)
                    (list "t=current" #f keys rnp tpt #f #f #f (+ 100 (current-seconds))) ;; should be no records from the future
                    )
              post-proc: (lambda (res)
                           (print "rmt:get-runs-by-patt returned: " res)
                           (and (vector? res)
                                (let ((rows (vector-ref res 1)))
                                  (> (length rows) 0))))))

;; (rmt:find-and-mark-incomplete run-id ovr-deadtime)
;; (rmt:get-main-run-stats run-id)
;; (rmt:get-var varname)







|







104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
      (tpt  "%/%")) ;; target patt
  (test-batch rmt:get-runs-by-patt
              "rmt:get-runs-by-patt"
              (list (list "t=0" #t keys rnp tpt #f #f #f 0)
                    (list "t=current" #f keys rnp tpt #f #f #f (+ 100 (current-seconds))) ;; should be no records from the future
                    )
              post-proc: (lambda (res)
                           ;; (print "rmt:get-runs-by-patt returned: " res)
                           (and (vector? res)
                                (let ((rows (vector-ref res 1)))
                                  (> (length rows) 0))))))

;; (rmt:find-and-mark-incomplete run-id ovr-deadtime)
;; (rmt:get-main-run-stats run-id)
;; (rmt:get-var varname)