Megatest

Check-in [b1c53d218a]
Login
Overview
Comment: Turn off throttling in dashboard. Tweaked values and got proper output from the journal sensor.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.81-journal-based-throttling
Files: files | file ages | folders
SHA1: b1c53d218a69287cda4a3271be31a88b41967d45
User & Date: mrwellan on 2024-07-11 11:06:09
Other Links: branch diff | manifest | tags
Context
2024-07-11
13:28
Added setcicd script for registering branches ready to merge check-in: 3ced0f4705 user: mrwellan tags: v1.81-journal-based-throttling
11:06
Turn off throttling in dashboard. Tweaked values and got proper output from the journal sensor check-in: b1c53d218a user: mrwellan tags: v1.81-journal-based-throttling
06:06
Merged in latest from v1.81 check-in: afc4721a06 user: mrwellan tags: v1.81-journal-based-throttling
Changes

Modified dashboard.scm from [e5b27b795a] to [89bc431aeb].

166
167
168
169
170
171
172


173
174
175
176
177
178
179
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181







+
+







;;    please-update:        #t
;;    update-mutex:         (make-mutex)
;;    updaters:             (make-hash-table)
;;    updating:             #f
;;    hide-not-hide-tabs:   #f
;;    target:               ""
;;    ))

(set! *journal-stats-enable* #f)

;;======================================================================
;; buttons color using image
;;======================================================================

(define *images* (make-hash-table))

Modified rmt.scm from [07ade4ba5d] to [e8bc4b391f].

100
101
102
103
104
105
106
107
108


109
110
111
112
113
114
115
100
101
102
103
104
105
106


107
108
109
110
111
112
113
114
115







-
-
+
+







    (if (and (not *journal-stats*)
	     (file-exists? dbdir))
	(tt:start-stats dbdir)) ;; fixme - find the right call to get the db directory
    
    ;; check the load on dbfname and add some delay using a droop curve of sorts
    (if *journal-stats*
	(let* ((load  (tt:get-journal-stats dbfname)))
	  (if (> load 0)
	      (let ((dely (* 10 load)))
	  (if (> load 0.1) ;; start activating delay at 10% journal load time
	      (let ((dely (* 50 (* load load)))) ;; 100% journal time=50sec delay
		(debug:print 0 *default-log-port* "Journal load "load" delaying queries "dely"s.")
		(thread-sleep! dely)))))
	
    (case (rmt:transport-mode)
      ((tcp)
       (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
	      (attemptnum    (+ 1 attemptnum))

Modified runs.scm from [832a86263e] to [adfae1025a].

1152
1153
1154
1155
1156
1157
1158
1159

1160
1161


1162

1163
1164
1165
1166
1167
1168
1169
1152
1153
1154
1155
1156
1157
1158

1159
1160
1161
1162
1163

1164
1165
1166
1167
1168
1169
1170
1171







-
+


+
+
-
+







	 (have-resources         (and (if *journal-stats*
					  (let* ((dbfname (conc
							   (dbfile:run-id->dbnum run-id)
							   ".db"))
						 (load (tt:get-journal-stats dbfname)))
					    (if (> load 0.1) ;; dbs too busy to start more tests
						(begin
						 (debug:print-info 0 *default-log-port* "Gating launch due to db load "load" based on journal file observations for "dbfname)
						  (debug:print-info 0 *default-log-port* "Gating launch due to db load "load" based on journal file observations for "dbfname)
						 #f)
						#t))
					  (begin
					    (debug:print-info 0 *default-log-port* "Journal gating not started for "run-id)
					  #t) ;; if journal monitoring not started do not gate
					    #t)) ;; if journal monitoring not started do not gate
				      (car run-limits-info)))
	 (num-running            (list-ref run-limits-info 1))
	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
	 (max-concurrent-jobs    (list-ref run-limits-info 3))
	 (job-group-limit        (list-ref run-limits-info 4))
	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
1386
1387
1388
1389
1390
1391
1392
1393

1394
1395
1396
1397
1398
1399
1400
1388
1389
1390
1391
1392
1393
1394

1395
1396
1397
1398
1399
1400
1401
1402







-
+







		      (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW
			  (if (null? tal)
			      #f ;; yes, really
			      (list (car tal)(cdr tal) reg reruns))
			  (begin
			    (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60)
				(debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))
			    (let* ((test-id      (rmt:get-test-id run-id testname item-path))
			    (let* ((test-id      (rmt:get-test-id run-id hed item-path))
				   (test-info    (rmt:get-testinfo-state-status run-id test-id)) ;; we need *current* info
				   (status       (db:test-status test-info)))
			      (if (equal? status "KEEP_TRYING")
				  (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f)
				  (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)))
			    (hash-table-set! test-registry hed 'removed) ;; was 0
                            (if (not (and (null? reg) (null? tal)))

Modified tcp-transportmod.scm from [172e93584b] to [d0258d10eb].

1139
1140
1141
1142
1143
1144
1145

1146
1147
1148
1149
1150
1151
1152
1153

1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167














1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186


















1187
1188
1189
1190
1191
1192
1193
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155














1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170


















1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195







+








+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







;; jstats: journal-activity tallies. One record is kept per time block in
;; *journal-stats*, and tt:get-journal-stats uses a fresh record as its
;; accumulator when summing across time blocks.
;;   count  - number of observations (directory scans) recorded
;;   jcount - hash table: db file name => number of journal-file sightings
(defstruct jstats
  (count 0)
  (jcount (make-hash-table)) ;; 1.db => journal_count
  )

;; timeblk => jstats
;; Stays #f until tt:journal-stats-run replaces it with a hash table.
;; Keys are (quotient (current-seconds) 10), i.e. 10-second time blocks.
(define *journal-stats* #f) ;; (make-hash-table))
;; Switch checked by tt:write-load-tracking and tt:get-journal-stats.
(define *journal-stats-enable* #t) ;; change to #f to turn off

;; monte-carlo-esque random sampling of journal files
;; for all the files:
;;   if .journal
;;      update stats +1 +1
;;      update stats +1  0
;;
(define (tt:write-load-tracking dbdir)
  (if *journal-stats-enable*
  (let* ((cs    (current-seconds))
	 (key   (inexact->exact (quotient cs 10)))
	 (old   (- key 5)) ;; 4 x 10 seconds ago
	 (jstat (if (hash-table-exists? *journal-stats* key)
		    (hash-table-ref *journal-stats* key )
		    (let ((new (make-jstats)))
		      (hash-table-set! *journal-stats* key new)
		      new))))
    ;; clear out old records
    (for-each
     (lambda (key)
       (if (< key old)
	   (hash-table-delete! *journal-stats* key)))
     (hash-table-keys *journal-stats*))
      (let* ((cs    (current-seconds))
	     (key   (inexact->exact (quotient cs 10)))
	     (old   (- key 5)) ;; 4 x 10 seconds ago
	     (jstat (if (hash-table-exists? *journal-stats* key)
			(hash-table-ref *journal-stats* key )
			(let ((new (make-jstats)))
			  (hash-table-set! *journal-stats* key new)
			  new))))
	;; clear out old records
	(for-each
	 (lambda (key)
	   (if (< key old)
	       (hash-table-delete! *journal-stats* key)))
	 (hash-table-keys *journal-stats*))

    ;; increment our count of observations
    (jstats-count-set! jstat (+ (jstats-count jstat) 1))
    
    ;; now find and increment journal file counts
    (directory-fold
     (lambda (fname res)
       ;; is it a journal file?
       (let ((parts (string-match "^(.*\\.db)-journal.*" fname)))
	 (match parts
	   ((_ dbfname)
	    (hash-table-set! (jstats-jcount jstat) dbfname
			     (+ (hash-table-ref/default (jstats-jcount jstat) dbfname 0) 1.0)
			     ))
	   (else #f)
	   )))
     '()
     dbdir 
     )))
	;; increment our count of observations
	(jstats-count-set! jstat (+ (jstats-count jstat) 1))
	
	;; now find and increment journal file counts
	(directory-fold
	 (lambda (fname res)
	   ;; is it a journal file?
	   (let ((parts (string-match "^(.*\\.db)-journal.*" fname)))
	     (match parts
		    ((_ dbfname)
		     (hash-table-set! (jstats-jcount jstat) dbfname
				      (+ (hash-table-ref/default (jstats-jcount jstat) dbfname 0) 1.0)
				      ))
		    (else #f)
		    )))
	 '()
	 dbdir 
     ))))

;; Locked at the top of tt:journal-stats-run's loop and held by
;; tt:get-journal-stats while it walks *journal-stats*.
(define *journal-stats-mutex* (make-mutex))

(define (tt:journal-stats-run dbdir)
  (if (not *journal-stats*)(set! *journal-stats* (make-hash-table)))
  (let loop ()
    (mutex-lock! *journal-stats-mutex*)
1202
1203
1204
1205
1206
1207
1208
1209


1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229

1230
1231
1232
1233
1234
1235
1236
1204
1205
1206
1207
1208
1209
1210

1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231

1232
1233
1234
1235
1236
1237
1238
1239







-
+
+



















-
+







  (thread-start!
   (make-thread
    (lambda ()(tt:journal-stats-run dbdir)) "Journal stats collection thread")))

(define (tt:get-journal-stats #!optional (dbfname #f))
  (let* ((result    (make-jstats))
	 (hitcounts (jstats-jcount result)))
    (if *journal-stats*
    (if (and *journal-stats*
	     *journal-stats-enable*)
	(begin
	  (mutex-lock! *journal-stats-mutex*)
	  (hash-table-for-each
	   *journal-stats*
	   (lambda (k v) ;; key jstats
	     (let* ((count  (jstats-count v))
		    (jcount (jstats-jcount v))) ;; dbfname => hit count
	       (jstats-count-set! result
				  (+ (jstats-count result)
				     (jstats-count v)))
	       (hash-table-for-each
		jcount
		(lambda (dbfname hit-count)
		  (hash-table-set! hitcounts dbfname
				   (+ hit-count
				      (hash-table-ref/default hitcounts dbfname 0))))))))
	  (mutex-unlock! *journal-stats-mutex*))
	(debug:print 0 *default-log-port* "INFO: *journal-stats* not set."))
    ;; convert to normalized alist
    (let* ((tot  (min (jstats-count result) 1)) ;; avoid divide by zero
    (let* ((tot  (max (jstats-count result) 1)) ;; avoid divide by zero
	   (hits (jstats-jcount result)) ;; 1.db => count
	   (res  (hash-table-map
		  hits
		  (lambda (fname hitcount)
		    (cons fname (/ hitcount tot))))))
      (if dbfname
	  (or (alist-ref dbfname res equal?) 0)