Megatest

Changes On Branch 2635b582e7273323
Login

Changes In Branch v1.81-journal-based-throttling Through [2635b582e7] Excluding Merge-Ins

This is equivalent to a diff from 5ac37f3fd4 to 2635b582e7

2024-07-11
13:34
cherry pick e8d7732e preq-fail tests in messages Leaf check-in: 2b4f2ebcce user: mmgraham tags: v1.81-preq-fail-details
2024-07-10
23:42
CI/CD: Automated commit after successful test, build, and deploy for v1.81-server-load-and-cores check-in: 600ba0778d user: fdiskadm tags: v1.81
20:11
Force values to be real in journal stats collection. Still broken though. check-in: c906466bb0 user: matt tags: v1.81-journal-based-throttling
18:32
Changed message on busy server candidate host to give actual load and number of cores Leaf check-in: dc040c6bd7 user: mmgraham tags: v1.81-server-load-and-cores
18:10
Gate test launch based on journal load. Values from load calc seem wrong. Should be 0-1.0 but seeing integers 0, 1, 2 ... check-in: 2635b582e7 user: mrwellan tags: v1.81-journal-based-throttling
17:44
Added journal-based statistical droop throttling of queries. check-in: fc6b05f924 user: mrwellan tags: v1.81-journal-based-throttling
11:03
Create new branch named "ricky_testing_to_delete" Closed-Leaf check-in: a59962edb1 user: ramartin tags: ricky_testing_to_delete
2024-07-09
19:26
wip check-in: 040bf225dc user: mrwellan tags: v1.81-journal-based-throttling
10:21
CI/CD: Automated commit after successful test, build, and deploy for v1.81-multi-server check-in: 5ac37f3fd4 user: ramartin tags: v1.81
08:53
Merged from v1.81 and fixed conflicts Leaf check-in: dbc22912d1 user: mrwellan tags: v1.81-multi-server
2024-07-08
12:55
CI/CD: Automated commit after successful test, build, and deploy for v1.81-fix-api-changed check-in: 0249193b68 user: ramartin tags: v1.81

Added docs/csirc version [ab27eade5c].



































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
;; csi startup file: give the interactive REPL line editing and a persistent
;; history. The setup differs by CHICKEN major version, hence the cond-expand.
(cond-expand
 (chicken-4
  ;; CHICKEN 4: use the readline egg. The REPL reads from a readline port and
  ;; a history file is installed.
  (use readline)
  (current-input-port (make-readline-port))
  ;; NOTE(review): presumably #f means "use the default home directory" and the
  ;; second argument is appended to it -- confirm against the readline egg docs.
  (install-history-file #f "/.csi.history")
  )
 (chicken-5
  ;; CHICKEN 5: use the breadline egg.
  (import (chicken load))
  (import (chicken format))
  (import (chicken process-context))
  (import (chicken process signal))
  (load-verbose #f)
  (let ()
    ;; Everything below is skipped when the INSIDE_EMACS environment variable
    ;; is set.
    (unless (get-environment-variable "INSIDE_EMACS")
      (import breadline)
      (import breadline-scheme-completion)
      ;; History file is $HOME/.csi_history.
      (history-file (format "~a/.csi_history" (get-environment-variable "HOME")))
      ;; NOTE(review): presumably caps the number of remembered history
      ;; entries at 10000 -- confirm against the breadline docs.
      (stifle-history! 10000)
      (completer-word-break-characters-set! "\"\'`;|(")
      (completer-set! scheme-completer)
      (basic-quote-characters-set! "\"|")
      (variable-bind! "blink-matching-paren" "on")
      ;; NOTE(review): units of this timeout are not visible here (likely
      ;; microseconds) -- confirm.
      (paren-blink-timeout-set! 200000)
      ;; Wrap the SIGINT handler that was installed before this point: run the
      ;; two signal clean-up procedures first, then delegate to the previous
      ;; handler with the same signal argument.
      (let ((handler (signal-handler signal/int)))
	(set-signal-handler! signal/int
			     (lambda (s)
			       (cleanup-after-signal!)
			       (reset-after-signal!)
			       (handler s))))
      ;; Register reset-terminal! to run at exit, then switch the REPL's input
      ;; over to the readline port.
      (on-exit reset-terminal!)
      (current-input-port (make-readline-port))))
  ))

Modified rmt.scm from [519878889b] to [a86edf18db].

90
91
92
93
94
95
96
97















98
99
100
101
102
103
104
105
106
107
108
109
110
;; NB// area-dat replaced by ttdat
;; 
(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f))
  (assert (or (not run-id) (number? run-id)) "FATAL: run-id is required to be a number or #f")
  (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
  (let* ((areapath      *toppath*) ;; TODO - resolve from dbstruct to be compatible with multiple areas
	 (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*))
	 (testsuite     (common:get-testsuite-name)))















    (case (rmt:transport-mode)
      ((tcp)
       (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
	      (attemptnum    (+ 1 attemptnum))
	      (mtexe         (common:find-local-megatest))
	      (dbfname       (conc (dbfile:run-id->dbnum run-id)".db"))
	      (ttdat         (rmt:set-ttdat areapath ttdat))
	      (conn          (tt:get-conn ttdat dbfname))
	      (is-main       (equal? dbfname "main.db")) ;; why not (not run-id) ?
	      (server-start-proc (if is-main
				     #f
				     (lambda ()
				       ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname)







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>





<







90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

118
119
120
121
122
123
124
;; NB// area-dat replaced by ttdat
;; 
(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f))
  (assert (or (not run-id) (number? run-id)) "FATAL: run-id is required to be a number or #f")
  (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
  (let* ((areapath      *toppath*) ;; TODO - resolve from dbstruct to be compatible with multiple areas
	 (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*))
	 (testsuite     (common:get-testsuite-name))
	 (dbfname       (conc (dbfile:run-id->dbnum run-id)".db"))
	 (dbdir         (conc areapath "/.mtdb")))
    (if (and (not *journal-stats*)
	     (file-exists? dbdir))
	(tt:start-stats dbdir)) ;; fixme - find the right call to get the db directory
    
    ;; check the load on dbfname and add some delay using a droop curve of sorts
    (if *journal-stats*
	(let* ((stats (tt:get-journal-stats))
	       (load  (or (alist-ref dbfname stats equal?) 0)))
	  (if (> load 0)
	      (let ((dely (* 10 load)))
		(debug:print 0 *default-log-port* "Journal load "load" delaying queries "dely"s.")
		(thread-sleep! dely)))))
	
    (case (rmt:transport-mode)
      ((tcp)
       (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
	      (attemptnum    (+ 1 attemptnum))
	      (mtexe         (common:find-local-megatest))

	      (ttdat         (rmt:set-ttdat areapath ttdat))
	      (conn          (tt:get-conn ttdat dbfname))
	      (is-main       (equal? dbfname "main.db")) ;; why not (not run-id) ?
	      (server-start-proc (if is-main
				     #f
				     (lambda ()
				       ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname)

Modified runs.scm from [0cd899f860] to [dadc9aecb3].

1145
1146
1147
1148
1149
1150
1151
1152












1153
1154
1155
1156
1157
1158
1159
	 (registry-mutex         (runs:dat-registry-mutex runsdat))
	 (flags                  (runs:dat-flags runsdat))
	 (keyvals                (runs:dat-keyvals runsdat))
	 (run-info               (runs:dat-run-info runsdat))
	 (all-tests-registry     (runs:dat-all-tests-registry runsdat))
	 (run-limits-info        (runs:dat-can-run-more-tests runsdat))
	 ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
	 (have-resources         (car run-limits-info))












	 (num-running            (list-ref run-limits-info 1))
	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
	 (max-concurrent-jobs    (list-ref run-limits-info 3))
	 (job-group-limit        (list-ref run-limits-info 4))
	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
	 (fails                  (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs







|
>
>
>
>
>
>
>
>
>
>
>
>







1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
	 (registry-mutex         (runs:dat-registry-mutex runsdat))
	 (flags                  (runs:dat-flags runsdat))
	 (keyvals                (runs:dat-keyvals runsdat))
	 (run-info               (runs:dat-run-info runsdat))
	 (all-tests-registry     (runs:dat-all-tests-registry runsdat))
	 (run-limits-info        (runs:dat-can-run-more-tests runsdat))
	 ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
	 (have-resources         (and (if *journal-stats*
					  (let* ((dbfname (conc
							   (dbfile:run-id->dbnum run-id)
							   ".db"))
						 (stats (tt:get-journal-stats))
						 (load  (or (alist-ref dbfname stats equal?) 0)))
					    (if (> load 0.1) ;; dbs too busy to start more tests
						(begin
						 (debug:print-info 0 *default-log-port* "Gating launch due to db load "load" based on journal file observations for "dbfname)
						 #f)
						#t))
					  #t) ;; if journal monitoring not started do not gate
				      (car run-limits-info)))
	 (num-running            (list-ref run-limits-info 1))
	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
	 (max-concurrent-jobs    (list-ref run-limits-info 3))
	 (job-group-limit        (list-ref run-limits-info 4))
	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
	 (fails                  (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs

Modified tcp-transportmod.scm from [f03b836e05] to [8195ca9d01].

39
40
41
42
43
44
45

46
47
48
49
50
51
52
	  extras
	  hostinfo

	  ports
	  posix
	  files
	  data-structures

	  tcp
	  ))
 (chicken-5
  (import chicken.base
	  chicken.condition
	  chicken.file
	  chicken.pathname







>







39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
	  extras
	  hostinfo

	  ports
	  posix
	  files
	  data-structures
	  directory-utils
	  tcp
	  ))
 (chicken-5
  (import chicken.base
	  chicken.condition
	  chicken.file
	  chicken.pathname
1126
1127
1128
1129
1130
1131
1132
1133



1134













































































































  (sort (get-all-ips) ip-pref-less?))

;; Return the host field of every address-info record found for this
;; machine's host name whose type field equals "tcp".
;; NOTE(review): the comparison is against the string "tcp"; confirm that
;; address-info-type returns a string and not a symbol.
(define (get-all-ips)
  (let* ((all-infos (address-infos (get-host-name)))
         (tcp-infos (filter (lambda (ainfo)
                              (equal? (address-info-type ainfo) "tcp"))
                            all-infos)))
    (map address-info-host tcp-infos)))




)





















































































































>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
  (sort (get-all-ips) ip-pref-less?))

;; Return the host field of every address-info record found for this
;; machine's host name whose type field equals "tcp".
;; NOTE(review): the comparison is against the string "tcp"; confirm that
;; address-info-type returns a string and not a symbol.
(define (get-all-ips)
  (let* ((all-infos (address-infos (get-host-name)))
         (tcp-infos (filter (lambda (ainfo)
                              (equal? (address-info-type ainfo) "tcp"))
                            all-infos)))
    (map address-info-host tcp-infos)))

;;======================================================================
;; Other Utils
;;======================================================================

;; One bucket of journal-file observations.
;;   count  - number of samples recorded in this bucket
;;            (tt:write-load-tracking adds one per call)
;;   jcount - hash table mapping a db file name to the number of samples in
;;            which a journal file for that db was seen
(defstruct jstats
  (count 0)
  (jcount (make-hash-table)) ;; 1.db => journal_count
  )

;; timeblk => jstats
;; Starts out as #f; tt:journal-stats-run replaces it with a hash table when
;; it finds it unset. Keys are (quotient (current-seconds) 10), i.e. one
;; bucket per 10 second block. Other code tests this variable for #f to
;; decide whether journal monitoring is active.
(define *journal-stats* #f) ;; (make-hash-table))

;; Take one sample of the journal files present in dbdir and record it in
;; *journal-stats* (which must already be a hash table).
;;
;; Samples are grouped into buckets keyed by (quotient (current-seconds) 10).
;; On every call:
;;   - the bucket for the current time block is fetched, or created if absent
;;   - buckets whose key is more than 5 blocks behind the current key are dropped
;;   - the bucket's observation count goes up by one
;;   - for every name handed to the callback by directory-fold that matches
;;     "<something>.db-journal...", the hit count for "<something>.db" in this
;;     bucket goes up by one
;;
;; This procedure does no locking itself; tt:journal-stats-run calls it with
;; *journal-stats-mutex* held.
;;
(define (tt:write-load-tracking dbdir)
  (let* ((now    (current-seconds))
         (blk    (inexact->exact (quotient now 10)))
         (cutoff (- blk 5))
         (bucket (if (hash-table-exists? *journal-stats* blk)
                     (hash-table-ref *journal-stats* blk)
                     (let ((fresh (make-jstats)))
                       (hash-table-set! *journal-stats* blk fresh)
                       fresh)))
         (hits   (jstats-jcount bucket))) ;; dbfname => hit count for this bucket
    ;; drop the buckets that have aged out
    (for-each
     (lambda (stale)
       (hash-table-delete! *journal-stats* stale))
     (filter (lambda (k) (< k cutoff))
             (hash-table-keys *journal-stats*)))

    ;; one more observation in this bucket
    (jstats-count-set! bucket (add1 (jstats-count bucket)))

    ;; count each journal file seen against its db file name
    (directory-fold
     (lambda (fname acc)
       (match (string-match "^(.*\\.db)-journal.*" fname)
         ((_ dbfname)
          (hash-table-set! hits dbfname
                           (+ (hash-table-ref/default hits dbfname 0) 1)))
         (else #f)))
     '()
     dbdir)))

;; Serialises access to *journal-stats*: locked by tt:journal-stats-run around
;; each sample and by tt:get-journal-stats while it reads the buckets.
(define *journal-stats-mutex* (make-mutex))

;; Endless sampling loop, intended to run in its own thread (see
;; tt:start-stats). Installs the *journal-stats* hash table if it is not set
;; yet, then repeatedly records one journal sample for dbdir and sleeps for a
;; random interval of 0 to just under 10 seconds.
;;
;; The sample is taken inside dynamic-wind so that *journal-stats-mutex* is
;; always released, even when tt:write-load-tracking raises (e.g. dbdir has
;; become unreadable). Without that, the thread would terminate while owning
;; the mutex and readers such as tt:get-journal-stats would then hit an
;; abandoned mutex. The exception itself is not swallowed: it still ends
;; this loop.
;;
(define (tt:journal-stats-run dbdir)
  (if (not *journal-stats*)(set! *journal-stats* (make-hash-table)))
  (let loop ()
    (dynamic-wind
     (lambda () (mutex-lock! *journal-stats-mutex*))
     (lambda () (tt:write-load-tracking dbdir))
     (lambda () (mutex-unlock! *journal-stats-mutex*)))
    ;; randomised delay so samples are not taken at a fixed period
    (thread-sleep! (/ (random 1000) 100.0))
    (loop)))

;; Call this to start a thread that keeps *journal-stats* up to date for the
;; journal files found in dbdir. Returns the started thread.
;;
;; *journal-stats* is initialised here, before the thread is started, rather
;; than only when the thread first runs. Callers use (not *journal-stats*) as
;; their "not started yet" test; if the table only appeared once the new
;; thread got scheduled, a second call arriving in between would start a
;; second sampler thread.
;;
(define (tt:start-stats dbdir)
  (if (not *journal-stats*)
      (set! *journal-stats* (make-hash-table)))
  (thread-start!
   (make-thread
    (lambda ()(tt:journal-stats-run dbdir)) "Journal stats collection thread")))

;; Summarise the journal observations held in *journal-stats*.
;;
;; Returns an alist of (dbfname . load), one entry per db file name for which
;; a journal file was seen in any retained bucket. load is the number of
;; samples in which the journal was seen divided by the total number of
;; samples across all buckets, returned as an inexact number.
;;
;; When *journal-stats* is not set, a message is logged and the empty list is
;; returned.
;;
(define (tt:get-journal-stats)
  (let* ((result    (make-jstats))
         (hitcounts (jstats-jcount result)))
    (if *journal-stats*
        (begin
          (mutex-lock! *journal-stats-mutex*)
          ;; fold every bucket into result: sum the sample counts and the
          ;; per-db hit counts
          (hash-table-for-each
           *journal-stats*
           (lambda (k v) ;; time block key, jstats
             (jstats-count-set! result
                                (+ (jstats-count result)
                                   (jstats-count v)))
             (hash-table-for-each
              (jstats-jcount v) ;; dbfname => hit count
              (lambda (dbfname hit-count)
                (hash-table-set! hitcounts dbfname
                                 (+ hit-count
                                    (hash-table-ref/default hitcounts dbfname 0)))))))
          (mutex-unlock! *journal-stats-mutex*))
        (debug:print 0 *default-log-port* "INFO: *journal-stats* not set."))
    ;; convert to normalized alist
    ;; The divisor must be at least 1 to avoid dividing by zero, hence max.
    ;; (min would clamp the divisor to 1 and hand back raw hit counts.)
    (let ((tot  (max (jstats-count result) 1))
          (hits (jstats-jcount result))) ;; 1.db => count
      (hash-table-map
       hits
       (lambda (fname hitcount)
         ;; force a real so callers can sleep on / print the value directly
         (cons fname (exact->inexact (/ hitcount tot))))))))

;; megatest> (import tcp-transportmod)
;; megatest> (tt:write-load-tracking ".mtdb")
;; megatest> (hash-table-keys *journal-stats*)
;; (172060297)
;; megatest> (jstats->alist (hash-table-ref *journal-stats* 172060297))
;; ((count . 1) (jcount . #<hash-table (1)>))
;; megatest> (jstats-jcount (hash-table-ref *journal-stats* 172060297))
;; #<hash-table (1)>
;; megatest> (hash-table->alist (jstats-jcount (hash-table-ref *journal-stats* 172060297)))
;; (("1.db" . 4))

)