Megatest

Check-in [04d06badd2]
Login
Overview
Comment:CI/CD: Automated commit after successful test, build, and deploy for v1.81-journal-based-throttling
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.81
Files: files | file ages | folders
SHA1: 04d06badd27b533e8f644aedeb6ac3a405d5f1fb
User & Date: fdiskadm on 2024-07-11 15:42:11
Other Links: branch diff | manifest | tags
Context
2024-07-11
16:12
CI/CD: Automated commit after successful test, build, and deploy for v1.81-preq-fail-details check-in: b7cfdb3706 user: fdiskadm tags: v1.81
15:42
CI/CD: Automated commit after successful test, build, and deploy for v1.81-journal-based-throttling check-in: 04d06badd2 user: fdiskadm tags: v1.81
14:40
Improve message for journal load check-in: f5a00d621d user: mrwellan tags: v1.81-journal-based-throttling
2024-07-10
23:42
CI/CD: Automated commit after successful test, build, and deploy for v1.81-server-load-and-cores check-in: 600ba0778d user: fdiskadm tags: v1.81
Changes

Modified dashboard.scm from [e5b27b795a] to [89bc431aeb].

166
167
168
169
170
171
172


173
174
175
176
177
178
179
;;    please-update:        #t
;;    update-mutex:         (make-mutex)
;;    updaters:             (make-hash-table)
;;    updating:             #f
;;    hide-not-hide-tabs:   #f
;;    target:               ""
;;    ))



;;======================================================================
;; buttons color using image
;;======================================================================

(define *images* (make-hash-table))








>
>







166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
;;    please-update:        #t
;;    update-mutex:         (make-mutex)
;;    updaters:             (make-hash-table)
;;    updating:             #f
;;    hide-not-hide-tabs:   #f
;;    target:               ""
;;    ))

(set! *journal-stats-enable* #f)

;;======================================================================
;; buttons color using image
;;======================================================================

(define *images* (make-hash-table))

Added docs/csirc version [ab27eade5c].



































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
;; csi startup script: configures line editing and command history for the
;; interactive REPL.  The two cond-expand branches carry a separate setup
;; for CHICKEN 4 and for CHICKEN 5.
(cond-expand
 (chicken-4
  ;; chicken 4 stuff here
  ;; Load the readline extension, make the REPL read through a readline
  ;; port and register a history file.
  (use readline)
  (current-input-port (make-readline-port))
  ;; NOTE(review): this branch names the history file ".csi.history" while
  ;; the chicken-5 branch below uses ".csi_history" - confirm that the
  ;; difference is intended.
  (install-history-file #f "/.csi.history")
  )
 (chicken-5
  (import (chicken load))
  (import (chicken format))
  (import (chicken process-context))
  (import (chicken process signal))
  (load-verbose #f)
  (let ()
    ;; Everything below is skipped when the INSIDE_EMACS environment
    ;; variable is set.
    (unless (get-environment-variable "INSIDE_EMACS")
      (import breadline)
      (import breadline-scheme-completion)
      ;; History is kept in $HOME/.csi_history.
      (history-file (format "~a/.csi_history" (get-environment-variable "HOME")))
      ;; presumably caps the number of retained history entries - confirm
      ;; against the breadline documentation
      (stifle-history! 10000)
      ;; Completion setup: word-break characters, the Scheme completer and
      ;; the characters treated as quotes.
      (completer-word-break-characters-set! "\"\'`;|(")
      (completer-set! scheme-completer)
      (basic-quote-characters-set! "\"|")
      ;; Turn on matching-paren blinking.
      ;; NOTE(review): the unit of the timeout value is not visible here.
      (variable-bind! "blink-matching-paren" "on")
      (paren-blink-timeout-set! 200000)
      ;; Wrap the SIGINT handler that was installed before this script ran:
      ;; run the two *-after-signal! cleanup calls first, then hand the
      ;; signal on to the previous handler.
      (let ((handler (signal-handler signal/int)))
	(set-signal-handler! signal/int
			     (lambda (s)
			       (cleanup-after-signal!)
			       (reset-after-signal!)
			       (handler s))))
      ;; Register reset-terminal! to run at exit, then switch the REPL
      ;; input over to a readline port.
      (on-exit reset-terminal!)
      (current-input-port (make-readline-port))))
  ))

Modified rmt.scm from [519878889b] to [0cdd3c737a].

90
91
92
93
94
95
96
97














98
99
100
101
102
103
104
105
106
107
108
109
110
;; NB// area-dat replaced by ttdat
;; 
(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f))
  (assert (or (not run-id) (number? run-id)) "FATAL: run-id is required to be a number or #f")
  (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
  (let* ((areapath      *toppath*) ;; TODO - resolve from dbstruct to be compatible with multiple areas
	 (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*))
	 (testsuite     (common:get-testsuite-name)))














    (case (rmt:transport-mode)
      ((tcp)
       (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
	      (attemptnum    (+ 1 attemptnum))
	      (mtexe         (common:find-local-megatest))
	      (dbfname       (conc (dbfile:run-id->dbnum run-id)".db"))
	      (ttdat         (rmt:set-ttdat areapath ttdat))
	      (conn          (tt:get-conn ttdat dbfname))
	      (is-main       (equal? dbfname "main.db")) ;; why not (not run-id) ?
	      (server-start-proc (if is-main
				     #f
				     (lambda ()
				       ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname)







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>





<







90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116

117
118
119
120
121
122
123
;; NB// area-dat replaced by ttdat
;; 
(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f))
  (assert (or (not run-id) (number? run-id)) "FATAL: run-id is required to be a number or #f")
  (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
  (let* ((areapath      *toppath*) ;; TODO - resolve from dbstruct to be compatible with multiple areas
	 (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*))
	 (testsuite     (common:get-testsuite-name))
	 (dbfname       (conc (dbfile:run-id->dbnum run-id)".db"))
	 (dbdir         (conc areapath "/.mtdb")))
    (if (and (not *journal-stats*)
	     (file-exists? dbdir))
	(tt:start-stats dbdir)) ;; fixme - find the right call to get the db directory
    
    ;; check the load on dbfname and add some delay using a droop curve of sorts
    (if *journal-stats*
	(let* ((load  (tt:get-journal-stats dbfname)))
	  (if (> load 0.1) ;; start activating delay at 10% journal load time
	      (let ((dely (* 50 (* load load)))) ;; 100% journal time=50sec delay
		(debug:print 0 *default-log-port* "Journal load "load" on "dbfname" delaying queries "dely"s.")
		(thread-sleep! dely)))))
	
    (case (rmt:transport-mode)
      ((tcp)
       (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
	      (attemptnum    (+ 1 attemptnum))
	      (mtexe         (common:find-local-megatest))

	      (ttdat         (rmt:set-ttdat areapath ttdat))
	      (conn          (tt:get-conn ttdat dbfname))
	      (is-main       (equal? dbfname "main.db")) ;; why not (not run-id) ?
	      (server-start-proc (if is-main
				     #f
				     (lambda ()
				       ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname)

Modified runs.scm from [0cd899f860] to [adfae1025a].

1145
1146
1147
1148
1149
1150
1151
1152













1153
1154
1155
1156
1157
1158
1159
	 (registry-mutex         (runs:dat-registry-mutex runsdat))
	 (flags                  (runs:dat-flags runsdat))
	 (keyvals                (runs:dat-keyvals runsdat))
	 (run-info               (runs:dat-run-info runsdat))
	 (all-tests-registry     (runs:dat-all-tests-registry runsdat))
	 (run-limits-info        (runs:dat-can-run-more-tests runsdat))
	 ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
	 (have-resources         (car run-limits-info))













	 (num-running            (list-ref run-limits-info 1))
	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
	 (max-concurrent-jobs    (list-ref run-limits-info 3))
	 (job-group-limit        (list-ref run-limits-info 4))
	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
	 (fails                  (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs







|
>
>
>
>
>
>
>
>
>
>
>
>
>







1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
	 (registry-mutex         (runs:dat-registry-mutex runsdat))
	 (flags                  (runs:dat-flags runsdat))
	 (keyvals                (runs:dat-keyvals runsdat))
	 (run-info               (runs:dat-run-info runsdat))
	 (all-tests-registry     (runs:dat-all-tests-registry runsdat))
	 (run-limits-info        (runs:dat-can-run-more-tests runsdat))
	 ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
	 (have-resources         (and (if *journal-stats*
					  (let* ((dbfname (conc
							   (dbfile:run-id->dbnum run-id)
							   ".db"))
						 (load (tt:get-journal-stats dbfname)))
					    (if (> load 0.1) ;; dbs too busy to start more tests
						(begin
						  (debug:print-info 0 *default-log-port* "Gating launch due to db load "load" based on journal file observations for "dbfname)
						 #f)
						#t))
					  (begin
					    (debug:print-info 0 *default-log-port* "Journal gating not started for "run-id)
					    #t)) ;; if journal monitoring not started do not gate
				      (car run-limits-info)))
	 (num-running            (list-ref run-limits-info 1))
	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
	 (max-concurrent-jobs    (list-ref run-limits-info 3))
	 (job-group-limit        (list-ref run-limits-info 4))
	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
	 (fails                  (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
		      (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW
			  (if (null? tal)
			      #f ;; yes, really
			      (list (car tal)(cdr tal) reg reruns))
			  (begin
			    (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60)
				(debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))
			    (let* ((test-id      (rmt:get-test-id run-id testname item-path))
				   (test-info    (rmt:get-testinfo-state-status run-id test-id)) ;; we need *current* info
				   (status       (db:test-status test-info)))
			      (if (equal? status "KEEP_TRYING")
				  (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f)
				  (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)))
			    (hash-table-set! test-registry hed 'removed) ;; was 0
                            (if (not (and (null? reg) (null? tal)))







|







1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
		      (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW
			  (if (null? tal)
			      #f ;; yes, really
			      (list (car tal)(cdr tal) reg reruns))
			  (begin
			    (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60)
				(debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))
			    (let* ((test-id      (rmt:get-test-id run-id hed item-path))
				   (test-info    (rmt:get-testinfo-state-status run-id test-id)) ;; we need *current* info
				   (status       (db:test-status test-info)))
			      (if (equal? status "KEEP_TRYING")
				  (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f)
				  (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)))
			    (hash-table-set! test-registry hed 'removed) ;; was 0
                            (if (not (and (null? reg) (null? tal)))

Modified tcp-transportmod.scm from [c9c309998d] to [d0258d10eb].

39
40
41
42
43
44
45

46
47
48
49
50
51
52
	  extras
	  hostinfo

	  ports
	  posix
	  files
	  data-structures

	  tcp
	  ))
 (chicken-5
  (import chicken.base
	  chicken.condition
	  chicken.file
	  chicken.pathname







>







39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
	  extras
	  hostinfo

	  ports
	  posix
	  files
	  data-structures
	  directory-utils
	  tcp
	  ))
 (chicken-5
  (import chicken.base
	  chicken.condition
	  chicken.file
	  chicken.pathname
1126
1127
1128
1129
1130
1131
1132
1133



1134



















































































































  (sort (get-all-ips) ip-pref-less?))

(define (get-all-ips)
  (map address-info-host
       (filter (lambda (x)
		 (equal? (address-info-type x) "tcp"))
	       (address-infos (get-host-name)))))




)



























































































































>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
  (sort (get-all-ips) ip-pref-less?))

;; Return the host field of every address-info record reported for this
;; machine's host name whose type compares equal? to "tcp".
(define (get-all-ips)
  (let* ((infos     (address-infos (get-host-name)))
         (tcp-infos (filter (lambda (info)
                              (equal? (address-info-type info) "tcp"))
                            infos)))
    (map address-info-host tcp-infos)))

;;======================================================================
;; Other Utils
;;======================================================================

;; One bucket of journal observations.
;;   count  - number of sampling passes recorded in this bucket
;;            (tt:write-load-tracking adds 1 per call)
;;   jcount - hash table: db file name (e.g. "1.db") => number of matching
;;            journal directory entries seen for that db
(defstruct jstats
  (count 0)
  (jcount (make-hash-table)) ;; 1.db => journal_count
  )

;; timeblk => jstats
;; Hash table of sample buckets keyed by (quotient (current-seconds) 10).
;; Starts out as #f and is replaced by a hash table once collection starts;
;; code elsewhere tests it for #f to tell whether collection is running.
(define *journal-stats* #f) ;; (make-hash-table))
;; Switch checked by both the sampler and the reader of the stats.
(define *journal-stats-enable* #t) ;; change to #f to turn off

;; Random-sampling (monte-carlo style) observation of journal files.
;;
;; Takes ONE sample of dbdir and records it in *journal-stats*.  Samples are
;; grouped into buckets keyed by (quotient (current-seconds) 10).  A call:
;;
;;   1. fetches, or creates and registers, the jstats record for the
;;      current bucket,
;;   2. deletes every bucket whose key is smaller than (current key - 5),
;;   3. adds 1 to the bucket's observation count,
;;   4. for each entry of dbdir whose name matches "<x>.db-journal...",
;;      adds 1.0 to the bucket's hit count for "<x>.db".
;;
;; Does nothing when *journal-stats-enable* is #f.  *journal-stats* must
;; already hold a hash table when this is called.
;;
(define (tt:write-load-tracking dbdir)
  (if *journal-stats-enable*
      (let* ((now    (current-seconds))
             (bucket (inexact->exact (quotient now 10)))
             (cutoff (- bucket 5))
             ;; stored values are always jstats records, so #f means "absent"
             (sample (or (hash-table-ref/default *journal-stats* bucket #f)
                         (let ((fresh (make-jstats)))
                           (hash-table-set! *journal-stats* bucket fresh)
                           fresh)))
             (hits   (jstats-jcount sample)))

        ;; expire buckets that have fallen out of the window
        (for-each
         (lambda (k)
           (if (< k cutoff)
               (hash-table-delete! *journal-stats* k)))
         (hash-table-keys *journal-stats*))

        ;; one more observation pass in this bucket
        (jstats-count-set! sample (+ (jstats-count sample) 1))

        ;; one hit per journal entry found, keyed by the db file it belongs to
        (directory-fold
         (lambda (fname acc)
           (let ((m (string-match "^(.*\\.db)-journal.*" fname)))
             (and m
                  (let ((dbfname (cadr m)))
                    (hash-table-set! hits dbfname
                                     (+ (hash-table-ref/default hits dbfname 0) 1.0))))))
         '()
         dbdir))))

;; Guards *journal-stats*: held by the sampler while it updates the table
;; and by tt:get-journal-stats while it reads it.
(define *journal-stats-mutex* (make-mutex))

;; Body of the journal stats collection thread; never returns.
;;
;;   dbdir - directory that is scanned for journal files
;;
;; Creates the *journal-stats* table if it does not exist yet, then loops
;; forever: take one sample under the mutex, sleep a random 0 to 9.99
;; seconds, repeat.
;;
;; A failing sample (for example dbdir has become unreadable) is logged and
;; skipped.  Letting the exception escape would end the thread while it
;; still holds the mutex, and old samples would then never be aged out.
;;
(define (tt:journal-stats-run dbdir)
  (if (not *journal-stats*) (set! *journal-stats* (make-hash-table)))
  (let loop ()
    (mutex-lock! *journal-stats-mutex*)
    ;; the handler swallows the error so the unlock below is always reached
    (handle-exceptions
     exn
     (debug:print 0 *default-log-port* "WARNING: journal stats sampling of " dbdir " failed, will retry.")
     (tt:write-load-tracking dbdir))
    (mutex-unlock! *journal-stats-mutex*)
    (thread-sleep! (/ (random 1000) 100.0))
    (loop)))

;; Start the background thread that keeps *journal-stats* up to date.
;;
;;   dbdir - directory that the thread scans for journal files
;;
;; Returns the started thread.
;;
;; The stats table is created here, before the thread is started, because
;; callers decide whether to call this by testing *journal-stats* for #f.
;; Creating it only inside the new thread would leave a window in which a
;; second caller still sees #f and starts another collector.
;;
(define (tt:start-stats dbdir)
  (if (not *journal-stats*) (set! *journal-stats* (make-hash-table)))
  (thread-start!
   (make-thread
    (lambda () (tt:journal-stats-run dbdir))
    "Journal stats collection thread")))

;; Summarise the collected journal samples as a per-db load figure.
;;
;; For each db file name seen in the retained buckets the load is
;;   (sum of journal hits for that db) / (sum of observation counts)
;; with the divisor clamped to at least 1.
;;
;;   dbfname - optional db file name such as "1.db"
;;
;; Returns the load for dbfname (0 if it was never seen) when dbfname is
;; given, otherwise an alist of (dbfname . load) for every db seen.
;;
;; No samples are read when collection is switched off through
;; *journal-stats-enable* or has not been started, so the result is then
;; 0 or '().  Only the "not started" case is logged: being switched off is
;; a deliberate setting and would otherwise produce a misleading message
;; on every call.
;;
(define (tt:get-journal-stats #!optional (dbfname #f))
  (let* ((result    (make-jstats))
         (hitcounts (jstats-jcount result)))
    (cond
     ((not *journal-stats-enable*)
      #f) ;; switched off on purpose: nothing to aggregate, nothing to report
     ((not *journal-stats*)
      (debug:print 0 *default-log-port* "INFO: *journal-stats* not set."))
     (else
      (mutex-lock! *journal-stats-mutex*)
      (hash-table-for-each
       *journal-stats*
       (lambda (k v) ;; key jstats
         ;; total number of observation passes over all buckets
         (jstats-count-set! result
                            (+ (jstats-count result)
                               (jstats-count v)))
         ;; per-db hit totals over all buckets
         (hash-table-for-each
          (jstats-jcount v) ;; dbfname => hit count
          (lambda (fname hit-count)
            (hash-table-set! hitcounts fname
                             (+ hit-count
                                (hash-table-ref/default hitcounts fname 0)))))))
      (mutex-unlock! *journal-stats-mutex*)))
    ;; normalise hit totals by the number of observations
    (let ((tot (max (jstats-count result) 1))) ;; avoid divide by zero
      (if dbfname
          ;; single lookup: no need to build the whole alist
          (/ (hash-table-ref/default hitcounts dbfname 0) tot)
          (hash-table-map
           hitcounts
           (lambda (fname hitcount)
             (cons fname (/ hitcount tot))))))))

;; megatest> (import tcp-transportmod)
;; megatest> (tt:write-load-tracking ".mtdb")
;; megatest> (hash-table-keys *journal-stats*)
;; (172060297)
;; megatest> (jstats->alist (hash-table-ref *journal-stats* 172060297))
;; ((count . 1) (jcount . #<hash-table (1)>))
;; megatest> (jstats-jcount (hash-table-ref *journal-stats* 172060297))
;; #<hash-table (1)>
;; megatest> (hash-table->alist (jstats-jcount (hash-table-ref *journal-stats* 172060297)))
;; (("1.db" . 4))

)

Added utils/setcicd version [bb9f0e8cef].





















>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
#!/bin/bash
#
# Flag the current Fossil branch as ready to merge by storing the text
# "ready to merge" in the wiki page "<branch>_cicd".  The page is created
# when it does not exist yet and updated otherwise.
#
# Exit status: that of the final fossil wiki command, or 1 when the branch
# or the staging file cannot be set up.

# Stop here rather than write to a page literally named "_cicd" when the
# branch cannot be determined (e.g. not run inside a checkout).
branch=$(fossil branch current) || exit 1
if [ -z "$branch" ]; then
    echo "setcicd: cannot determine the current fossil branch" >&2
    exit 1
fi
wikiname="${branch}_cicd"

# Stage the page text in a temp file so nothing is left behind in the
# working directory and branch names containing '/' still work.
content=$(mktemp) || exit 1
trap 'rm -f "$content"' EXIT
echo "ready to merge" > "$content"

# "wiki export" is used only as an existence check for the page; the text
# it would print is discarded.
if fossil wiki export "$wikiname" > /dev/null 2>&1; then
    fossil wiki commit "$wikiname" "$content"
else
    fossil wiki create "$wikiname" "$content"
fi