Megatest

Check-in [6a90d15b55]
Login
Overview
Comment:Getting close on gating runs from starting new tests on server load high.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.81-multi-server
Files: files | file ages | folders
SHA1: 6a90d15b55723616e5cd53c9083c7cbcada22020
User & Date: matt on 2024-07-08 03:00:57
Other Links: branch diff | manifest | tags
Context
2024-07-08
06:01
wip (still broke) check-in: 00c25a6b53 user: matt tags: v1.81-multi-server
03:00
Getting close on gating runs from starting new tests on server load high. check-in: 6a90d15b55 user: matt tags: v1.81-multi-server
2024-07-07
20:09
Sort servers based on number of threads running to estimate load check-in: af60709165 user: matt tags: v1.81-multi-server
Changes

Modified rmtmod.scm from [bb5d679cbc] to [1cfe9c07c7].

16
17
18
19
20
21
22

23

24



25
26
27
28
29
30
31
32
33
34
35
36
37




38
39
40
41
42
43
44
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;;======================================================================

(declare (unit rmtmod))
(declare (uses debugprint))

(declare (uses commonmod))

(declare (uses dbfile))    ;; needed for records




;; (declare (uses apimod))
;; (declare (uses apimod.import))
;; (declare (uses ulex))

;; (include "ulex/ulex.scm")

(module rmtmod
	*
	
(import scheme chicken data-structures extras matchable srfi-69)
(import (prefix sqlite3 sqlite3:) posix typed-records srfi-18)
(import commonmod dbfile debugprint) ;; (prefix commonmod cmod:))




;; (import apimod)
;; (import (prefix ulex ulex:))

(include "db_records.scm")

(defstruct alldat
  (areapath #f)







>

>

>
>
>













>
>
>
>







16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;;======================================================================

(declare (unit rmtmod))
(declare (uses debugprint))
;; (declare (uses debugprint.import))
(declare (uses commonmod))
;; (declare (uses commonmod.import))
(declare (uses dbfile))    ;; needed for records
(declare (uses dbmod))
;; (declare (uses tcp-transportmod))
;; (declare (uses tcp-transportmod.import))

;; (declare (uses apimod))
;; (declare (uses apimod.import))
;; (declare (uses ulex))

;; (include "ulex/ulex.scm")

(module rmtmod
	*
	
(import scheme chicken data-structures extras matchable srfi-69)
(import (prefix sqlite3 sqlite3:) posix typed-records srfi-18)
(import commonmod dbfile debugprint) ;; (prefix commonmod cmod:))
(import dbmod
	;; tcp-transportmod
	)

;; (import apimod)
;; (import (prefix ulex ulex:))

(include "db_records.scm")

(defstruct alldat
  (areapath #f)
303
304
305
306
307
308
309



310














311

			      run-id test-id 'foo "COMPLETED" "DEAD"
			      "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
	     ;; call end of eud of run detection for posthook - from merge, is it needed?
	     ;; (launch:end-of-run-check run-id)
	     all-ids)
	    )))))



















)








>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
			      run-id test-id 'foo "COMPLETED" "DEAD"
			      "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
	     ;; call end of eud of run detection for posthook - from merge, is it needed?
	     ;; (launch:end-of-run-check run-id)
	     all-ids)
	    )))))

;;======================================================================
;; Misc
;;======================================================================

;; (define (rmtmod:wait-on-server-load run-id ttdat)
;;   (let* ((dbfname                 (dbmod:run-id->dbfname run-id))
;; 	 (get-lowest-thread-load
;; 	  (lambda ()
;; 	    (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)))
;; 	      (car (map tt:get-server-threads sdats))))))
;;     (if ttdat
;; 	(let loop ()
;; 	  (if (> (get-lowest-thread-load) 5) ;; load is pretty high
;; 	      (begin
;; 		(debug:print 0 *default-log-port* "Servers appear overloaded, waiting...")
;; 		(thread-sleep! 1)
;; 		(loop))))
;; 	(debug:print 0 *default-log-port* "Can't wait on server load, *ttdat* not set"))))

)

Modified runs.scm from [c4364e3870] to [0cd899f860].

27
28
29
30
31
32
33

34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

53
54
55
56
57
58
59
(declare (uses tests))
(declare (uses server))
(declare (uses mt))
(declare (uses archive))
(declare (uses mtargs))
(declare (uses rmtmod))
(declare (uses dbfile))


(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer
     sxml-modifications matchable)



(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "run_records.scm")
(include "test_records.scm")

;; (include "debugger.scm")

(import commonmod
	debugprint
	rmtmod
	dbfile

	(prefix mtargs args:))

;; use this struct to facilitate refactoring
;;

(defstruct runs:dat
  reglen regfull







>



















>







27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
(declare (uses tests))
(declare (uses server))
(declare (uses mt))
(declare (uses archive))
(declare (uses mtargs))
(declare (uses rmtmod))
(declare (uses dbfile))
(declare (uses tcp-transportmod))

(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer
     sxml-modifications matchable)



(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "run_records.scm")
(include "test_records.scm")

;; (include "debugger.scm")

(import commonmod
	debugprint
	rmtmod
	dbfile
	tcp-transportmod
	(prefix mtargs args:))

;; use this struct to facilitate refactoring
;;

(defstruct runs:dat
  reglen regfull
1189
1190
1191
1192
1193
1194
1195
1196
1197


1198


1199
1200
1201
1202
1203
1204
1205
	   (if (and (not (common:on-homehost?))
		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
	   
	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
	   (if maxhomehostload
	       (common:wait-for-homehost-load maxhomehostload
					      (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))
    


 


    
    (if (and (not (null? prereqs-not-met))
	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
	(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))

    ;; Don't know at this time if the test have been launched at some time in the past
    ;; i.e. is this a re-launch?







|
|
>
>
|
>
>







1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
	   (if (and (not (common:on-homehost?))
		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
	   
	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
	   (if maxhomehostload
	       (common:wait-for-homehost-load maxhomehostload
					      (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))

	   ;; lastly lets check the servers are not overloaded by looking at threads
	   (tt:wait-on-server-load run-id *ttdat*)
	   
	   )))
    
    
    (if (and (not (null? prereqs-not-met))
	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
	(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))

    ;; Don't know at this time if the test have been launched at some time in the past
    ;; i.e. is this a re-launch?

Modified tcp-transportmod.scm from [a6f9fa170f] to [494ffa0754].

283
284
285
286
287
288
289
290

291
292
293
294
295
296
297
298
299

300
301
302
303
304
305
306

307


308
309
310
311
312
313
314















315
316
317
318
319
320
321
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))

;; returns ( result . ping_time )
(define (tt:timed-ping host port server-id)
  (let* ((start-time (current-milliseconds))
	 (result     (tt:ping host port server-id)))
    (cons result (- (current-milliseconds) start-time))))
    

(define *server-load* (make-hash-table))

(define (tt:save-server-meta host port meta)
  (hash-table-set! *server-load* (conc host":"port) meta))

(define (tt:get-server-threads dat)
  (let* ((host (car  dat))
	 (port (cadr dat))
	 (meta (tt:get-server-meta host port #t)))

    (if (list? meta)
	(alist-ref 'sload meta)
	#f)))
 
;; lazy get, does not auto-refresh meta, this might be a problem
;;
(define (tt:get-server-meta host port #!optional (do-ping #f))

  (let* ((meta (hash-table-ref/default *server-load* (conc host":"port) #f)))


    (if (and (not meta)
	     do-ping)
	(begin
	  (tt:timed-ping host port #f)
	  (hash-table-ref/default *server-load* (conc host":"port) #f))
	meta)))
















(define (tt:ping host port server-id #!optional (tries-left 5))
  (let*  ((res      (tt:send-receive-direct host port `(ping #f #f #f) ping-mode: #t)) ;; please send me your server-id
	  (try-again (lambda ()
		       (if (> tries-left 0)
			   (begin
			     (thread-sleep! 1)
			     (tt:ping host port server-id (- tries-left 1)))







|
>



|




|
>



|



>
|
>
>




|


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))

;; returns ( result . ping_time )
(define (tt:timed-ping host port server-id)
  (let* ((start-time (current-milliseconds))
	 (result     (tt:ping host port server-id)))
    (cons result (- (current-milliseconds) start-time))))

;; host:port => ( meta . when-updated)
(define *server-load* (make-hash-table))

(define (tt:save-server-meta host port meta)
  (hash-table-set! *server-load* (conc host":"port) (cons meta (current-seconds))))

(define (tt:get-server-threads dat)
  (let* ((host (car  dat))
	 (port (cadr dat))
	 (dat  (tt:get-server-meta host port #t))
	 (meta (car dat)))
    (if (list? meta)
	(alist-ref 'sload meta)
	#f)))

;; lazy get, does not auto-refresh meta, this might be a problem
;;
(define (tt:get-server-meta host port #!optional (do-ping #f))
  (let* ((get-meta (lambda ()
		     (let* ((dat  (hash-table-ref/default *server-load* (conc host":"port) #f)))
		       (if dat (car dat) #f))))
	 (meta     (get-meta)))
    (if (and (not meta)
	     do-ping)
	(begin
	  (tt:timed-ping host port #f)
	  (get-meta))
	meta)))

(define (tt:wait-on-server-load run-id ttdat)
  (let* ((dbfname                 (dbmod:run-id->dbfname run-id))
	 (get-lowest-thread-load
	  (lambda ()
	    (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)))
	      (car (map tt:get-server-threads sdats))))))
    (if ttdat
	(let loop ()
	  (if (> (get-lowest-thread-load) 5) ;; load is pretty high
	      (begin
		(debug:print 0 *default-log-port* "Servers appear overloaded, waiting...")
		(thread-sleep! 1)
		(loop))))
	(debug:print 0 *default-log-port* "Can't wait on server load, *ttdat* not set"))))

(define (tt:ping host port server-id #!optional (tries-left 5))
  (let*  ((res      (tt:send-receive-direct host port `(ping #f #f #f) ping-mode: #t)) ;; please send me your server-id
	  (try-again (lambda ()
		       (if (> tries-left 0)
			   (begin
			     (thread-sleep! 1)
			     (tt:ping host port server-id (- tries-left 1)))