Overview
Comment: | Getting close to gating runs from starting new tests when server load is high. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.81-multi-server |
Files: | files | file ages | folders |
SHA1: | 6a90d15b55723616e5cd53c9083c7cbc |
User & Date: | matt on 2024-07-08 03:00:57 |
Other Links: | branch diff | manifest | tags |
Context
2024-07-08
| ||
06:01 | wip (still broke) check-in: 00c25a6b53 user: matt tags: v1.81-multi-server | |
03:00 | Getting close to gating runs from starting new tests when server load is high. check-in: 6a90d15b55 user: matt tags: v1.81-multi-server | |
2024-07-07
| ||
20:09 | Sort servers based on number of threads running to estimate load check-in: af60709165 user: matt tags: v1.81-multi-server | |
Changes
Modified rmtmod.scm from [bb5d679cbc] to [1cfe9c07c7].
︙ | ︙ | |||
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;;====================================================================== (declare (unit rmtmod)) (declare (uses debugprint)) (declare (uses commonmod)) (declare (uses dbfile)) ;; needed for records ;; (declare (uses apimod)) ;; (declare (uses apimod.import)) ;; (declare (uses ulex)) ;; (include "ulex/ulex.scm") (module rmtmod * (import scheme chicken data-structures extras matchable srfi-69) (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18) (import commonmod dbfile debugprint) ;; (prefix commonmod cmod:)) ;; (import apimod) ;; (import (prefix ulex ulex:)) (include "db_records.scm") (defstruct alldat (areapath #f) | > > > > > > > > > | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;;====================================================================== (declare (unit rmtmod)) (declare (uses debugprint)) ;; (declare (uses debugprint.import)) (declare (uses commonmod)) ;; (declare (uses commonmod.import)) (declare (uses dbfile)) ;; needed for records (declare (uses dbmod)) ;; (declare (uses tcp-transportmod)) ;; (declare (uses tcp-transportmod.import)) ;; (declare (uses apimod)) ;; (declare (uses apimod.import)) ;; (declare (uses ulex)) ;; (include "ulex/ulex.scm") (module rmtmod * (import scheme chicken data-structures extras matchable srfi-69) (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18) (import commonmod dbfile debugprint) ;; (prefix commonmod cmod:)) (import dbmod ;; tcp-transportmod ) ;; (import apimod) ;; (import (prefix ulex ulex:)) (include "db_records.scm") (defstruct alldat (areapath #f) |
︙ | ︙ | |||
303 304 305 306 307 308 309 310 | run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) ;; call end of eud of run detection for posthook - from merge, is it needed? ;; (launch:end-of-run-check run-id) all-ids) ))))) | > > > > > > > > > > > > > > > > > | > | 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 | run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) ;; call end of eud of run detection for posthook - from merge, is it needed? ;; (launch:end-of-run-check run-id) all-ids) ))))) ;;====================================================================== ;; Misc ;;====================================================================== ;; (define (rmtmod:wait-on-server-load run-id ttdat) ;; (let* ((dbfname (dbmod:run-id->dbfname run-id)) ;; (get-lowest-thread-load ;; (lambda () ;; (let* ((sdats (tt:get-server-info-sorted ttdat dbfname))) ;; (car (map tt:get-server-threads sdats)))))) ;; (if ttdat ;; (let loop () ;; (if (> (get-lowest-thread-load) 5) ;; load is pretty high ;; (begin ;; (debug:print 0 *default-log-port* "Servers appear overloaded, waiting...") ;; (thread-sleep! 1) ;; (loop)))) ;; (debug:print 0 *default-log-port* "Can't wait on server load, *ttdat* not set")))) ) |
Modified runs.scm from [c4364e3870] to [0cd899f860].
︙ | ︙ | |||
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | (declare (uses tests)) (declare (uses server)) (declare (uses mt)) (declare (uses archive)) (declare (uses mtargs)) (declare (uses rmtmod)) (declare (uses dbfile)) (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) posix-extras directory-utils pathname-expand typed-records format sxml-serializer sxml-modifications matchable) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "run_records.scm") (include "test_records.scm") ;; (include "debugger.scm") (import commonmod debugprint rmtmod dbfile (prefix mtargs args:)) ;; use this struct to facilitate refactoring ;; (defstruct runs:dat reglen regfull | > > | 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | (declare (uses tests)) (declare (uses server)) (declare (uses mt)) (declare (uses archive)) (declare (uses mtargs)) (declare (uses rmtmod)) (declare (uses dbfile)) (declare (uses tcp-transportmod)) (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) posix-extras directory-utils pathname-expand typed-records format sxml-serializer sxml-modifications matchable) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "run_records.scm") (include "test_records.scm") ;; (include "debugger.scm") (import commonmod debugprint rmtmod dbfile tcp-transportmod (prefix mtargs args:)) ;; use this struct to facilitate refactoring ;; (defstruct runs:dat reglen regfull |
︙ | ︙ | |||
1189 1190 1191 1192 1193 1194 1195 | (if (and (not (common:on-homehost?)) maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload (common:wait-for-homehost-load maxhomehostload | | | > > | > > | 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 | (if (and (not (common:on-homehost?)) maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) ;; lastly lets check the servers are not overloaded by looking at threads (tt:wait-on-server-load run-id *ttdat*) ))) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch? |
︙ | ︙ |
Modified tcp-transportmod.scm from [a6f9fa170f] to [494ffa0754].
︙ | ︙ | |||
283 284 285 286 287 288 289 | (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) ;; returns ( result . ping_time ) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) | | > | | > | > | > > | > > > > > > > > > > > > > > > | 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 | (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) ;; returns ( result . ping_time ) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) ;; host:port => ( meta . when-updated) (define *server-load* (make-hash-table)) (define (tt:save-server-meta host port meta) (hash-table-set! *server-load* (conc host":"port) (cons meta (current-seconds)))) (define (tt:get-server-threads dat) (let* ((host (car dat)) (port (cadr dat)) (dat (tt:get-server-meta host port #t)) (meta (car dat))) (if (list? 
meta) (alist-ref 'sload meta) #f))) ;; lazy get, does not auto-refresh meta, this might be a problem ;; (define (tt:get-server-meta host port #!optional (do-ping #f)) (let* ((get-meta (lambda () (let* ((dat (hash-table-ref/default *server-load* (conc host":"port) #f))) (if dat (car dat) #f)))) (meta (get-meta))) (if (and (not meta) do-ping) (begin (tt:timed-ping host port #f) (get-meta)) meta))) (define (tt:wait-on-server-load run-id ttdat) (let* ((dbfname (dbmod:run-id->dbfname run-id)) (get-lowest-thread-load (lambda () (let* ((sdats (tt:get-server-info-sorted ttdat dbfname))) (car (map tt:get-server-threads sdats)))))) (if ttdat (let loop () (if (> (get-lowest-thread-load) 5) ;; load is pretty high (begin (debug:print 0 *default-log-port* "Servers appear overloaded, waiting...") (thread-sleep! 1) (loop)))) (debug:print 0 *default-log-port* "Can't wait on server load, *ttdat* not set")))) (define (tt:ping host port server-id #!optional (tries-left 5)) (let* ((res (tt:send-receive-direct host port `(ping #f #f #f) ping-mode: #t)) ;; please send me your server-id (try-again (lambda () (if (> tries-left 0) (begin (thread-sleep! 1) (tt:ping host port server-id (- tries-left 1))) |
︙ | ︙ |