Overview
Comment: | Sort servers based on number of threads running to estimate load |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.81-multi-server |
Files: | files | file ages | folders |
SHA1: |
af607091653dce6aebab6eca82a5ef85 |
User & Date: | matt on 2024-07-07 20:09:46 |
Other Links: | branch diff | manifest | tags |
Context
2024-07-08
| ||
03:00 | Getting close on gating runs from starting new tests on server load high. check-in: 6a90d15b55 user: matt tags: v1.81-multi-server | |
2024-07-07
| ||
20:09 | Sort servers based on number of threads running to estimate load check-in: af60709165 user: matt tags: v1.81-multi-server | |
2024-07-03
| ||
19:08 | Missing change check-in: 25cae1c008 user: mrwellan tags: v1.81-multi-server | |
Changes
Modified api.scm from [03e4361c1a] to [0ec121fad4].
︙ | ︙ | |||
271 272 273 274 275 276 277 | ((loaded) ;; (if (eq? (rmt:transport-mode) 'tcp) ;; (thread-sleep! 0.5)) (normal-proc cmd run-id params)) (else (normal-proc cmd run-id params)))) (meta (case cmd | | | 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | ((loaded) ;; (if (eq? (rmt:transport-mode) 'tcp) ;; (thread-sleep! 0.5)) (normal-proc cmd run-id params)) (else (normal-proc cmd run-id params)))) (meta (case cmd ((ping) `((sstate . ,server-state)(sload . ,numthreads))) (else `((wait . ,delay-wait))))) (payload (list status errmsg result meta))) ;; (cmd run-id params meta) (db:add-stats cmd run-id params (- (current-milliseconds) start-t)) payload)) (else (assert #f "FATAL: failed to deserialize indat "indat)))))) |
︙ | ︙ |
Modified tcp-transportmod.scm from [1ca825cede] to [a6f9fa170f].
︙ | ︙ | |||
203 204 205 206 207 208 209 210 211 212 | run-id))))) (if conn (begin (debug:print-info 2 *default-log-port* "already connected to a server for " dbfname) conn) ;; we are already connected to the server ;; no conn (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)) (sdat (if (null? sdats) #f | > > > > > > > > > > | | | | | 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | run-id))))) (if conn (begin (debug:print-info 2 *default-log-port* "already connected to a server for " dbfname) conn) ;; we are already connected to the server ;; no conn ;; find server with lowest number of threads running (i.e. lowest load) ;; (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)) (sdat (if (null? sdats) #f ;; choose server with lowest threads count (car (sort sdats (lambda (a b) (let* ((load-a (tt:get-server-threads a)) (load-b (tt:get-server-threads b))) (< load-a load-b)))))))) ;; (let ((indx (max (random (- (length sdats) 1)) 0))) ;; (list-ref sdats indx))))) (debug:print-info 1 *default-log-port* "found sdat " sdat" from sdats: "sdats) (match sdat ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile) (let* ((host-port (conc host":"port)) (conn (make-tt-conn host: host port: port |
︙ | ︙ | |||
274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 | ;; returns ( result . ping_time ) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) (define (tt:ping host port server-id #!optional (tries-left 5)) (let* ((res (tt:send-receive-direct host port `(ping #f #f #f) ping-mode: #t)) ;; please send me your server-id (try-again (lambda () (if (> tries-left 0) (begin (thread-sleep! 1) (tt:ping host port server-id (- tries-left 1))) #f)))) ;; ;; need two threads, one a 5 second timer ;; (match res ((status errmsg result meta) (if (equal? result server-id) (let* ((server-state (alist-ref 'sstate meta))) ;; (debug:print 0 *default-log-port* "Ping to "host":"port" successful.") (or server-state 'unk)) ;; then we are good (begin | > > > > > > > > > > > > > > > > > > > > > > > > > | | 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 | ;; returns ( result . ping_time ) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) (define *server-load* (make-hash-table)) (define (tt:save-server-meta host port meta) (hash-table-set! *server-load* (conc host":"port) meta)) (define (tt:get-server-threads dat) (let* ((host (car dat)) (port (cadr dat)) (meta (tt:get-server-meta host port #t))) (if (list? meta) (alist-ref 'sload meta) #f))) ;; lazy get, does not auto-refresh meta, this might be a problem ;; (define (tt:get-server-meta host port #!optional (do-ping #f)) (let* ((meta (hash-table-ref/default *server-load* (conc host":"port) #f))) (if (and (not meta) do-ping) (begin (tt:timed-ping host port #f) (hash-table-ref/default *server-load* (conc host":"port) #f)) meta))) (define (tt:ping host port server-id #!optional (tries-left 5)) (let* ((res (tt:send-receive-direct host port `(ping #f #f #f) ping-mode: #t)) ;; please send me your server-id (try-again (lambda () (if (> tries-left 0) (begin (thread-sleep! 1) (tt:ping host port server-id (- tries-left 1))) #f)))) ;; ;; need two threads, one a 5 second timer ;; (match res ((status errmsg result meta) (tt:save-server-meta host port meta) (if (equal? result server-id) (let* ((server-state (alist-ref 'sstate meta))) ;; (debug:print 0 *default-log-port* "Ping to "host":"port" successful.") (or server-state 'unk)) ;; then we are good (begin (if server-id (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result)) #f))) (else ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res) (try-again))))) ;; client side handler ;; |
︙ | ︙ | |||
385 386 387 388 389 390 391 392 393 394 395 396 397 398 | ))))) (begin (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))))) ;; gets server info and appends path to server file ;; sorts by age, --oldest-- now newest first ;; ;; returns list of (host port startseconds server-id servinfofile) ;; (define (tt:get-server-info-sorted ttdat dbfname) (let* ((areapath (tt-areapath ttdat)) (sfiles (tt:find-server areapath dbfname)) (sdats (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read | > > | 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 | ))))) (begin (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))))) ;; gets server info and appends path to server file ;; sorts by age, --oldest-- now newest first ;; ;; move the ping here? ;; ;; returns list of (host port startseconds server-id servinfofile) ;; (define (tt:get-server-info-sorted ttdat dbfname) (let* ((areapath (tt-areapath ttdat)) (sfiles (tt:find-server areapath dbfname)) (sdats (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read |
︙ | ︙ | |||
598 599 600 601 602 603 604 605 606 607 608 609 610 611 | (loop (cdr servrs) prime-host result)) ;; drop ))))) (home-host (if (null? good-srvrs) #f (caar good-srvrs)))) ;; by here we have a trustworthy list of servers and we have removed the .servinfo file for any unresponsive servers ;; and the list is in good-srvrs (cond ((not home-host) ;; no servers yet, go ahead and start (debug:print-info 0 *default-log-port* "No servers yet, starting on "(get-host-name))) ((> (length good-srvrs) 3) ;; don't need more, just exit (debug:print-info 0 *default-log-port* "Have "(length good-srvrs)", no need for more, exiting.") (exit)) ((not (equal? home-host (get-host-name))) ;; there is a home-host and we are not on it | > | 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 | (loop (cdr servrs) prime-host result)) ;; drop ))))) (home-host (if (null? good-srvrs) #f (caar good-srvrs)))) ;; by here we have a trustworthy list of servers and we have removed the .servinfo file for any unresponsive servers ;; and the list is in good-srvrs ;; (cond ((not home-host) ;; no servers yet, go ahead and start (debug:print-info 0 *default-log-port* "No servers yet, starting on "(get-host-name))) ((> (length good-srvrs) 3) ;; don't need more, just exit (debug:print-info 0 *default-log-port* "Have "(length good-srvrs)", no need for more, exiting.") (exit)) ((not (equal? home-host (get-host-name))) ;; there is a home-host and we are not on it |
︙ | ︙ | |||
763 764 765 766 767 768 769 | (debug:print 0 *default-log-port* "WARNING: file "servinf" was not created.") (loop (+ count 1)))))) serv-id)) ;; find valid server ;; get servers listed, last part of name must match :<dbfname> ;; if more than one, wait one second and look again | > > | | 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 | (debug:print 0 *default-log-port* "WARNING: file "servinf" was not created.") (loop (+ count 1)))))) serv-id)) ;; find valid server ;; get servers listed, last part of name must match :<dbfname> ;; if more than one, wait one second and look again ;; ;; NOTE: this only gets the servinfo data, no network activity here ;; i.e. no ping etc. ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) (sfiles (glob (conc servdir"/*:"dbfname))) (goodfiles '())) ;; filter the files here by looking in processes table (if we are not main.db) |
︙ | ︙ |