Overview
Comment: | Servers consolidated on machine where main.db started |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.80-revolution |
Files: | files | file ages | folders |
SHA1: |
9a71fda483ab8cb7c47247910b25cf66 |
User & Date: | mrwellan on 2023-11-15 20:24:10 |
Other Links: | branch diff | manifest | tags |
Context
2023-11-16
| ||
15:49 | Moved sync to inside the server check-in: f249e200e4 user: mrwellan tags: v1.80-revolution | |
2023-11-15
| ||
20:24 | Servers consolidated on machine where main.db started check-in: 9a71fda483 user: mrwellan tags: v1.80-revolution | |
2023-11-14
| ||
16:53 | added comment check-in: ee0ea6396d user: mrwellan tags: v1.80-revolution | |
Changes
Modified api.scm from [5fa313076b] to [5e50d724f1].
︙ | ︙ | |||
322 323 324 325 326 327 328 | ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl ;; SERVERS | | | 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 | ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl ;; SERVERS ((start-server) (apply tt:server-process-run params)) ((kill-server) (set! *server-run* #f)) ;; TESTS ;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items. ((test-set-state-status-by-id) |
︙ | ︙ |
Modified common.scm from [0854266963] to [2e7dd411a7].
︙ | ︙ | |||
180 181 182 183 184 185 186 | (define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg (define *runremote* #f) ;; if set up for server communication this will hold <host port> ;; (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *server-id* #f) (define *server-info* #f) ;; good candidate for easily convert to non-global (define *time-to-exit* #f) | < | 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | (define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg (define *runremote* #f) ;; if set up for server communication this will hold <host port> ;; (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *server-id* #f) (define *server-info* #f) ;; good candidate for easily convert to non-global (define *time-to-exit* #f) (define *run-id* #f) (define *server-kind-run* (make-hash-table)) (define *home-host* #f) ;; (define *total-non-write-delay* 0) (define *heartbeat-mutex* (make-mutex)) ;; (define *api-process-request-count* 0) ;; (define *max-api-process-requests* 0) |
︙ | ︙ |
Modified rmt.scm from [418c23e0f4] to [ca8b91d9f3].
︙ | ︙ | |||
68 69 70 71 72 73 74 | ;;====================================================================== (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; how to make area-dat (define (rmt:set-ttdat areapath ttdat) | | > | < | | > > > > > > | | | > > > > > > > > > > | 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | ;;====================================================================== (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; how to make area-dat (define (rmt:set-ttdat areapath ttdat) (if ttdat ttdat (let* ((newremote (make-and-init-remote areapath))) (set! *ttdat* newremote) newremote))) ;; NB// area-dat replaced by ttdat ;; (define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f)) (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas (attemptnum (+ 1 attemptnum)) (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*)) (testsuite (common:get-testsuite-name)) (mtexe (common:find-local-megatest)) (dbfname (conc (dbfile:run-id->dbnum run-id)".db")) (ttdat (rmt:set-ttdat areapath ttdat)) (conn (tt:get-conn ttdat dbfname)) (is-main (equal? dbfname "main.db")) ;; why not (not run-id) ? (server-start-proc (if is-main #f (lambda () ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname) (rmt:start-server ;; tt:server-process-run areapath testsuite ;; (dbfile:testsuite-name) mtexe run-id))))) ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it ;; and if there is no conn we first send a request to the main.db server to start a ;; server for the dbfname. #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request (begin (server-start-proc) (thread-sleep! 1))) (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))) ;; KEEP THIS HERE FOR A WHILE, WE MAY WANT TO RESURECT IT ;; (define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) ;; (let* ((keys (common:get-fields *configdat*)) ;; (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard"))) ;; (api:dispatch-request dbstruct cmd run-id params))) |
︙ | ︙ | |||
189 190 191 192 193 194 195 | ;;====================================================================== ;; S E R V E R ;;====================================================================== (define (rmt:kill-server run-id) (rmt:send-receive 'kill-server run-id (list run-id))) | | | | | | 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 | ;;====================================================================== ;; S E R V E R ;;====================================================================== (define (rmt:kill-server run-id) (rmt:send-receive 'kill-server run-id (list run-id))) (define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server (rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id))) ;;====================================================================== ;; M I S C ;;====================================================================== (define (rmt:login run-id) (rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature)))) ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; ;; (define (rmt:login-no-auto-client-setup runremote) ;; (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature)))) ;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host (define (rmt:get-latest-host-load hostname) (rmt:send-receive 'get-latest-host-load #f (list hostname))) (define (rmt:sdb-qry qry val run-id) ;; add caching if qry is 'getid or 'getstr (rmt:send-receive 'sdb-qry run-id (list qry val))) ;; NOT COMPLETED (define (rmt:runtests user run-id testpatt params) |
︙ | ︙ |
Modified tcp-transportmod.scm from [b05e8dad0b] to [ff1d2811a4].
︙ | ︙ | |||
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | ;; parameters ;; (define tt-server-timeout-param (make-parameter 600)) ;; make ttdat visible (define *server-info* #f) (define (tt:make-remote areapath) (make-tt areapath: areapath)) ;; 1 ... or #f ;; and check that dbfname matches. FIXME: the propagation of dbfname and run-id ;; might not make the best sense ;; (define (tt:valid-run-id run-id dbfname) (and (or (number? run-id) (not run-id)) (equal? (dbfile:run-id->dbfname run-id) dbfname))) (tcp-buffer-size 2048) | > | > > > | | | | | | | | | | | < | 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | ;; parameters ;; (define tt-server-timeout-param (make-parameter 600)) ;; make ttdat visible (define *server-info* #f) (define *server-run* #t) (define (tt:make-remote areapath) (make-tt areapath: areapath)) ;; 1 ... or #f ;; and check that dbfname matches. FIXME: the propagation of dbfname and run-id ;; might not make the best sense ;; (define (tt:valid-run-id run-id dbfname) (and (or (number? run-id) (not run-id)) (equal? (dbfile:run-id->dbfname run-id) dbfname))) (tcp-buffer-size 2048) ;; (max-connections 4096) (define (tt:get-conn ttdat dbfname) (hash-table-ref/default (tt-conns ttdat) dbfname #f)) ;; do all the busy work of finding and setting up conn for ;; connecting to a server ;; (define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc) (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id) (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id) (let* ((conn (tt:get-conn ttdat dbfname)) (server-start-proc (or server-start-proc (lambda () (assert (equal? dbfname "main.db") ;; only main.db is started here "FATAL: called server-start-proc for db other than main.db") (tt:server-process-run (tt-areapath ttdat) testsuite ;; (dbfile:testsuite-name) (common:find-local-megatest) run-id))))) (if conn (begin (debug:print-info 2 *default-log-port* "already connected to a server") conn) ;; we are already connected to the server (let* ((sdat (tt:get-current-server-info ttdat dbfname))) (match sdat ((host port start-time server-id pid dbfname2 servinffile) |
︙ | ︙ | |||
174 175 176 177 178 179 180 | ((running) (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table") (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good? conn) ((starting) (thread-sleep! 0.5) (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") | | | | | 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 | ((running) (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table") (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good? conn) ((starting) (thread-sleep! 0.5) (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)) (else (let* ((curr-secs (current-seconds))) ;; rm the (last server) would go here (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin (tt-last-serv-start-set! ttdat curr-secs) (server-start-proc))) ;; start server if 10 sec since last attempt (thread-sleep! 1) (debug:print-info 2 *default-log-port* "server ping result was neither running nor starting. Retrying connect") (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (else ;; no good server found, if haven't started server in > 5 secs, start another (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin (debug:print-info 0 *default-log-port* "Starting server for "dbfname) (server-start-proc) (tt-last-serv-start-set! ttdat (current-seconds)) (thread-sleep! 3) )) (thread-sleep! 1) (debug:print-info 0 *default-log-port* "Connect to server for " dbfname) (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) |
︙ | ︙ | |||
231 232 233 234 235 236 237 | ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res) (try-again))))) ;; client side handler ;; ;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; | | | | 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 | ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res) (try-again))))) ;; client side handler ;; ;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; (define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc) ;; connect-to-server will start a server if needed. (let* ((areapath (tt-areapath ttdat)) (conn (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) (match res ((status errmsg result meta) (if (list? meta) |
︙ | ︙ | |||
558 559 560 561 562 563 564 565 566 567 568 569 570 571 | (tt:create-server-registration-file ttdat dbfname) ;; now start watching the last-access, if it hasn't been touched ;; in over ten seconds we exit (thread-sleep! 0.05) ;; any real need for delay here? (let loop () (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) (let* ((res (if db-locked-in #t (let* ((lock-result ;; this is the primary lock - need to double verify that got it (dbfile:with-no-sync-db | > > > | 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 | (tt:create-server-registration-file ttdat dbfname) ;; now start watching the last-access, if it hasn't been touched ;; in over ten seconds we exit (thread-sleep! 0.05) ;; any real need for delay here? (let loop () (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond ((not *server-run*) (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.") #f) ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) (let* ((res (if db-locked-in #t (let* ((lock-result ;; this is the primary lock - need to double verify that got it (dbfile:with-no-sync-db |
︙ | ︙ | |||
627 628 629 630 631 632 633 | (cleanup) (exit))) (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (curr-secs (current-seconds))) (if (and (eq? (tt-state ttdat) 'running) (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? | | > | | 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 | (cleanup) (exit))) (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (curr-secs (current-seconds))) (if (and (eq? (tt-state ttdat) 'running) (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? (let* ((sinfo-file (tt-servinf-file ttdat))) ;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file) (set! (file-modification-time sinfo-file) (current-seconds)) ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) (begin (thread-sleep! 5) (loop))))) |
︙ | ︙ | |||
715 716 717 718 719 720 721 | ;; find valid server ;; get servers listed, last part of name must match :<dbfname> ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other :<dbfname> files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) | | > | | > > > > > > | > | 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 | ;; find valid server ;; get servers listed, last part of name must match :<dbfname> ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other :<dbfname> files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) (sfiles (glob (conc servdir"/*:"dbfname))) (goodfiles '())) ;; filter the files here by looking in processes table (if we are not main.db) ;; and or look at the time stamp on the servinfo file, a running server will ;; touch the file every minute (again, this will only apply for main.db) (for-each (lambda (fname) (let* ((age (- (current-seconds)(file-modification-time fname)))) (if (> age 10) ;; can't trust it if over ten seconds old (begin (debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname) (delete-file fname)) (set! goodfiles (cons fname goodfiles))))) sfiles) goodfiles)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;; (define (tt:server-get-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id |
︙ | ︙ | |||
767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 | (string->number pid) dbfname logf)) (else (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat))))))))) ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area (assert areapath "FATAL: tt:server-process-run called without areapath defined.") (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.") (assert mtexe "FATAL: tt:server-process-run called without mtexe defined.") ;; mtest -server - -m testsuite:ext-tests -db 6.db | > > > > > > > | > > | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | > | | | | | | | | | | 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 | (string->number pid) dbfname logf)) (else (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat))))))))) (define *last-server-start* (make-hash-table)) (define (tt:too-recent-server-start dbfname) (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f))) (and last-run-time (< (- (current-seconds) last-run-time) 5)))) ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area (assert areapath "FATAL: tt:server-process-run called without areapath defined.") (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.") (assert mtexe "FATAL: tt:server-process-run called without mtexe defined.") ;; mtest -server - -m testsuite:ext-tests -db 6.db (let* ((dbfname (dbmod:run-id->dbfname run-id))) (if (tt:too-recent-server-start dbfname) #f (let* ((load (get-normalized-cpu-load)) (srvrs (tt:find-server areapath dbfname)) (trying (length srvrs)) (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) (cond ((> load 2.0) (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") (thread-sleep! 1) #f) ((> nrun 100) (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") (thread-sleep! 1) #f) ((> trying 2) (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") (thread-sleep! 1) #f) (else (if (not (file-exists? (conc areapath"/logs"))) (create-directory (conc areapath"/logs") #t)) (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) (cmdln (conc mtexe " -startdir "areapath " -server - ";; (or target-host "-") " -m testsuite:"testsuite " -db "dbfname ;; (dbmod:run-id->dbfname run-id) " " profile-mode (conc " >> " logfile " 2>&1 &")))) ;; we want the remote server to start in *toppath* so push there ;; (push-directory areapath) ;; use cd in the command line instead (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (system cmdln) (hash-table-set! *last-server-start* dbfname (current-seconds)) ;; ;; use below to go back to nbfake - nbfake does cause trouble ... ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... ;; (setenv "NBFAKE_LOG" logfile) ;; (system (conc "cd "areapath" ; nbfake " cmdln)) ;; (unsetenv "NBFAKE_QUIET") ;; (unsetenv "NBFAKE_LOG") ;;(pop-directory) #t))))))) ;;====================================================================== ;; tcp connection stuff ;;====================================================================== ;; find a port and start tcp-server. This only starts the tcp portion of ;; the server, look at (tt:start-server ...) above for the entry point |
︙ | ︙ |