Overview
Comment: | Proper calling of exit cleanup for servers. Go back to nbfake for running servers (proper logs kept). Remove .servinfo file for a server that does not respond to ping (returns #f). |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.80-revolution |
Files: | files | file ages | folders |
SHA1: |
2725343c921c38fcc631f342252ca32b |
User & Date: | matt on 2023-12-03 02:16:07 |
Other Links: | branch diff | manifest | tags |
Context
2023-12-03
| ||
17:49 | Added todo items check-in: d4f2f2c1ef user: matt tags: v1.80-revolution, v1.8023 | |
02:16 | Proper calling of exit cleanup for servers. Go back to nbfake for running servers (proper logs kept). Remove .servinfo file for a server that does not respond to ping (returns #f). check-in: 2725343c92 user: matt tags: v1.80-revolution | |
2023-11-28
| ||
13:45 | Bypass all the mutexes in dashboard. It seems to help with performance quite a bit. check-in: f4844a3801 user: mrwellan tags: v1.80-revolution, v1.8022 | |
Changes
Modified TODO from [d1855eb2e9] to [4fe902f98b].
︙ | ︙ | |||
14 15 16 17 18 19 20 21 22 23 24 25 26 27 | # # You should have received a copy of the GNU General Public License # along with Megatest. If not, see <http://www.gnu.org/licenses/>. TODO ==== 23WW47 . Finding server .. look at .servinfo for likely prime main .. ask the .servinfo prime main for real prime main .. save prime main (for how long, 10 seconds or 10 minutes?) . Starting prime main | > > > > > > > | 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | # # You should have received a copy of the GNU General Public License # along with Megatest. If not, see <http://www.gnu.org/licenses/>. TODO ==== 23WW48 . Allow two or three servers to run for any given db . Update avg call count/sec every 30 sec in no-sync . get server uses no-sync process info to decide which server to suggest . Use process table to decide who will do sync back . Fix metadat being synced over and over 23WW47 . Finding server .. look at .servinfo for likely prime main .. ask the .servinfo prime main for real prime main .. save prime main (for how long, 10 seconds or 10 minutes?) . Starting prime main |
︙ | ︙ |
Modified server.scm from [39953c681c] to [8ac9dab770].
︙ | ︙ | |||
726 727 728 729 730 731 732 | ;; (set! *db-last-access* (current-seconds)) ;; might not be needed. ;; (if (equal? *toppath* toppath) ;; #t ;; #f))) ;; timeout is hms string: 1h 5m 3s, default is 1 minute ;; This is currently broken. Just use the number of hours with no unit. | | | 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 | ;; (set! *db-last-access* (current-seconds)) ;; might not be needed. ;; (if (equal? *toppath* toppath) ;; #t ;; #f))) ;; timeout is hms string: 1h 5m 3s, default is 1 minute ;; This is currently broken. Just use the number of hours with no unit. ;; Default is 600 seconds. ;; (define (server:expiration-timeout) (let* ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (string? tmo) (let* ((num (string->number tmo))) (if num (* 3600 num) |
︙ | ︙ |
Modified tcp-transportmod.scm from [fb9929d164] to [5cef09f36d].
︙ | ︙ | |||
183 184 185 186 187 188 189 190 191 192 193 194 195 | conn) ((starting) (thread-sleep! 0.5) (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)) (else (let* ((curr-secs (current-seconds))) ;; rm the (last server) would go here (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin (tt-last-serv-start-set! ttdat curr-secs) (server-start-proc))) ;; start server if 10 sec since last attempt (thread-sleep! 1) | > > > > > > > > > | | 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 | conn) ((starting) (thread-sleep! 0.5) (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)) (else (let* ((curr-secs (current-seconds))) (if (not ping-res) ;; the server is actually dead, remove the .servinfo file (begin (debug:print-info 0 *default-log-port* "Unreachable server at " host":"port" with servinfo file "servinffile", removing it") (if (file-exists? servinffile) (handle-exceptions exn #f (delete-file servinffile))))) ;; rm the (last server) would go here (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin (tt-last-serv-start-set! ttdat curr-secs) (server-start-proc))) ;; start server if 10 sec since last attempt (thread-sleep! 1) (debug:print-info 0 *default-log-port* "server ping result was "ping-res" neither running nor starting. Retrying connect") (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (else ;; no good server found, if haven't started server in > 5 secs, start another (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin (debug:print-info 0 *default-log-port* "Starting server for "dbfname) (server-start-proc) (tt-last-serv-start-set! ttdat (current-seconds)) |
︙ | ︙ | |||
564 565 566 567 568 569 570 571 572 573 574 575 576 577 | nosyncdbpath (lambda (nsdb) (dbfile:insert-or-update-process nsdb *procinf*))) (thread-start! run-thread) (thread-join! run-thread) ;; run thread will exit on timeout or other conditions ;; replace with call to (dbfile:set-process-done nsdb host pid reason) (procinf-status-set! *procinf* "done") (procinf-end-set! *procinf* (current-seconds)) ;; either convert this to use set-process-done or get rid of set-process-done (dbfile:with-no-sync-db nosyncdbpath (lambda (nsdb) | > > | 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 | nosyncdbpath (lambda (nsdb) (dbfile:insert-or-update-process nsdb *procinf*))) (thread-start! run-thread) (thread-join! run-thread) ;; run thread will exit on timeout or other conditions ;; (tcp-close (tt-socket ttdat)) ;; close up ports here ;; replace with call to (dbfile:set-process-done nsdb host pid reason) (procinf-status-set! *procinf* "done") (procinf-end-set! *procinf* (current-seconds)) ;; either convert this to use set-process-done or get rid of set-process-done (dbfile:with-no-sync-db nosyncdbpath (lambda (nsdb) |
︙ | ︙ | |||
627 628 629 630 631 632 633 | ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) (begin (thread-sleep! 5) (loop))))) | | < | 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 | ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) (begin (thread-sleep! 5) (loop))))) (tt:shutdown-server ttdat) (debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running."))) (define (tt:shutdown-server ttdat) (let* ((host (tt-host ttdat)) (port (tt-port ttdat)) (sinf (tt-servinf-file ttdat))) (tt-state-set! ttdat 'shutdown) (portlogger:open-run-close portlogger:set-port port "released") (if (file-exists? sinf) (delete-file* sinf)) )) ;; return servid ;; side-effects: ;; ttdat-cleanup-proc is populated with function to remove the serverinfo file (define (tt:create-server-registration-file ttdat dbfname) (let* ((areapath (tt-areapath ttdat)) |
︙ | ︙ | |||
770 771 772 773 774 775 776 | (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") (thread-sleep! 1) #f) (else (if (not (file-exists? (conc areapath"/logs"))) (create-directory (conc areapath"/logs") #t)) (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) | | | > > > > > | | 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 | (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") (thread-sleep! 1) #f) (else (if (not (file-exists? (conc areapath"/logs"))) (create-directory (conc areapath"/logs") #t)) (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) (cmdln (conc mtexe " -startdir "areapath " -server - ";; (or target-host "-") " -m testsuite:"testsuite " -db "dbfname ;; (dbmod:run-id->dbfname run-id) " " profile-mode #;(conc " >> " logfile " 2>&1 &")))) ;; we want the remote server to start in *toppath* so push there ;; (push-directory areapath) ;; use cd in the command line instead (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... (setenv "NBFAKE_LOG" logfile) (system (conc "cd "areapath" ; nbfake " cmdln)) (unsetenv "NBFAKE_QUIET") (unsetenv "NBFAKE_LOG") ;; (system cmdln) (hash-table-set! *last-server-start* dbfname (current-seconds)) ;; ;; use below to go back to nbfake - nbfake does cause trouble ... ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... ;; (setenv "NBFAKE_LOG" logfile) ;; (system (conc "cd "areapath" ; nbfake " cmdln)) ;; (unsetenv "NBFAKE_QUIET") ;; (unsetenv "NBFAKE_LOG") |
︙ | ︙ |