Overview
Comment: | Moving to .servinfo files WIP. Creating the files is working. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | v1.71 |
Files: | files | file ages | folders |
SHA1: |
e1443922b3e27d49240ee5667daa4aa7 |
User & Date: | mmgraham on 2024-06-24 18:42:09 |
Other Links: | branch diff | manifest | tags |
Context
2024-06-24
| ||
18:42 | Moving to .servinfo files WIP. Creating the files is working. Leaf check-in: e1443922b3 user: mmgraham tags: v1.71 | |
2024-05-30
| ||
14:29 | Changed version to 1.7105 check-in: 88e277ab72 user: mmgraham tags: v1.71, v1.7105 | |
Changes
Modified common.scm from [b21afe069a] to [6bed0cbf4e].
︙ | |||
29 30 31 32 33 34 35 | 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | - - + + - + + + | (declare (unit common)) (declare (uses commonmod)) (import commonmod) (include "common_records.scm") |
︙ | |||
679 680 681 682 683 684 685 686 687 688 689 690 691 692 | 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 | + + + + + + | (if dat dat "")))) (define (common:alist-ref/default key alist default) (or (alist-ref key alist) default)) ;; The `common:low-noise-print` function is a utility that can be used to throttle the ;; frequency of certain operations. It does this by tracking the last time an operation was ;; performed and only allowing it again after a specified interval (`waitval`). This can be useful ;; for reducing noise in logs or limiting the rate of user notifications, among other use cases. (define (common:low-noise-print waitval . keys) (let* ((key (string-intersperse (map conc keys) "-" )) (lasttime (hash-table-ref/default *common:denoise* key 0)) (currtime (current-seconds))) (if (> (- currtime lasttime) waitval) (begin (hash-table-set! *common:denoise* key currtime) |
︙ |
Modified http-transport.scm from [d2af089e7c] to [3385ca6732].
︙ | |||
75 76 77 78 79 80 81 | 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | - | (server:get-best-guess-address hostname) #f))) (if ipstr ipstr hostn))) ;; hostname))) (start-port (portlogger:open-run-close portlogger:find-port)) (link-tree-path (common:get-linktree)) (tmp-area (common:get-db-tmp-area)) (start-file (conc tmp-area "/.server-start"))) |
︙ | |||
462 463 464 465 466 467 468 469 470 471 472 473 474 475 | 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 | + + + | (server-log-file (args:get-arg "-log"))) ;; always set when we are a server (handle-exceptions exn (debug:print 0 *default-log-port* "Failed to create " started-file ", exn=" exn) (with-output-to-file started-file (lambda ()(print (current-process-id))))) (debug:print 0 *default-log-port* "Creating servinfo file for " (get-host-name) ":" (cadr *server-info*)) (http:create-server-registration-file *toppath* (get-host-name) (cadr *server-info*)) (let loop ((count 0) (server-state 'available) (bad-sync-count 0) (start-time (current-milliseconds))) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-dbs* |
︙ | |||
533 534 535 536 537 538 539 | 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 | - + + - - + + - - - - - - - - - - - - | (begin (if (not *server-id*) (set! *server-id* (server:mk-signature))) (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") |
︙ | |||
653 654 655 656 657 658 659 660 | 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 | + - + + + + + + + + + + + + + + + + + + + | (args:get-arg "-server") "-") )) "Server run")) (th3 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server monitor thread started") (http-transport:keep-running) "Keep running")))) (thread-start! th2) |
︙ |
Modified megatest.scm from [7456a6bbe1] to [a23040756c].
︙ | |||
957 958 959 960 961 962 963 | 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 | - + - - + - - - - - - - - + + + + + + + + + + + + + + + + + - - + - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + + - - + - - + - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | ;; a specific Megatest area. Detail are being hashed out and this may change. ;; (if (args:get-arg "-adjutant") (begin (adjutant-run) (set! *didsomething* #t))) |
︙ |
Modified server.scm from [d1a731db91] to [8914b0f2d7].
︙ | |||
416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 | 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | (begin (debug:print-info 2 *default-log-port* "Gating server start, last start: " (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) (thread-sleep! ( + 1 idletime)) (server:wait-for-server-start-last-flag areapath))))))) (define (server:get-servinfo-dir areapath) (let* ((spath (conc areapath"/.servinfo"))) (if (not (file-exists? spath)) (create-directory spath #t)) spath)) ;; gets server info and appends path to server file ;; sorts by age, oldest first ;; ;; returns list of (host port startseconds server-id servinfofile) ;; (define (server:get-server-info-sorted areapath dbfname) (let* ( (sfiles (server:find-server areapath dbfname)) (sdats (filter car (map server:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read (sorted (sort sdats (lambda (a b) (let* ((starta (list-ref a 2)) (startb (list-ref b 2))) (if (eq? starta startb) (string>? (list-ref a 3)(list-ref b 3)) ;; if servers started at same time look at server-id (< starta startb)))))) (count 0)) (for-each (lambda (rec) (if (or (> (length sorted) 1) (common:low-noise-print 120 "server info sorted")) (debug:print 2 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", "))) (set! count (+ count 1))) sorted) sorted)) (define (server:clean-up-old areapath) ;; any server file that has not been touched in ten minutes is effectively dead (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) (for-each (lambda (sfile) (let* ((modtime (handle-exceptions exn (begin (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) (current-seconds)) (file-modification-time sfile)))) (if (and (number? modtime) (> (- (current-seconds) modtime) 600)) (begin (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") (handle-exceptions exn (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) (delete-file sfile)))))) sfiles))) (define server-last-start 0) ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; ;; mode: ;; best - get best server (random of newest five) ;; home - get home host based on oldest server ;; info - print info (define (server:choose-server areapath #!optional (mode 'best)) ;; age is current-starttime ;; find oldest alive ;; 1. sort by age ascending and ping until good ;; find alive rand from youngest ;; 1. sort by age descending ;; 2. take five ;; 3. check alive, discard if not and repeat ;; first we clean up old server files (assert (eq? (rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode)) (server:clean-up-old areapath) (let* ((since-last (- (current-seconds) server-last-start)) (server-start-delay 10)) (if ( < (- (current-seconds) server-last-start) 10 ) (begin (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") (thread-sleep! server-start-delay) ) (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) ) ) (let* ((serversdat (server:get-servers-info areapath)) (servkeys (hash-table-keys serversdat)) (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last (sort servkeys ;; list of "host:port" (lambda (a b) (>= (list-ref (hash-table-ref serversdat a) 2) (list-ref (hash-table-ref serversdat b) 2)))) '()))) (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) (if (not (null? by-time-asc)) (let* ((oldest (last by-time-asc)) (oldest-dat (hash-table-ref serversdat oldest)) (host (list-ref oldest-dat 0)) (all-valid (filter (lambda (x) (equal? host (list-ref (hash-table-ref serversdat x) 0))) by-time-asc)) (best-ten (lambda () (if (> (length all-valid) 11) (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out (if (> (length all-valid) 8) (drop-right all-valid 1) all-valid)))) (names->dats (lambda (names) (map (lambda (x) (hash-table-ref serversdat x)) names))) (am-home? (lambda () (let* ((currhost (get-host-name)) (bestadrs (server:get-best-guess-address currhost))) (or (equal? host currhost) (equal? host bestadrs)))))) (case mode ((info) (debug:print 0 *default-log-port* "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) (debug:print 0 *default-log-port* "youngest: "(hash-table-ref serversdat (car all-valid)))) ((home) host) ((homehost) (cons host (am-home?))) ;; shut up old code ((home?) (am-home?)) ((best-ten)(names->dats (best-ten))) ((all-valid)(names->dats all-valid)) ((best) (let* ((best-ten (best-ten)) (len (length best-ten))) (hash-table-ref serversdat (list-ref best-ten (random len))))) ((count)(length all-valid)) (else (debug:print 0 *default-log-port* "ERROR: invalid command "mode) #f))) (begin (server:run areapath) (set! server-last-start (current-seconds)) ;; (thread-sleep! 3) (case mode ((homehost) (cons #f #f)) (else #f)))))) ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; (define (server:get-servers-info areapath) ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) (if (not (file-exists? servinfodir)) (create-directory servinfodir)) (let* ((allfiles (glob (conc servinfodir"/*"))) (res (make-hash-table))) (for-each (lambda (f) (let* ((hostport (pathname-strip-directory f)) (serverdat (server:logf-get-start-info f))) (match serverdat ((host port start server-id pid) (if (and host port start server-id pid) (hash-table-set! res hostport serverdat) (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) (else (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) allfiles) res))) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;; (define (server:server-get-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id (bad-dat (list #f #f #f #f #f #f logf))) (let ((fdat (handle-exceptions exn (begin ;; WARNING: this is potentially dangerous to blanket ignore the errors (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn)) '()) ;; no idea what went wrong, call it a bad server, return empty list (with-input-from-file logf read-lines)))) (if (null? fdat) ;; bad data, return bad-dat bad-dat (let loop ((inl (car fdat)) (tail (cdr fdat)) (lnum 0)) (let ((mlst (string-match server-rx inl))) (if (not mlst) (if (> lnum 500) ;; give up if more than 500 lines of server log read bad-dat (if (null? tail) bad-dat (loop (car tail)(cdr tail)(+ lnum 1)))) (match mlst ;; have a not null list ((_ host port start server-id pid dbfname) (list host (string->number port) (string->number start) server-id (string->number pid) dbfname logf)) (else (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat))))))))) ;; find valid server ;; get servers listed, last part of name must match :<dbfname> ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other :<dbfname> files ;; (define (server:find-server areapath dbfname) (let* ((servdir (server:get-servinfo-dir areapath)) (sfiles (glob (conc servdir"/*:"dbfname))) (good-files '())) (for-each (lambda (sfile) (let* ((sinfo (tt:server-get-info sfile)) (host (list-ref sinfo 0)) (port (list-ref sinfo 1)) (server-id (list-ref sinfo 3)) (pid (list-ref sinfo 4)) (status (system (conc "ssh " host " ps " pid " > /dev/null"))) ) (if (= status 0) (set! good-files (cons sfile good-files)) (delete-file* sfile) ) ) ) sfiles ) (debug:print-info 2 *default-log-port* "server:find-server: good-files: " good-files " sfiles: " sfiles) good-files)) ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least <server idletime> seconds old (server:wait-for-server-start-last-flag areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 25) (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". (common:simple-file-release-lock lock-file))) (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) ;; return servid ;; side-effects: ;; ttdat-cleanup-proc is populated with function to remove the serverinfo file (define (server:create-server-registration-file areapath host port) (let* ( (servdir (server:get-servinfo-dir areapath)) (servinf (conc servdir"/"host":"port"-"(current-process-id))) (serv-id (server:mk-signature areapath)) (clean-proc (lambda () (delete-file* servinf) ))) (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn") (tt-cleanup-proc-set! ttdat clean-proc) (tt-servinf-file-set! ttdat servinf) (with-output-to-file servinf (lambda () (print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)))) serv-id)) ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) |
︙ |