470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
|
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
|
-
+
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
+
-
-
-
-
+
-
-
-
-
-
+
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
-
-
-
+
+
-
-
-
-
+
+
-
-
-
-
-
+
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
+
-
-
-
+
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
+
+
+
-
-
-
|
#f)
;; start the listener and start responding to requests
;;
;; NOTE: organise by dbfname, not run-id so we don't need
;; to pull in more modules
;;
;; This is the routine called in megatest.scm to start a server
;; This is the routine called in megatest.scm to start a server. NOTE: sequence is different for main.db vs. X.db
;;
;; Server viability is checked in keep-running. Blindly start and run here.
;;
(define (tt:start-server areapath run-id dbfname-in handler keys)
(assert areapath "FATAL: areapath not provided for tt:start-server")
;; is there already a server for this dbfile? Then exit.
(debug:print 2 *default-log-port* "tt:start-server: " dbfname-in)
(let* ((ttdat (make-tt areapath: areapath))
(dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
(dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))))
(servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
(debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname)
(if (> (length servers) 0)
(begin
(debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
(exit))
(let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
(tt-handler-set! ttdat (handler dbstruct))
(let* ((tcp-thread (make-thread
(lambda ()
(tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
"tcp-server-thread"))
(run-thread (make-thread
(lambda ()
(tt:keep-running ttdat dbfname dbstruct)))))
(thread-start! tcp-thread)
(thread-start! run-thread)
(let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
(tt-handler-set! ttdat (handler dbstruct))
(let* ((servinf-created #f)
(tcp-thread (make-thread
(lambda ()
;; NOTE: tt-port and tt-host are set in connect-listener which is called under tt:start-tcp-server
(tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
"tcp-server-thread"))
(run-thread (make-thread
(lambda ()
(tt:keep-running ttdat dbfname dbstruct)))))
(thread-start! tcp-thread)
(let* ((areapath (tt-areapath ttdat))
(nosyncdbpath (conc areapath"/.mtdb"))
(servers ;; (tt:find-server areapath dbfname)))
(tt:get-server-info-sorted ttdat dbfname)) ;; (host port startseconds server-id servinfofile)
(good-srvrs
;; contact servers via ping, if no response remove the .servinfo file
(let loop ((servrs servers)
(prime-host #f)
(result '()))
(if (null? servrs)
(reverse result)
(let* ((servdat (car servrs)))
(match servdat
((host port startseconds server-id servinfofile)
(let* ((ping-res (tt:timed-ping host port server-id))
(good-ping (match ping-res
((result . ping-time)
(not result)) ;; we couldn't reach the server or it was not a megatest server
(else #f))) ;; the ping failed completely?
(same-host (or (not prime-host) ;; i.e. this is the first host
(equal? prime-host host)))
(keep-srv (and good-ping same-host)))
(if keep-srv
(loop (cdr servrs)
host
(cons servdat result))
(begin
(handle-exceptions
exn
(debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile", "
(condition->list exn))
(delete-file* servinfofile))
(loop (cdr servrs) prime-host result)))))
(else
;; can't delete it as we don't have a filename. NOTE: Should really never get here.
(debug:print-info 0 *default-log-port* "ERROR: bad servinfo record \""servdat"\"")
(loop (cdr servrs) prime-host result)) ;; drop
)))))
(home-host (if (null? good-srvrs)
#f
(caar good-srvrs))))
;; by here we have a trustworthy list of servers and we have removed the .servinfo file for any unresponsive servers
;; and the list is in good-srvrs
(cond
((not home-host) ;; no servers yet, go ahead and start
(debug:print-info 0 *default-log-port* "No servers yet, starting on "(get-host-name)))
((> (length good-srvrs) 2) ;; don't need more, just exit
(debug:print-info 0 *default-log-port* "Have "(length good-srvrs)", no need for more, exiting.")
(exit))
((not (equal? home-host (get-host-name))) ;; there is a home-host and we are not on it
(debug:print-info 0 *default-log-port* "Prime main server is on host "home-host", but we are on host "(get-host-name)", exiting.")
(exit))
(else
(debug:print-info 0 *default-log-port* "Starting on host "(get-host-name)", along with "(length good-srvrs)" other servers.")))
(let* ((areapath (tt-areapath ttdat))
(nosyncdbpath (conc areapath"/.mtdb")))
;; this didn't seem to work, is port not available yet?
(let loop ((count 0))
(if (tt-port ttdat)
(begin
(procinf-port-set! *procinf* (tt-port ttdat))
(procinf-dbname-set! *procinf* dbfname)
(dbfile:with-no-sync-db
nosyncdbpath
(lambda (nsdb)
(dbfile:insert-or-update-process nsdb *procinf*))))
(if (< count 10)
(begin
(thread-sleep! 0.25)
(loop (+ count 1)))
(debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set!"))))
(thread-join! run-thread) ;; run thread will exit on timeout or other conditions
;; replace with call to (dbfile:set-process-done nsdb host pid reason)
(procinf-status-set! *procinf* "done")
(procinf-end-set! *procinf* (current-seconds))
;; either convert this to use set-process-done or get rid of set-process-done
(dbfile:with-no-sync-db
nosyncdbpath
(lambda (nsdb)
(dbfile:insert-or-update-process nsdb *procinf*)))
(debug:print 0 *default-log-port* "Exiting now.")
(exit)))))))
;; this didn't seem to work, is port not available yet?
(let loop ((count 0))
(if (tt-port ttdat)
(begin
(procinf-port-set! *procinf* (tt-port ttdat))
(procinf-dbname-set! *procinf* dbfname)
(dbfile:with-no-sync-db
nosyncdbpath
(lambda (nsdb)
(dbfile:insert-or-update-process nsdb *procinf*))))
(if (< count 10)
(begin
(thread-sleep! 0.25)
(loop (+ count 1)))
(begin
(debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set! Exiting.")
(exit)))))
;; create a servinfo file start keep-running
(tt:create-server-registration-file ttdat dbfname)
(procinf-status-set! *procinf* "running")
(dbfile:with-no-sync-db
nosyncdbpath
(lambda (nsdb)
(dbfile:insert-or-update-process nsdb *procinf*)))
(thread-start! run-thread)
(thread-join! run-thread) ;; run thread will exit on timeout or other conditions
;; replace with call to (dbfile:set-process-done nsdb host pid reason)
(procinf-status-set! *procinf* "done")
(procinf-end-set! *procinf* (current-seconds))
;; either convert this to use set-process-done or get rid of set-process-done
(dbfile:with-no-sync-db
nosyncdbpath
(lambda (nsdb)
(dbfile:insert-or-update-process nsdb *procinf*)))
(debug:print 0 *default-log-port* "Exiting now.")
(exit))))))
(define (tt:keep-running ttdat dbfname dbstruct)
;; verfiy conn for ready
;; listener socket has been started by this stage
;; wait for a port before creating the registration file
;;
(let* ((db-locked-in #f)
(areapath (tt-areapath ttdat))
(nosyncdbpath (conc areapath"/.mtdb")))
(set! *server-info* ttdat)
(let loop ((count 0))
(if (> count 240)
(begin
(debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
(exit 1))
(if (not (tt-port ttdat)) ;; no connection yet
(begin
(thread-sleep! 0.25)
(loop (+ count 1))))))
(tt:create-server-registration-file ttdat dbfname)
;; now start watching the last-access, if it hasn't been touched
;; in over ten seconds we exit
(thread-sleep! 0.05) ;; any real need for delay here?
(let loop ()
(let* ((servers (tt:get-server-info-sorted ttdat dbfname))
(ok (cond
((not *server-run*)
(debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.")
#f)
((null? servers) #f) ;; not ok
((equal? (list-ref (car servers) 6) ;; compare the servinfofile
(tt-servinf-file ttdat))
(let* ((res (if db-locked-in
#t
;;
;; let's replace the below "winning" lock method with:
;; 1. create a lock file with pid etc.
;; 2. if there are no other lock files make an entry in the no-sync db
;; 3. gather the lock entries, apply the "winner" heuristic
;; 4. if I'm the winner, set tt-state to 'running else set to 'notthewinner
;;
;; New idea:
;; 1. check all processes entries that match the db
;; 2. sort by fixed heuristic
;; 3. if I'm number one, set state to 'running and db-locked-in to #t
;; at this point the server is running and responding to calls, we just monitor
(let* ((candidates (map dbfile:row->procinf
(dbfile:with-no-sync-db
nosyncdbpath
(lambda (nsdb)
(dbfile:get-process-options nsdb "server" dbfname)))))
(primecand (begin
;; for db calls and exit if there are none.
(assert (not (null? candidates))
"HOW CAN WE NOT BE IN THE PROCESSES DB AS A SERVER?")
(car candidates))))
;; compare primecand with myself
;; if not me check that it is reachable
;; if reachable exit
#f)
;; if I am not in the first 3 servers, exit
(let loop ()
(let* ((servers (tt:get-server-info-sorted ttdat dbfname))
(home-host (if (null? servers)
#f
(caar servers)))
#;(let* ((lock-result ;; this is the primary lock - need to double verify that got it
(dbfile:with-no-sync-db
nosyncdbpath
(lambda (db)
(db:no-sync-lock-and-check db dbfname
(tt-servinf-file ttdat)
(my-index (list-index (lambda (x)
(equal? (list-ref x 6)
(tt-servinf-file ttdat)))
;; (dbr:dbstruct-dbtmpname dbstruct)
))))
(success (car lock-result)))
servers))
(ok (cond
(if success
(begin
(tt-state-set! ttdat 'running)
(debug:print 0 *default-log-port* "Got server lock for " dbfname)
((not *server-run*)
(debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.")
(set! db-locked-in #t)
#t)
(begin
(debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
#f))))))
#f)
(if (and res (common:low-noise-print 120 "top server message"))
(debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
res))
(else
;; wrong servinfo file
(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
(let* ((leadsrv (car servers)))
((null? servers)
(match leadsrv
((host port startseconds server-id pid dbfname servinfofile)
(let* ((result (tt:timed-ping host port server-id))
(res (car result))
(ping (cdr result)))
(debug:print-info 0 *default-log-port* "Ping to "host":"port", with server-id "server-id
(debug:print 0 *default-log-port* "WARNING: no servinfo files found, this cannot be.")
", and file "servinfofile" returned "res)
(if res
#f ;; not the server, but all good, want to exit
#f) ;; not ok
(if (and (file-exists? servinfofile)
(> (- (current-seconds)(file-modification-time servinfofile)) 30))
(begin
;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it
(debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
(handle-exceptions
exn
((> my-index 2)
(debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile)
(delete-file* servinfofile)
)
#t) ;; not the server but the server is not reachable
(begin
(debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.")
(thread-sleep! 1) ;; just because
#t)))))
(debug:print 0 *default-log-port* "WARNING: there are more than two servers ahead of me, I'm not needed, exiting.")
#f) ;; not ok to not be in first three
(else #t))))
(else ;; should never get here
(debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
(assert #f "Bad server record "leadsrv))))))))
(if ok
(tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
(begin
(debug:print 0 *default-log-port* "Exiting immediately")
(tt:shutdown-server ttdat)
(exit)))
|
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
|
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
|
-
-
+
+
|
((dbr:dbstruct-sync-proc dbstruct) last-update)
(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
(begin
(thread-sleep! 5)
(loop)))))
;; (cleanup) ;; all done by tt:shutdown-server
(debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running.")))
;; (cleanup) ;; all done by tt:shutdown-server
(debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running."))
(define (tt:shutdown-server ttdat)
(let* ((host (tt-host ttdat))
(port (tt-port ttdat))
(sinf (tt-servinf-file ttdat)))
(tt-state-set! ttdat 'shutdown)
|