Megatest

Check-in [87813c75d2]
Login
Overview
Comment:Improve some of the logic around servers
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: 87813c75d277c0130b42616fd865e1a02029dde1
User & Date: mrwellan on 2023-03-02 21:32:23
Other Links: branch diff | manifest | tags
Context
2023-03-03
02:16
Speculative fix for wrong server access issue check-in: 0f348aa84b user: mrwellan tags: v1.80
2023-03-02
21:32
Improve some of the logic around servers check-in: 87813c75d2 user: mrwellan tags: v1.80
19:19
Better message check-in: c3879943f0 user: mrwellan tags: v1.80
Changes

Modified tcp-transportmod.scm from [99ffeec5af] to [6b42c5c5cc].

215
216
217
218
219
220
221
222
223






224
225
226
227
228
229
230
215
216
217
218
219
220
221


222
223
224
225
226
227
228
229
230
231
232
233
234







-
-
+
+
+
+
+
+







			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
			(servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname)))
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (if (file-exists? servinf)
		       (begin
			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", attempting to remove servinfo file.")
			 (delete-file* servinf))
			 (debug:print 0 *default-log-port* "INFO: no ping response from server "host":"port" for "dbfname)
			 (if (and (file-exists? servinf)
				  (> (- (current-seconds)(file-modification-time servinf)) 60))
			     (begin
			       (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
			       (delete-file* servinf))))
		       (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf))
		   (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
		 (assert #f "FATAL: tt:handler received bad data "res)))))
	(begin
	  (thread-sleep! 1) ;; give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))

370
371
372
373
374
375
376
377

378
379
380
381
382
383
384
385
386

387
388
389
390
391
392
393
394
395
396
397
398
399

400
401
402
403
404


405

406
407
408
409
410
411
412
374
375
376
377
378
379
380

381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403

404
405
406
407
408
409
410
411

412
413
414
415
416
417
418
419







-
+









+












-
+





+
+
-
+







    (thread-sleep! 0.05) ;; any real need for delay here?
    (let loop ()
      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
	     (ok      (cond
		       ((null? servers) #f) ;; not ok
		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
				(tt-servinf-file ttdat))
			(debug:print-info 0 *default-log-port* "Keep running, I'm the top server.")
			(debug:print-info 0 *default-log-port* "Keep running, I'm the top server on "(tt-host ttdat)":"(tt-port ttdat))
			(if db-locked-in
			    #t
			    (let* ((lockinfo  (dbfile:with-no-sync-db nosyncdbpath
								      (lambda (db)
									(db:no-sync-get-lock db dbfname))))
				   (success   (car lockinfo)))
			      (if success
				  (begin
				    (debug:print 0 *default-log-port* "Got server lock for "dbfname)
				    (set! db-locked-in #t)
				    #t)
				  (begin
				    (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
				    #f)))))
		       (else
			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
			    ((host port startseconds server-id pid dbfname servinfofile)
			     (if (tt:ping host port server-id)
				 #f ;; not the server, but all good, want to exit
				 (if (and (file-exists? servinfofile)
					  (> (- (current-seconds)(file-modification-time servinfofile)) 5))
					  (> (- (current-seconds)(file-modification-time servinfofile)) 15))
				     (begin
				       ;; can't ping and file has been on disk 5 seconds, go ahead and try to remove it
				       (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
				       (delete-file* servinfofile)
				       #t) ;; not the server but the server is not reachable
				     (begin
				       (debug:print 0 *default-log-port* "I'm not the server but will try again since "servinfofile" is fresh")
				     #t)))
				       #t))))
			    (else ;; should never get here
			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
			     (assert #f "Bad server record "leadsrv))))))))
	(if ok
	    ;; (if (> *api-process-request-count* 0) ;; have requests in flight
	    ;;	(tt-last-access-set! ttdat (current-seconds)))
	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
614
615
616
617
618
619
620


621

622
623
624
625
626
627
628
621
622
623
624
625
626
627
628
629

630
631
632
633
634
635
636
637







+
+
-
+







;;  if there is already a serv-listener return the udata
;;
(define (setup-listener uconn #!optional (port 4242))
  (assert (tt? uconn) "FATAL: setup-listener called with wrong struct "uconn)
  (handle-exceptions
   exn
   (if (< port 65535)
       (begin
	 (thread-sleep! 0.25)
       (setup-listener uconn (+ port 1))
	 (setup-listener uconn (+ port 1)))
       #f)
   (connect-listener uconn port)))

(define (connect-listener uconn port)
  ;; (tcp-listener-socket LISTENER)(socket-name so)
  ;; sockaddr-address, sockaddr-port, sockaddr->string
  (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])