Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -78,11 +78,11 @@ (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; - (let* ((server-dat (server:get-first-best areapath)) + (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) (port (caddr server-dat))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -414,38 +414,46 @@ ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) - (if (file-exists? fname) - (if (> (- (current-seconds)(file-modification-time fname)) expire-time) - (begin - (delete-file* fname) - (common:simple-file-lock fname expire-time: expire-time)) - #f) - (let ((key-string (conc (get-host-name) "-" (current-process-id)))) - (with-output-to-file fname - (lambda () - (print key-string))) - (thread-sleep! 0.25) - (if (file-exists? fname) - (with-input-from-file fname - (lambda () - (equal? key-string (read-line)))) - #f)))) + (handle-exceptions + exn + #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail. + (if (file-exists? fname) + (if (> (- (current-seconds)(file-modification-time fname)) expire-time) + (begin + (delete-file* fname) + (common:simple-file-lock fname expire-time: expire-time)) + #f) + (let ((key-string (conc (get-host-name) "-" (current-process-id)))) + (with-output-to-file fname + (lambda () + (print key-string))) + (thread-sleep! 0.25) + (if (file-exists? fname) + (with-input-from-file fname + (lambda () + (equal? key-string (read-line)))) + #f))))) (define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) (let ((end-time (+ expire-time (current-seconds)))) (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) (if got-lock #t (if (> end-time (current-seconds)) - (loop (common:simple-file-lock fname expire-time: expire-time)) + (begin + (thread-sleep! 3) + (loop (common:simple-file-lock fname expire-time: expire-time))) #f))))) (define (common:simple-file-release-lock fname) - (delete-file* fname)) + (handle-exceptions + exn + #f ;; I don't really care why this failed (at least for now) + (delete-file* fname))) ;;====================================================================== ;; S T A T E S A N D S T A T U S E S ;;====================================================================== Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -2718,11 +2718,11 @@ (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) + (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) (glob (conc dbdir "/*.db*")))))) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -471,11 +471,16 @@ (if (and start-res ping-res) (let ((url (http-transport:server-dat-make-url start-res))) (remote-conndat-set! *runremote* start-res) (remote-server-url-set! *runremote* url) - (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.")) + (if (server:ping url) + (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.") + (begin + (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url) + (remote-conndat-set! *runremote* #f) + (remote-server-url-set! *runremote* #f)))) (debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.") ))))))) ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? top-path) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -167,19 +167,19 @@ (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) - #t + '() (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) - #f)) + '())) (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) @@ -218,21 +218,24 @@ ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) - (let ((start-time (list-ref rec 3)) - (mod-time (list-ref rec 0))) - ;; (print "start-time: " start-time " mod-time: " mod-time) - (and start-time mod-time - (> (- now start-time) 0) ;; been running at least 0 seconds - (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (< (- now start-time) - (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) - 180) - (random 360))) ;; under one hour running time +/- 180 - ))) + (if (and (list? rec) + (> (length rec) 2)) + (let ((start-time (list-ref rec 3)) + (mod-time (list-ref rec 0))) + ;; (print "start-time: " start-time " mod-time: " mod-time) + (and start-time mod-time + (> (- now start-time) 0) ;; been running at least 0 seconds + (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds + (< (- now start-time) + (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) + 180) + (random 360))) ;; under one hour running time +/- 180 + )) + #f)) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) @@ -240,10 +243,19 @@ (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) + +(define (server:get-rand-best areapath) + (let ((srvrs (server:get-best (server:get-list areapath)))) + (if (list? srvrs) + (let* ((len (length srvrs)) + (idx (random len))) + (list-ref srvrs idx)) + #f))) + (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) @@ -292,13 +304,20 @@ (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; -(define (server:check-if-running areapath) - (let* ((servers (server:get-best (server:get-list areapath)))) - (if (null? servers) +(define (server:check-if-running areapath #!key (numservers "2")) + (let* ((ns (string->number + (or (configf:lookup *configdat* "server" "numservers") numservers))) + (servers (server:get-best (server:get-list areapath)))) + ;; (print "servers: " servers " ns: " ns) + (if (or (and servers + (null? servers)) + (not servers) + (and (list? servers) + (< (length servers) (random ns)))) ;; somewhere between 0 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res