Changes In Branch multi-server-hack Through [8a6ca9fd18] Excluding Merge-Ins
This is equivalent to a diff from 6a476e9ca7 to 8a6ca9fd18
2017-03-24
| ||
13:51 | Show connection stats every 60 seconds. Remove stat of megatest.db from rmt:send-receive, it was happening on every call. check-in: 6baac6187e user: matt tags: multi-server-hack | |
11:27 | Merged v1.63 changes to multi-server-hack check-in: 8a6ca9fd18 user: matt tags: multi-server-hack | |
11:19 | Added some error handling on the locking calls. Silenced a dashboard message. check-in: a5dbcdd2ac user: matt tags: v1.63 | |
2017-03-23
| ||
17:50 | Couple fixes for variable server hack check-in: 842f12e5fe user: matt tags: multi-server-hack | |
16:52 | Hack for variable number of servers (default 3) check-in: e86b57ccb0 user: matt tags: multi-server-hack | |
13:56 | Added message when server is forced Closed-Leaf check-in: 6a476e9ca7 user: matt tags: runaway-servers-fix | |
13:27 | Remove -O4, fixed the force server switch. check-in: c5f5a4ad19 user: matt tags: runaway-servers-fix | |
Modified client.scm from [3c66569adb] to [5611deb23d].
︙ | ︙ | |||
76 77 78 79 80 81 82 | (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server") (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; | | | 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server") (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) (port (caddr server-dat))) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if (and (not area-dat) |
︙ | ︙ |
Modified common.scm from [22d5399385] to [4158ce55d8].
︙ | ︙ | |||
412 413 414 415 416 417 418 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) | > > > | | | | | | | | | | | | | | | | > > | > > > | | 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) (handle-exceptions exn #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail. (if (file-exists? fname) (if (> (- (current-seconds)(file-modification-time fname)) expire-time) (begin (delete-file* fname) (common:simple-file-lock fname expire-time: expire-time)) #f) (let ((key-string (conc (get-host-name) "-" (current-process-id)))) (with-output-to-file fname (lambda () (print key-string))) (thread-sleep! 0.25) (if (file-exists? fname) (with-input-from-file fname (lambda () (equal? key-string (read-line)))) #f))))) (define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) (let ((end-time (+ expire-time (current-seconds)))) (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) (if got-lock #t (if (> end-time (current-seconds)) (begin (thread-sleep! 3) (loop (common:simple-file-lock fname expire-time: expire-time))) #f))))) (define (common:simple-file-release-lock fname) (handle-exceptions exn #f ;; I don't really care why this failed (at least for now) (delete-file* fname))) ;;====================================================================== ;; S T A T E S A N D S T A T U S E S ;;====================================================================== (define *common:std-states* '((0 "ARCHIVED") |
︙ | ︙ |
Modified dashboard.scm from [20957064b0] to [267452dd0e].
︙ | ︙ | |||
2716 2717 2718 2719 2720 2721 2722 | ;; Force creation of the db in case it isn't already there. (tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin | | | 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 | ;; Force creation of the db in case it isn't already there. (tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) (glob (conc dbdir "/*.db*")))))) (define (dashboard:monitor-changed? commondat tabdat) (let* ((run-update-time (current-seconds)) |
︙ | ︙ |
Modified launch.scm from [c20e1b3984] to [442d86a53d].
︙ | ︙ | |||
469 470 471 472 473 474 475 | (start-res (http-transport:client-connect host port)) (ping-res (rmt:login-no-auto-client-setup start-res))) (if (and start-res ping-res) (let ((url (http-transport:server-dat-make-url start-res))) (remote-conndat-set! *runremote* start-res) (remote-server-url-set! *runremote* url) | > | > > > > | 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 | (start-res (http-transport:client-connect host port)) (ping-res (rmt:login-no-auto-client-setup start-res))) (if (and start-res ping-res) (let ((url (http-transport:server-dat-make-url start-res))) (remote-conndat-set! *runremote* start-res) (remote-server-url-set! *runremote* url) (if (server:ping url) (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.") (begin (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url) (remote-conndat-set! *runremote* #f) (remote-server-url-set! *runremote* #f)))) (debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.") ))))))) ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) |
︙ | ︙ |
Modified server.scm from [ba8be5ee9a] to [a07a79fe32].
︙ | ︙ | |||
165 166 167 168 169 170 171 | (define (server:get-list areapath #!key (limit #f)) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) | < > | | 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 | (define (server:get-list areapath #!key (limit #f)) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) '() (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) '())) (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) |
︙ | ︙ | |||
216 217 218 219 220 221 222 | ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) | > > | | | | | | | | | | | > > > > > > > > > > | 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 | ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) (if (and (list? rec) (> (length rec) 2)) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 0) ;; been running at least 0 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds (< (- now start-time) (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) 180) (random 360))) ;; under one hour running time +/- 180 )) #f)) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) (define (server:get-first-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) (define (server:get-rand-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (list? srvrs) (let* ((len (length srvrs)) (idx (random len))) (list-ref srvrs idx)) #f))) (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) (conc host ":" port) #f))) |
︙ | ︙ | |||
290 291 292 293 294 295 296 | (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; | | > > | > > | > > > | 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 | (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath #!key (numservers "2")) (let* ((ns (string->number (or (configf:lookup *configdat* "server" "numservers") numservers))) (servers (server:get-best (server:get-list areapath)))) ;; (print "servers: " servers " ns: " ns) (if (or (and servers (null? servers)) (not servers) (and (list? servers) (< (length servers) (random ns)))) ;; somewhere between 0 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res res (if (null? tal) |
︙ | ︙ |