Overview
Comment: | re-implemented old v1.60 style server timeout handling. Something hahad gotten lost in the translation |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.64 |
Files: | files | file ages | folders |
SHA1: |
7654d9b17b55fc464cd6bc8ee3b2f17b |
User & Date: | matt on 2017-07-12 23:12:34 |
Other Links: | branch diff | manifest | tags |
Context
2017-07-13
| ||
04:19 | Oops. Change units for server expiration time back to hours check-in: fafe58dfc3 user: matt tags: v1.64 | |
2017-07-12
| ||
23:12 | re-implemented old v1.60 style server timeout handling. Something hahad gotten lost in the translation check-in: 7654d9b17b user: matt tags: v1.64 | |
17:37 | fixed bug check-in: b27bd09cf2 user: bjbarcla tags: v1.64 | |
Changes
Modified common.scm from [b3a2e136d0] to [d2bb16b225].
︙ | ︙ | |||
158 159 160 161 162 163 164 | (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) | | | 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (server:expiration-timeout)) (force-server #f) (ro-mode #f) (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode ;; launching and hosts (defstruct host (reachable #f) |
︙ | ︙ |
Modified http-transport.scm from [9dc12bb28f] to [e390d2210b].
︙ | ︙ | |||
381 382 383 384 385 386 387 | (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) | | | 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 | (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) (server-timeout (server:expiration-timeout)) (server-going #f) (server-log-file (args:get-arg "-log"))) ;; always set when we are a server (with-output-to-file started-file (lambda ()(print (current-process-id)))) (let loop ((count 0) (server-state 'available) |
︙ | ︙ | |||
438 439 440 441 442 443 444 | (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) | | < < < < < | < | 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 | (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk?") (if (not *server-overloaded*) |
︙ | ︙ |
Modified launch.scm from [bc68bfb44c] to [efe1a9c8e9].
︙ | ︙ | |||
465 466 467 468 469 470 471 | ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... ;; (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! *toppath* top-path) (setenv "MT_TEST_RUN_DIR" work-area) | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 | ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... ;; (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! *toppath* top-path) (setenv "MT_TEST_RUN_DIR" work-area) ;; ;; On NFS it can be slow and unreliable to get needed startup information. ;; ;; i. Check if we are on the homehost, if so, proceed ;; ;; ii. Check if host and port passed in via CMDINFO are valid and if ;; ;; possible use them. ;; (let ((bestadrs (server:get-best-guess-address (get-host-name))) ;; (needcare #f)) ;; (if (equal? homehost bestadrs) ;; we are likely on the homehost ;; (debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost) ;; (let ((host-port (if serverurl (string-split serverurl ":") #f))) ;; (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote* ;; (if (string? homehost) ;; (if (and host-port ;; (> (length host-port) 1)) ;; (let* ((host (car host-port)) ;; (port (cadr host-port)) ;; (start-res (http-transport:client-connect host port)) ;; (ping-res (rmt:login-no-auto-client-setup start-res))) ;; (if (and start-res ;; ping-res) ;; ;; (begin ;; let ((url (http-transport:server-dat-make-url start-res))) ;; (begin ;; (remote-conndat-set! *runremote* start-res) ;; ;; (remote-server-url-set! *runremote* url) ;; ;; (if (server:ping url) ;; (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data.")) ;; (begin ;; (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port) ;; (set! *runremote* #f)) ;; ;; (remote-conndat-set! *runremote* #f)) ;; )) ;; (begin ;; (set! *runremote* #f) ;; (debug:print-info 0 *default-log-port* (if host-port ;; (conc "received invalid host-port information " host-port) ;; "no host-port information received")) ;; ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. ;; (set! needcare #t))) ;; (begin ;; (set! *runremote* #f) ;; (debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.") ;; (set! needcare #t))))) ;; (if needcare ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host ;; (let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory ;; (handle-exceptions ;; exn ;; (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) ;; (create-directory logdir #t))))) ;; ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (common:file-exists? top-path) (> count 10)) (change-directory top-path) (begin (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found") |
︙ | ︙ |
Modified rmt.scm from [bc431c6c0b] to [efb226ceca].
︙ | ︙ | |||
126 127 128 129 130 131 132 | (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area. cmd="cmd" params="params) #f) ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) ;; | | | | | | | | | > | | | | | | | 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area. cmd="cmd" params="params) #f) ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) ;; ;;DOT CASE4 [label="reset\nconnection"]; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} ;;DOT CASE4 -> "rmt:send-receive"; ;; reset the connection if it has been unused too long ((and runremote (remote-conndat runremote) (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) (remote-server-timeout runremote)))) (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") (http-transport:close-connections area-dat: runremote) (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;;DOT CASE5 [label="local\nread"]; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; ;;DOT CASE5 -> "rmt:open-qry-close-locally"; ;; on homehost and this is a read ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required (cdr (remote-hh-dat runremote)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read |
︙ | ︙ | |||
242 243 244 245 246 247 248 249 | ((exn)(vector #f "other fail" (print-call-chain))))) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (and (vector? conninfo) (> 5 (vector-length conninfo))) (http-transport:server-dat-update-last-access conninfo) ;; refresh access time | > > > > | | 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | ((exn)(vector #f "other fail" (print-call-chain))))) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (and (vector? conninfo) (> 5 (vector-length conninfo))) (http-transport:server-dat-update-last-access conninfo) ;; refresh access time (begin (set! conninfo #f) (remote-conndat-set! runremote #f))) ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (vector? res) |
︙ | ︙ |
Modified server.scm from [52a482f03f] to [e95564f4fe].
︙ | ︙ | |||
437 438 439 440 441 442 443 | (define (server:login toppath) (lambda (toppath) (set! *db-last-access* (current-seconds)) ;; might not be needed. (if (equal? *toppath* toppath) #t #f))) | | > | | | | < < < | | 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 | (define (server:login toppath) (lambda (toppath) (set! *db-last-access* (current-seconds)) ;; might not be needed. (if (equal? *toppath* toppath) #t #f))) ;; timeout is hms string: 1h 5m 3s, default is 1 minute ;; (define (server:expiration-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (common:hms-string->seconds tmo)) (string->number tmo) 60))) ;; moving this here as it needs access to db and cannot be in common. ;; (define (server:writable-watchdog dbstruct) (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (debug-mode (debug:debug-mode 1)) |
︙ | ︙ |