Overview
Comment: | Remove -O4, fixed the force server switch. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | runaway-servers-fix |
Files: | files | file ages | folders |
SHA1: | c5f5a4ad1973b77052bd8d7cadef4db5 |
User & Date: | matt on 2017-03-23 13:27:38 |
Other Links: | branch diff | manifest | tags |
Context
2017-03-23
| ||
13:56 | Added message when server is forced. Closed-Leaf check-in: 6a476e9ca7 user: matt tags: runaway-servers-fix | |
13:38 | merged lockfile fix check-in: f25594cb64 user: bjbarcla tags: v1.63 | |
13:27 | Remove -O4, fixed the force server switch. check-in: c5f5a4ad19 user: matt tags: runaway-servers-fix | |
12:49 | adding some file-exists? protections and a fix to force-server? from 1.63 check-in: 211ecbabeb user: bjbarcla tags: runaway-servers-fix | |
Changes
Modified Makefile from [daa4306c4d] to [693a5b2d45].
1 2 3 4 | # make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)' # rm <files>.o ; make install CSCOPTS='-profile' ; ... ; chicken-profile | less PREFIX=$(PWD) | | | 1 2 3 4 5 6 7 8 9 10 11 12 | # make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)' # rm <files>.o ; make install CSCOPTS='-profile' ; ... ; chicken-profile | less PREFIX=$(PWD) CSCOPTS= INSTALL=install SRCFILES = common.scm items.scm launch.scm \ ods.scm runconfig.scm server.scm configf.scm \ db.scm keys.scm margs.scm megatest-version.scm \ process.scm runs.scm tasks.scm tests.scm genexample.scm \ http-transport.scm filedb.scm \ client.scm synchash.scm daemon.scm mt.scm \ |
︙ | ︙ |
Modified common.scm from [fe39965c84] to [73b38805aa].
︙ | ︙ | |||
428 429 430 431 432 433 434 | (print key-string))) (thread-sleep! 0.25) (if (file-exists? fname) (with-input-from-file fname (lambda () (equal? key-string (read-line)))) #f)))) | | > > > > > > > > > | 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 | (print key-string))) (thread-sleep! 0.25) (if (file-exists? fname) (with-input-from-file fname (lambda () (equal? key-string (read-line)))) #f)))) (define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) (let ((end-time (+ expire-time (current-seconds)))) (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) (if got-lock #t (if (> end-time (current-seconds)) (loop (common:simple-file-lock fname expire-time: expire-time)) #f))))) (define (common:simple-file-release-lock fname) (delete-file* fname)) ;;====================================================================== ;; S T A T E S A N D S T A T U S E S ;;====================================================================== |
︙ | ︙ | |||
1042 1043 1044 1045 1046 1047 1048 | (not (or (args:get-arg "-no-cache") (and *configdat* (equal? (configf:lookup *configdat* "setup" "use-cache") "no"))))) ;; force use of server? ;; (define (common:force-server?) | | | | > > > | 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 | (not (or (args:get-arg "-no-cache") (and *configdat* (equal? (configf:lookup *configdat* "setup" "use-cache") "no"))))) ;; force use of server? ;; (define (common:force-server?) (let* ((force-setting (configf:lookup *configdat* "server" "force")) (force-type (if force-setting (string->symbol force-setting) #f))) (case force-type ((#f) #f) ((always) #t) ((test) (if (args:get-arg "-execute") ;; we are in a test #t #f)) (else (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") #t)))) ;; default to requiring server ;;====================================================================== ;; M I S C L I S T S ;;====================================================================== ;; items in lista are matched value and position in listb ;; return the remaining items in listb or #f |
︙ | ︙ |
Modified rmt.scm from [9187b44a8c] to [1adf35b1f4].
︙ | ︙ | |||
93 94 95 96 97 98 99 | (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area ((not runremote) | | | 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 | (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area ((not runremote) (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record ((not (pair? (remote-hh-dat runremote))) ;; not on homehost (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little (remote-hh-dat-set! runremote (common:get-homehost)) |
︙ | ︙ | |||
140 141 142 143 144 145 146 | (cdr (remote-hh-dat runremote)) ;; new (not (remote-server-url runremote)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed | > > | > > > | | | | 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | (cdr (remote-hh-dat runremote)) ;; new (not (remote-server-url runremote)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed (if (common:force-server?) (server:start-and-wait *toppath*) (server:kind-run *toppath*)))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") (rmt:open-qry-close-locally cmd 0 params)) ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one (not (remote-conndat runremote))) (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost (not (remote-conndat runremote)))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! 
*rmt-mutex*) (server:start-and-wait *toppath*) (remote-force-server-set! runremote (common:force-server?)) (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query ((and (not (remote-force-server runremote)) (cdr (remote-hh-dat runremote))) ;; we are on homehost (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") |
︙ | ︙ |
Modified server.scm from [948061a1f4] to [ba8be5ee9a].
︙ | ︙ | |||
268 269 270 271 272 273 274 | ((1) 20) ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (begin | | | 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 | ((1) 20) ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (begin (common:simple-file-lock-and-wait lock-file expire-time: 15) (server:run areapath) (thread-sleep! 5) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) |
︙ | ︙ |
Modified tests/fdktestqa/testqa/megatest.config from [d32541500d] to [200e742890].
1 2 3 4 5 | [setup] testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log # launchwait no launch-delay 0 | < < < | 1 2 3 4 5 6 7 8 9 10 11 12 | [setup] testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log # launchwait no launch-delay 0 # All these are overridden in ../fdk.config # [jobtools] # launcher nbfake # launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log [include ../fdk.config] |
︙ | ︙ |