Overview
Comment: | Improve behavior under high load where servers were being detected as dead and new servers were started |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
569cc10ef2cafb6ddc6016dcd21f7ef5 |
User & Date: | matt on 2012-11-03 21:32:50 |
Other Links: | manifest | tags |
Context
2012-11-05
| ||
10:21 | Bumped version, reduced noise from the server process in normal debug mode check-in: 5043b0da9e user: mrwellan tags: trunk, v1.5109 | |
2012-11-03
| ||
21:32 | Improve behavior under high load where servers were being detected as dead and new servers were started check-in: 569cc10ef2 user: matt tags: trunk | |
19:47 | Reversed order for selecting servers so oldest is always choosen. check-in: 9b1da0d111 user: mrwellan tags: trunk | |
Changes
Modified server.scm from [d8bc67a0d6] to [927a23d32e].
︙ | ︙ | |||
31 32 33 34 35 36 37 | (define *server-loop-heart-beat* (current-seconds)) (define *heartbeat-mutex* (make-mutex)) (define (server:self-ping iface port) (let ((zsocket (server:client-connect iface port))) (let loop () | | | 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | (define *server-loop-heart-beat* (current-seconds)) (define *heartbeat-mutex* (make-mutex)) (define (server:self-ping iface port) (let ((zsocket (server:client-connect iface port))) (let loop () (thread-sleep! 2) (cdb:client-call zsocket 'ping #t) (debug:print 4 "server:self-ping - I'm alive on " iface ":" port "!") (mutex-lock! *heartbeat-mutex*) (set! *server-loop-heart-beat* (current-seconds)) (mutex-unlock! *heartbeat-mutex*) (loop)))) |
︙ | ︙ | |||
122 123 124 125 126 127 128 | ;; run server:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; (define (server:keep-running) ;; if none running or if > 20 seconds since ;; server last used then start shutdown (let loop ((count 0)) | | | | | > | | | | | | | | > | 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | ;; run server:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; (define (server:keep-running) ;; if none running or if > 20 seconds since ;; server last used then start shutdown (let loop ((count 0)) (thread-sleep! 4) ;; no need to do this very often (db:write-cached-data) ;; (print "Server running, count is " count) (if (< count 1) ;; 3x3 = 9 secs aprox (loop (+ count 1)) (let (;; (numrunning (open-run-close db:get-count-tests-running #f)) (server-loop-heartbeat #f) (server-info #f) (pulse 0)) ;; BUG add a wait on server alive here!! ;; ;; Ugly yuk. (mutex-lock! *heartbeat-mutex*) (set! server-loop-heartbeat *server-loop-heart-beat*) (set! server-info *server-info*) (mutex-unlock! *heartbeat-mutex*) ;; The logic here is that if the server loop gets stuck blocked in working ;; we don't want to update our heartbeat (set! pulse (- (current-seconds) server-loop-heartbeat)) (debug:print-info 1 "Heartbeat period is " pulse " seconds on " (cadr server-info) ":" (caddr server-info) ", last db access is " (- (current-seconds) *last-db-access*) " seconds ago") (if (> pulse 15) ;; must stay less than 10 seconds (begin (open-run-close tasks:server-deregister tasks:open-db (cadr server-info) port: (caddr server-info)) (debug:print 0 "ERROR: Heartbeat failed, committing servercide") (exit)) (open-run-close tasks:server-update-heartbeat tasks:open-db (car server-info))) ;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access (if (> (+ *last-db-access* ;; (* 48 60 60) ;; 48 hrs ;; 60 ;; one minute (* 60 60) ;; one hour ) (current-seconds)) (begin ;; (debug:print-info 2 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*)) (debug:print-info 2 "Server continuing, seconds since last db access: " (- (current-seconds) *last-db-access*)) (loop 0)) (begin (debug:print-info 0 "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) (set! *time-to-exit* #t) (open-run-close tasks:server-deregister-self tasks:open-db (get-host-name)) (thread-sleep! 1) |
︙ | ︙ | |||
292 293 294 295 296 297 298 | (debug:print-info 1 "NOT starting new server, one is already running on " (car hostinfo) ":" (cadr hostinfo)) (if *toppath* (let* ((th1 (make-thread (lambda () (let ((server-info #f)) ;; wait for the server to be online and available (let loop () (debug:print-info 1 "Waiting for the server to come online before starting heartbeat") | | | | 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 | (debug:print-info 1 "NOT starting new server, one is already running on " (car hostinfo) ":" (cadr hostinfo)) (if *toppath* (let* ((th1 (make-thread (lambda () (let ((server-info #f)) ;; wait for the server to be online and available (let loop () (debug:print-info 1 "Waiting for the server to come online before starting heartbeat") (thread-sleep! 2) (mutex-lock! *heartbeat-mutex*) (set! server-info *server-info* ) (mutex-unlock! *heartbeat-mutex*) (if (not server-info)(loop))) (debug:print 1 "Server alive, starting self-ping") (server:self-ping (cadr server-info)(caddr server-info)))) "Self ping")) (th2 (make-thread (lambda () (server:run (args:get-arg "-server"))) "Server run")) (th3 (make-thread (lambda () (server:keep-running)) "Keep running"))) (set! *client-non-blocking-mode* #t) (thread-start! th1) (thread-start! th2) (thread-start! th3) (set! *didsomething* #t) (thread-join! th3)) (debug:print 0 "ERROR: Failed to setup for megatest"))) (exit))) (define (server:client-launch #!key (do-ping #f)) (if (server:client-setup) (debug:print-info 2 "connected as client") (begin (debug:print 0 "ERROR: Failed to connect as client") (exit)))) ;; ping a server and return number of clients or #f (if no response) (define (server:ping host port #!key (secs 10)(return-socket #f)) |
︙ | ︙ |
Modified tasks.scm from [228d197129] to [15052bf640].
︙ | ︙ | |||
24 25 26 27 28 29 30 31 32 33 34 35 36 37 | (define (tasks:open-db) (let* ((dbpath (conc *toppath* "/monitor.db")) (exists (file-exists? dbpath)) (mdb (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath)) (handler (make-busy-timeout 36000))) (sqlite3:set-busy-handler! mdb handler) (if (not exists) (begin (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY, action TEXT DEFAULT '', owner TEXT, state TEXT DEFAULT 'new', target TEXT DEFAULT '', | > | 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | (define (tasks:open-db) (let* ((dbpath (conc *toppath* "/monitor.db")) (exists (file-exists? dbpath)) (mdb (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath)) (handler (make-busy-timeout 36000))) (sqlite3:set-busy-handler! mdb handler) (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;")) (if (not exists) (begin (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY, action TEXT DEFAULT '', owner TEXT, state TEXT DEFAULT 'new', target TEXT DEFAULT '', |
︙ | ︙ | |||
186 187 188 189 190 191 192 | (loop (car tal)(cdr tal)))))))))) (define (tasks:kill-server status hostname port pid) (debug:print-info 1 "Removing defunct server record for " hostname ":" port) (if port (open-run-close tasks:server-deregister tasks:open-db hostname port: port) (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid)) | < | 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | (loop (car tal)(cdr tal)))))))))) (define (tasks:kill-server status hostname port pid) (debug:print-info 1 "Removing defunct server record for " hostname ":" port) (if port (open-run-close tasks:server-deregister tasks:open-db hostname port: port) (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid)) (if status ;; #t means alive (begin (if (equal? hostname (get-host-name)) (handle-exceptions exn (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n" " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) |
︙ | ︙ |