Overview
Comment: | Added exception handling to server kill |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | v1.5108 |
Files: | files | file ages | folders |
SHA1: |
6aea490eacae55955f4ae0e77cb7b1d4 |
User & Date: | matt on 2012-11-03 18:20:15 |
Other Links: | manifest | tags |
Context
2012-11-03
| ||
19:47 | Reversed order for selecting servers so oldest is always choosen. check-in: 9b1da0d111 user: mrwellan tags: trunk | |
18:20 | Added exception handling to server kill check-in: 6aea490eac user: matt tags: trunk, v1.5108 | |
17:18 | Added more instrumentation info on server pulse. Removed rm of monitor.db in Makefile check-in: 04c31891a9 user: matt tags: trunk | |
Changes
Modified megatest-version.scm from [94672ddb9c] to [c0fe9af7ba].
1 2 3 4 5 | ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) | | | 1 2 3 4 5 6 7 | ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) (define megatest-version 1.5108) |
Modified megatest.scm from [c4c3f3d52b] to [5c2117b3b6].
︙ | ︙ | |||
294 295 296 297 298 299 300 | (state (vector-ref server 7)) (mt-ver (vector-ref server 8)) (status (open-run-close tasks:server-alive? tasks:open-db #f hostname: hostname port: port)) (killed #f) (zmq-socket (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server | | > > | | | | | 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 | (state (vector-ref server 7)) (mt-ver (vector-ref server 8)) (status (open-run-close tasks:server-alive? tasks:open-db #f hostname: hostname port: port)) (killed #f) (zmq-socket (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server (if (not status) ;; no point in keeping dead records in the db (open-run-close tasks:server-deregister tasks:open-db hostname port: port pid: pid)) (if (and khost-port ;; kill by host/port (equal? hostname (car khost-port)) (equal? port (string->number (cadr khost-port)))) (tasks:kill-server status hostname port pid)) (if (and kpid (equal? hostname (get-host-name)) (equal? kpid pid)) ;;; YEP, ALL WITH PID WILL BE KILLED!!! (tasks:kill-server status hostname #f pid)) (format #t fmtstr id mt-ver pid hostname interface port start-time priority (if status "alive" "dead")))) servers) (debug:print-info 1 "Done with listservers") |
︙ | ︙ |
Modified server.scm from [ebf5897ef4] to [d8bc67a0d6].
︙ | ︙ | |||
140 141 142 143 144 145 146 | (mutex-lock! *heartbeat-mutex*) (set! server-loop-heartbeat *server-loop-heart-beat*) (set! server-info *server-info*) (mutex-unlock! *heartbeat-mutex*) ;; The logic here is that if the server loop gets stuck blocked in working ;; we don't want to update our heartbeat (set! pulse (- (current-seconds) server-loop-heartbeat)) | | | 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | (mutex-lock! *heartbeat-mutex*) (set! server-loop-heartbeat *server-loop-heart-beat*) (set! server-info *server-info*) (mutex-unlock! *heartbeat-mutex*) ;; The logic here is that if the server loop gets stuck blocked in working ;; we don't want to update our heartbeat (set! pulse (- (current-seconds) server-loop-heartbeat)) (debug:print-info 1 "Heartbeat period is " pulse " seconds on " (cadr server-info) ":" (caddr server-info) ", last db access is " (- (current-seconds) *last-db-access*) " seconds ago") (if (> pulse 11) ;; must stay less than 10 seconds (begin (debug:print 0 "ERROR: Heartbeat failed, committing servercide") (exit)) (open-run-close tasks:server-update-heartbeat tasks:open-db (car server-info))) (if (or (> numrunning 0) ;; stay alive for two days after last access (> (+ *last-db-access* |
︙ | ︙ |
Modified tasks.scm from [6c83278de0] to [67418262aa].
︙ | ︙ | |||
122 123 124 125 126 127 128 | server-id (tasks:server-get-server-id mdb hostname port pid))) (heartbeat-delta 99e9)) (sqlite3:for-each-row (lambda (delta) (set! heartbeat-delta delta)) mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id) | < | 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | server-id (tasks:server-get-server-id mdb hostname port pid))) (heartbeat-delta 99e9)) (sqlite3:for-each-row (lambda (delta) (set! heartbeat-delta delta)) mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id) (< heartbeat-delta 10))) (define (tasks:client-register mdb pid hostname cmdline) (sqlite3:execute mdb "INSERT OR REPLACE INTO clients (server_id,pid,hostname,cmdline,login_time) VALUES(?,?,?,?,strftime('%s','now'));") (tasks:server-get-server-id mdb hostname #f pid) |
︙ | ︙ | |||
185 186 187 188 189 190 191 | (if (null? tal) #f (loop (car tal)(cdr tal)))))))))) (define (tasks:kill-server status hostname port pid) (debug:print-info 1 "Removing defunct server record for " hostname ":" port) (if port | | | > | > > | | | | | 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 | (if (null? tal) #f (loop (car tal)(cdr tal)))))))))) (define (tasks:kill-server status hostname port pid) (debug:print-info 1 "Removing defunct server record for " hostname ":" port) (if port (open-run-close tasks:server-deregister tasks:open-db hostname port: port) (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid)) (if status ;; #t means alive (begin (if (equal? hostname (get-host-name)) (handle-exceptions exn (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n" " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 1 "Sending signal/term to " pid " on " hostname) (process-signal pid signal/term) (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill (process-signal pid signal/kill)) ;; local machine, send sig term (begin (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide") (cdb:kill-server zmq-socket)))) ;; remote machine, try telling server to commit suicide (begin (if status (if (equal? hostname (get-host-name)) (begin |
︙ | ︙ |