Overview
Comment: | borked server heartbeat logic |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
ece909ab1c2ccad47eaf87bac003a67d |
User & Date: | mrwellan on 2012-11-02 18:33:58 |
Other Links: | manifest | tags |
Context
2012-11-03
| ||
17:11 | Heartbeat monitoring, on-the-fly server starting all working in simple manual testing check-in: 12d923326a user: matt tags: trunk | |
2012-11-02
| ||
18:33 | borked server heartbeat logic check-in: ece909ab1c user: mrwellan tags: trunk | |
17:36 | Added interface to the monitor db and appropriate handling thereof. check-in: 5f757480e6 user: mrwellan tags: trunk, v1.506 | |
Changes
Modified megatest-version.scm from [4bf7ad110a] to [94672ddb9c].
1 2 3 4 5 | ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) | | | 1 2 3 4 5 6 7 | ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) (define megatest-version 1.5107) |
Modified megatest.scm from [1433ed4462] to [71ef6a547c].
︙ | ︙ | |||
154 155 156 157 158 159 160 161 162 163 164 165 166 167 | ":value" ":expected" ":tol" ":units" ;; misc "-server" "-killserver" "-extract-ods" "-pathmod" "-env2file" "-setvars" "-set-state-status" "-debug" ;; for *verbosity* > 2 "-gen-megatest-test" | > | 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | ":value" ":expected" ":tol" ":units" ;; misc "-server" "-killserver" "-port" "-extract-ods" "-pathmod" "-env2file" "-setvars" "-set-state-status" "-debug" ;; for *verbosity* > 2 "-gen-megatest-test" |
︙ | ︙ | |||
302 303 304 305 306 307 308 | (equal? hostname (car khost-port)) (equal? port (string->number (cadr khost-port))))) (begin (open-run-close tasks:server-deregister tasks:open-db hostname port: port) (if status ;; #t means alive (begin (if (equal? hostname (get-host-name)) | | | | | 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 | (equal? hostname (car khost-port)) (equal? port (string->number (cadr khost-port))))) (begin (open-run-close tasks:server-deregister tasks:open-db hostname port: port) (if status ;; #t means alive (begin (if (equal? hostname (get-host-name)) (process-signal pid signal/term) ;; local machine, send sig term (cdb:kill-server zmq-socket)) ;; remote machine, try telling server to commit suicide (debug:print-info 1 "Killed server by host:port at " hostname ":" port)) (debug:print-info 1 "Removing defunct server record for " hostname ":" port)) (set! killed #t))) (if (and kpid ;; (equal? hostname (car khost-port)) (equal? kpid pid)) ;;; YEP, ALL WITH PID WILL BE KILLED!!! (begin (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid) (set! killed #t) (if status (if (equal? hostname (get-host-name)) (process-signal pid signal/term) ;; local machine, send sig term (debug:print 0 "WARNING: Can't kill a dead server on host " hostname))) (debug:print-info 1 "Killed server by pid at " hostname ":" port))) ;; (if zmq-socket (close-socket zmq-socket)) (format #t fmtstr id mt-ver pid hostname interface port start-time priority (if status "alive" "dead")))) servers) (debug:print-info 1 "Done with listservers") |
︙ | ︙ |
Modified server.scm from [000964c6b9] to [aa1a28766d].
︙ | ︙ | |||
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | (include "db_records.scm") (define (server:make-server-url hostport) (if (not hostport) #f (conc "tcp://" (car hostport) ":" (cadr hostport)))) (define (server:run hostn) (debug:print 0 "Attempting to start the server ...") (if (not *toppath*) (if (not (setup-for-run)) (begin (debug:print 0 "ERROR: cannot find megatest.config, cannot start server, exiting") (exit)))) (let* ((zmq-socket #f) (iface (if (string=? "-" hostn) "*" ;; (get-host-name) hostn)) (hostname (get-host-name)) (ipaddrstr (let ((ipstr (if (string=? "-" hostn) (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") #f))) (if ipstr ipstr hostname)))) ;; (set! zmq-socket (server:find-free-port-and-open iface zmq-socket 5555 0)) | > > | > > > | 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | (include "db_records.scm") (define (server:make-server-url hostport) (if (not hostport) #f (conc "tcp://" (car hostport) ":" (cadr hostport)))) (define *server-loop-heart-beat* (list 'start (current-seconds))) (define (server:run hostn) (debug:print 0 "Attempting to start the server ...") (if (not *toppath*) (if (not (setup-for-run)) (begin (debug:print 0 "ERROR: cannot find megatest.config, cannot start server, exiting") (exit)))) (let* ((zmq-socket #f) (iface (if (string=? "-" hostn) "*" ;; (get-host-name) hostn)) (hostname (get-host-name)) (ipaddrstr (let ((ipstr (if (string=? "-" hostn) (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") #f))) (if ipstr ipstr hostname)))) ;; (set! zmq-socket (server:find-free-port-and-open iface zmq-socket 5555 0)) (set! zmq-socket (server:find-free-port-and-open ipaddrstr zmq-socket (if (args:get-arg "-port") (string->number (args:get-arg "-port")) 5555) 0)) (set! *cache-on* #t) ;; what to do when we quit ;; (on-exit (lambda () (if (and *toppath* *server-id*) (begin |
︙ | ︙ | |||
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | (begin (debug:print-info 0 "Queue not flushed, waiting ...") (loop)))))))) ;; The heavy lifting ;; (let loop () (let* ((rawmsg (receive-message* zmq-socket)) (params (db:string->obj rawmsg)) ;; (with-input-from-string rawmsg (lambda ()(deserialize)))) (res #f)) (debug:print-info 12 "server=> received params=" params) (set! res (cdb:cached-access params)) (debug:print-info 12 "server=> processed res=" res) (send-message zmq-socket (db:obj->string res)) (if (not *time-to-exit*) (loop) (begin | > > > > > > > > | 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | (begin (debug:print-info 0 "Queue not flushed, waiting ...") (loop)))))))) ;; The heavy lifting ;; (let loop () ;; Ugly yuk. (mutex-lock! *incoming-mutex*) (set! *server-loop-heart-beat* (list 'waiting (current-seconds))) (mutex-unlock! *incoming-mutex*) (let* ((rawmsg (receive-message* zmq-socket)) (params (db:string->obj rawmsg)) ;; (with-input-from-string rawmsg (lambda ()(deserialize)))) (res #f)) ;;; Ugly yuk. (mutex-lock! *incoming-mutex*) (set! *server-loop-heart-beat* (list 'working (current-seconds))) (mutex-unlock! *incoming-mutex*) (debug:print-info 12 "server=> received params=" params) (set! res (cdb:cached-access params)) (debug:print-info 12 "server=> processed res=" res) (send-message zmq-socket (db:obj->string res)) (if (not *time-to-exit*) (loop) (begin |
︙ | ︙ | |||
92 93 94 95 96 97 98 | ;; server last used then start shutdown (let loop ((count 0)) (thread-sleep! 3) ;; no need to do this very often (db:write-cached-data) ;; (print "Server running, count is " count) (if (< count 2) ;; 3x3 = 9 secs aprox (loop (+ count 1)) | | > > > > > > > > > > > | > | 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | ;; server last used then start shutdown (let loop ((count 0)) (thread-sleep! 3) ;; no need to do this very often (db:write-cached-data) ;; (print "Server running, count is " count) (if (< count 2) ;; 3x3 = 9 secs aprox (loop (+ count 1)) (let ((numrunning (open-run-close db:get-count-tests-running #f)) (server-loop-heartbeat #f)) ;;; Ugly yuk. (mutex-lock! *incoming-mutex*) (set! server-loop-heartbeat *server-loop-heart-beat*) (mutex-unlock! *incoming-mutex*) ;; The logic here is that if the server loop gets stuck blocked in working ;; we don't want to update our heartbeat (let ((server-state (car server-loop-heartbeat)) (server-update (cadr server-loop-heartbeat))) (if (or (eq? server-state 'waiting) (< (- (current-seconds) server-update) 10)) (open-run-close tasks:server-update-heartbeat tasks:open-db *server-id*) (debug:print "ERROR: No heartbeat update, server appears stuck"))) (if (or (> numrunning 0) ;; stay alive for two days after last access (> (+ *last-db-access* (* 48 60 60))(current-seconds))) (begin (debug:print-info 2 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*)) (loop 0)) (begin (debug:print-info 0 "Starting to shutdown the server.") |
︙ | ︙ | |||
170 171 172 173 174 175 176 | (define (server:client-logout zmq-socket) (let ((ok (and (socket? zmq-socket) (cdb:logout zmq-socket *toppath* (server:get-client-signature))))) ;; (close-socket zmq-socket) ok)) ;; Do all the connection work, start a server if not already running | | | 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | (define (server:client-logout zmq-socket) (let ((ok (and (socket? zmq-socket) (cdb:logout zmq-socket *toppath* (server:get-client-signature))))) ;; (close-socket zmq-socket) ok)) ;; Do all the connection work, start a server if not already running (define (server:client-setup #!key (numtries 50)(do-ping #f)) (if (not *toppath*) (if (not (setup-for-run)) (begin (debug:print 0 "ERROR: failed to find megatest.config, exiting") (exit)))) (let ((hostinfo (open-run-close tasks:get-best-server tasks:open-db do-ping: do-ping))) (if hostinfo |
︙ | ︙ | |||
210 211 212 213 214 215 216 | (if (> numtries 0) (let ((exe (car (argv)))) (debug:print-info 1 "No server available, attempting to start one...") (process-run exe (list "-server" "-" "-debug" (conc *verbosity*))) (sleep 2) ;; not doing ping, assume the server started and registered itself (server:client-setup numtries: (- numtries 1) do-ping: #f)) | | | 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | (if (> numtries 0) (let ((exe (car (argv)))) (debug:print-info 1 "No server available, attempting to start one...") (process-run exe (list "-server" "-" "-debug" (conc *verbosity*))) (sleep 2) ;; not doing ping, assume the server started and registered itself (server:client-setup numtries: (- numtries 1) do-ping: #f)) (debug:print-info 1 "Too many attempts, giving up"))))) (define (server:launch) (if (not *toppath*) (if (not (setup-for-run)) (begin (debug:print 0 "ERROR: cannot find megatest.config, exiting") (exit)))) |
︙ | ︙ |
Modified tasks.scm from [3a1458e323] to [efc84088da].
︙ | ︙ | |||
117 118 119 120 121 122 123 | server-id (tasks:server-get-server-id mdb hostname port pid))) (heartbeat-delta 99e9)) (sqlite3:for-each-row (lambda (delta) (set! heartbeat-delta delta)) mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id) | | | 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | server-id (tasks:server-get-server-id mdb hostname port pid))) (heartbeat-delta 99e9)) (sqlite3:for-each-row (lambda (delta) (set! heartbeat-delta delta)) mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id) (< heartbeat-delta 10))) (define (tasks:client-register mdb pid hostname cmdline) (sqlite3:execute mdb "INSERT OR REPLACE INTO clients (server_id,pid,hostname,cmdline,login_time) VALUES(?,?,?,?,strftime('%s','now'));") (tasks:server-get-server-id mdb) pid hostname cmdline) |
︙ | ︙ |