Overview
Comment: | Partial fix to server start issue (breaks startup when db contains dead server). |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.60 |
Files: | files | file ages | folders |
SHA1: | 05fa3869fb95d8746b773d1e2985529a |
User & Date: | mrwellan on 2014-02-26 14:36:52 |
Other Links: | branch diff | manifest | tags |
Context
2014-02-26
| ||
23:42 | Moved ping of server into a separate process (check-in: cb5f28cabe user: matt tags: v1.60) | |
14:36 | Partial fix to server start issue (breaks startup when db contains dead server). (check-in: 05fa3869fb user: mrwellan tags: v1.60) | |
08:16 | Side stuff, was not checked in (Closed-Leaf check-in: db9154c20c user: mrwellan tags: side-stuff) | |
2014-02-25
| ||
23:11 | Centralize server starts to server 0 and pace out same-run-id starts by 40 seconds (check-in: 837646d7e8 user: matt tags: v1.60) | |
Changes
Modified Makefile from [26b6092a78] to [ac2c437624].
︙ | ︙ | |||
60 61 62 63 64 65 66 67 68 69 70 71 72 73 | # Special dependencies for the includes tests.o db.o launch.o runs.o dashboard-tests.o dashboard-guimonitor.o dashboard-main.o monitor.o dashboard.o megatest.o : db_records.scm tests.o runs.o dashboard.o dashboard-tests.o dashboard-main.o : run_records.scm db.o ezsteps.o keys.o launch.o megatest.o monitor.o runs-for-ref.o runs.o tests.o : key_records.scm tests.o tasks.o dashboard-tasks.o : task_records.scm runs.o : test_records.scm megatest.o : megatest-fossil-hash.scm # Temporary while transitioning to new routine # runs.o : run-tests-queue-classic.scm run-tests-queue-new.scm megatest-fossil-hash.scm : $(SRCFILES) megatest.scm *_records.scm echo "(define megatest-fossil-hash \"$(MTESTHASH)\")" > megatest-fossil-hash.new if ! diff -q megatest-fossil-hash.new megatest-fossil-hash.scm ; then echo copying .new to .scm;cp -f megatest-fossil-hash.new megatest-fossil-hash.scm;fi | > | 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | # Special dependencies for the includes tests.o db.o launch.o runs.o dashboard-tests.o dashboard-guimonitor.o dashboard-main.o monitor.o dashboard.o megatest.o : db_records.scm tests.o runs.o dashboard.o dashboard-tests.o dashboard-main.o : run_records.scm db.o ezsteps.o keys.o launch.o megatest.o monitor.o runs-for-ref.o runs.o tests.o : key_records.scm tests.o tasks.o dashboard-tasks.o : task_records.scm runs.o : test_records.scm megatest.o : megatest-fossil-hash.scm client.scm common.scm configf.scm dashboard-guimonitor.scm dashboard-tests.scm dashboard.scm db.scm dcommon.scm ezsteps.scm fs-transport.scm http-transport.scm index-tree.scm items.scm keys.scm launch.scm megatest.scm monitor.scm mt.scm newdashboard.scm runconfig.scm runs.scm server.scm tdb.scm tests.scm tree.scm zmq-transport.scm : common_records.scm # Temporary while transitioning to new routine # runs.o : run-tests-queue-classic.scm run-tests-queue-new.scm megatest-fossil-hash.scm : $(SRCFILES) megatest.scm *_records.scm echo 
"(define megatest-fossil-hash \"$(MTESTHASH)\")" > megatest-fossil-hash.new if ! diff -q megatest-fossil-hash.new megatest-fossil-hash.scm ; then echo copying .new to .scm;cp -f megatest-fossil-hash.new megatest-fossil-hash.scm;fi |
︙ | ︙ |
Modified client.scm from [470e0682b1] to [dd7ce02868].
︙ | ︙ | |||
57 58 59 60 61 62 63 | ;; (define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries) (if (<= remaining-tries 0) (begin (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) (exit 1)) | | | | | | | > | | | > > | > | > > > > > > | > | | > | | < < < | 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | ;; (define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries) (if (<= remaining-tries 0) (begin (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) (exit 1)) (let ((host-info (hash-table-ref/default *runremote* run-id #f))) (if host-info (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over host-info and connection result! (car host-info) (cadr host-info)))) (if start-res ;; sucessful login? start-res (if (member remaining-tries '(3 4 6)) (begin ;; login failed (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" host-info) (hash-table-delete! *runremote* run-id) (open-run-close tasks:server-force-clean-run-record tasks:open-db run-id (car host-info) (cadr host-info) " client:setup (host-info=#t)") (thread-sleep! 5) (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) (begin (debug:print 25 "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) (thread-sleep! 
5) (client:setup run-id remaining-tries: (- remaining-tries 1)))))) ;; YUK: rename server-dat here (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id))) (if server-dat (let ((start-res (http-transport:client-connect run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat)))) (if start-res start-res (if (member remaining-tries '(2 5)) (begin ;; login failed (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (hash-table-delete! *runremote* run-id) (open-run-close tasks:server-force-clean-run-record tasks:open-db run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat) " client:setup (server-dat = #t)") (thread-sleep! 2) (server:try-running run-id) (thread-sleep! 10) ;; give server a little time to start up (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) (begin (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (thread-sleep! 5) (client:setup run-id remaining-tries: (- remaining-tries 1)))))) (begin ;; no server registered (if (eq? remaining-tries 2) (begin ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") (client:setup run-id remaining-tries: 10)) (begin (thread-sleep! 2) (debug:print 25 "INFO: client:setup start-res (not defined here), run-id=" run-id ", server-dat=" server-dat) (if (< (open-run-close tasks:num-in-available-state tasks:open-db run-id) 3) (begin ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") (server:try-running run-id))) (thread-sleep! 
10) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1))))))))))) ;; keep this as a function to ease future (define (client:start run-id server-info) (http-transport:client-connect run-id (tasks:hostinfo-get-interface server-info) (tasks:hostinfo-get-port server-info))) |
︙ | ︙ |
Modified common_records.scm from [cae7d63b14] to [df4619fb90].
︙ | ︙ | |||
9 10 11 12 13 14 15 | ;; PURPOSE. ;;====================================================================== (use trace) (define (debug:calc-verbosity vstr) (cond | | > | | | | | | | | 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | ;; PURPOSE. ;;====================================================================== (use trace) (define (debug:calc-verbosity vstr) (cond ((number? vstr) vstr) ((not (string? vstr)) 1) ;; ((string-match "^\\s*$" vstr) 1) (vstr (let ((debugvals (filter number? (map string->number (string-split vstr ","))))) (cond ((> (length debugvals) 1) debugvals) ((> (length debugvals) 0)(car debugvals)) (else 1)))) ((args:get-arg "-v") 2) ((args:get-arg "-q") 0) (else 1))) ;; check verbosity, #t is ok (define (debug:check-verbosity verbosity vstr) (if (not (or (number? verbosity) (list? verbosity))) |
︙ | ︙ |
Modified db.scm from [e88ea07f82] to [00b6a2d48b].
︙ | ︙ | |||
466 467 468 469 470 471 472 | (thread-sleep! (random 120)) (debug:print-info 0 "trying db call one more time....") (apply open-run-close-no-exception-handling proc idb params)) (apply open-run-close-no-exception-handling proc idb params))) ;; (define open-run-close (define open-run-close ;; (if (debug:debug-mode 2) | | | > | 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 | (thread-sleep! (random 120)) (debug:print-info 0 "trying db call one more time....") (apply open-run-close-no-exception-handling proc idb params)) (apply open-run-close-no-exception-handling proc idb params))) ;; (define open-run-close (define open-run-close ;; (if (debug:debug-mode 2) ;; open-run-close-no-exception-handling open-run-close-exception-handling) ;;) (define (db:initialize-main-db db) (let* ((configdat (car *configinfo*)) ;; tut tut, global warning... (keys (keys:config-get-fields configdat)) (havekeys (> (length keys) 0)) (keystr (keys->keystr keys)) (fieldstr (keys->key/field keys))) |
︙ | ︙ |
Modified dcommon.scm from [8f2d34a514] to [6ced1731f9].
︙ | ︙ | |||
474 475 476 477 478 479 480 | ;; (vector-ref server 6) ;; Start time ;; (vector-ref server 7) ;; Priority ;; (vector-ref server 8) ;; State (vector-ref server 8) ;; State (vector-ref server 12) ;; RunId ))) (for-each (lambda (val) | | > > > | | | | 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 | ;; (vector-ref server 6) ;; Start time ;; (vector-ref server 7) ;; Priority ;; (vector-ref server 8) ;; State (vector-ref server 8) ;; State (vector-ref server 12) ;; RunId ))) (for-each (lambda (val) (let* ((row-col (conc rownum ":" colnum)) (curr-val (iup:attribute servers-matrix row-col))) (if (not (equal? (conc val) curr-val)) (begin (iup:attribute-set! servers-matrix row-col val) (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)))) (set! colnum (+ 1 colnum)))) vals) (set! rownum (+ rownum 1))) (iup:attribute-set! servers-matrix "REDRAW" "ALL")) servers))))) (set! colnum 0) (for-each (lambda (colname) (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) |
︙ | ︙ |
Modified http-transport.scm from [c4fe5c65f6] to [80c3efc43e].
︙ | ︙ | |||
143 144 145 146 147 148 149 | (debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port") (thread-sleep! 0.1) ;; get_next_port goes here (http-transport:try-start-server ipaddrstr (+ portnum 1) server-id)) (begin | | | | 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | (debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port") (thread-sleep! 0.1) ;; get_next_port goes here (http-transport:try-start-server ipaddrstr (+ portnum 1) server-id)) (begin (open-run-close tasks:server-force-clean-run-record tasks:open-db run-id ipaddrstr portnum " http-transport:try-start-server") (print "ERROR: Tried and tried but could not start the server")))) ;; any error in following steps will result in a retry (set! *server-info* (list ipaddrstr portnum)) (open-run-close tasks:server-set-interface-port tasks:open-db server-id ipaddrstr portnum) (debug:print 1 "INFO: Trying to start server on " ipaddrstr ":" portnum) ;; This starts the spiffy server ;; NEED WAY TO SET IP TO #f TO BIND ALL (start-server bind-address: ipaddrstr port: portnum) (open-run-close tasks:server-force-clean-run-record tasks:open-db run-id ipaddrstr portnum " http-transport:try-start-server") (debug:print 1 "INFO: server has been stopped"))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== ;;====================================================================== |
︙ | ︙ | |||
269 270 271 272 273 274 275 | ;; connect ;; (define (http-transport:client-connect run-id iface port) (let* ((uri-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/ctrl")))) (uri-api-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/api")))) (serverdat (list iface port uri-dat uri-api-dat)) (login-res (rmt:login-no-auto-client-setup serverdat run-id))) | < > | 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 | ;; connect ;; (define (http-transport:client-connect run-id iface port) (let* ((uri-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/ctrl")))) (uri-api-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/api")))) (serverdat (list iface port uri-dat uri-api-dat)) (login-res (rmt:login-no-auto-client-setup serverdat run-id))) (if (and (list? login-res) (car login-res)) (begin (hash-table-set! *runremote* run-id serverdat) (debug:print-info 2 "Logged in and connected to " iface ":" port) (hash-table-set! *runremote* run-id serverdat) serverdat) (begin (debug:print-info 0 "ERROR: Failed to login or connect to " iface ":" port) #f)))) |
︙ | ︙ |
Modified rmt.scm from [3da7dfaf01] to [24de58d13b].
︙ | ︙ | |||
39 40 41 42 43 44 45 | ;; (define (rmt:send-receive cmd rid params) (let* ((run-id (if rid rid 0)) (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) (if cinfo cinfo (let loop ((numtries 100)) | < > > | | 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | ;; (define (rmt:send-receive cmd rid params) (let* ((run-id (if rid rid 0)) (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) (if cinfo cinfo (let loop ((numtries 100)) (let ((res (client:setup run-id))) (if res (hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully) (if (> numtries 0) (begin (thread-sleep! 10) (loop (- numtries 1))) (begin (debug:print 0 "ERROR: 100 tries and no server, giving up") (exit 1))))))))) (jparams (db:obj->string params)) (res (http-transport:client-api-send-receive run-id connection-info cmd jparams))) (if res (db:string->obj res) ;; (rmt:json-str->dat res) |
︙ | ︙ |
Modified tasks.scm from [4288c524ba] to [b27552d9de].
︙ | ︙ | |||
126 127 128 129 130 131 132 | (set! res num-in-queue)) mdb "SELECT count(id) FROM servers WHERE run_id=? AND state = 'available';" run-id) res)) (define (tasks:server-clean-out-old-records-for-run-id mdb run-id tag) | | | 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | (set! res num-in-queue)) mdb "SELECT count(id) FROM servers WHERE run_id=? AND state = 'available';" run-id) res)) (define (tasks:server-clean-out-old-records-for-run-id mdb run-id tag) (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 50 AND run_id=?;" (conc "defunct" tag) run-id)) (define (tasks:server-force-clean-running-records-for-run-id mdb run-id tag) (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=?;" (conc "defunct" tag) run-id)) (define (tasks:server-force-clean-run-record mdb run-id iface port tag) |
︙ | ︙ |