Changes In Branch v1.60-broken-test1 Through [1b36724a57] Excluding Merge-Ins
This is equivalent to a diff from 19f6ae918c to 1b36724a57
2014-12-10
| ||
08:05 | Cherrypicked nodes f0a3 and 1b36 into v1.60 check-in: 9f5898d48f user: mrwellan tags: v1.60 | |
2014-12-09
| ||
23:43 | Fixed typo Closed-Leaf check-in: 62ccd43251 user: matt tags: v1.60-broken-test1 | |
23:00 | Tweak dashboard updates check-in: 1b36724a57 user: matt tags: v1.60-broken-test1 | |
22:56 | Added some defense to local queries check-in: f0a33deb29 user: matt tags: v1.60-broken-test1 | |
21:00 | Hand-merged refactor of server launch/client start code check-in: e7355f3724 user: matt tags: v1.60-broken-test1 | |
10:51 | Removing mark-incomplete from runs queue processing check-in: 76ef9fc5ad user: mrwellan tags: temp-hacks | |
10:27 | Treat any exceptions when logging into server as a dead server (for now) check-in: 19f6ae918c user: mrwellan tags: v1.60 | |
09:38 | Allow some retries on run queue processing if server died (temporary work-around until the recovery is coded correctly) check-in: 187be74df3 user: mrwellan tags: v1.60 | |
Modified client.scm from [a61d4e6d81] to [e12d06ffcd].
︙ | ︙ | |||
175 176 177 178 179 180 181 | ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id))) (if logininfo (car (vector-ref logininfo 1)) #f)))))) (if (and start-res ping-res) (begin | | | 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id))) (if logininfo (car (vector-ref logininfo 1)) #f)))))) (if (and start-res ping-res) (begin ;; (hash-table-set! *runremote* run-id start-res) (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (case *transport-type* ((http)(http-transport:close-connections run-id))) (hash-table-delete! *runremote* run-id) |
︙ | ︙ | |||
197 198 199 200 201 202 203 | (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered | | | | | > | | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 | (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered ;; (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) ;; (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) ;; (if (< num-available 2) ;; (server:try-running run-id)) (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; (thread-sleep! (+ 2 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1)))))))) ;; keep this as a function to ease future (define (client:start run-id server-info) (http-transport:client-connect (tasks:hostinfo-get-interface server-info) (tasks:hostinfo-get-port server-info))) ;; client:signal-handler |
︙ | ︙ |
Modified dashboard.scm from [6d6a8350b9] to [2960f85268].
︙ | ︙ | |||
1471 1472 1473 1474 1475 1476 1477 | (let* ((modtime (dashboard:get-youngest-run-db-mod-time)) ;; (file-modification-time *db-file-path*)) (monitor-modtime (if (file-exists? *monitor-db-path*) (file-modification-time *monitor-db-path*) -1)) (run-update-time (current-seconds)) (recalc (dashboard:recalc modtime *please-update-buttons* *last-db-update-time*))) (if (and (eq? *current-tab-number* 0) | | > | | 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 | (let* ((modtime (dashboard:get-youngest-run-db-mod-time)) ;; (file-modification-time *db-file-path*)) (monitor-modtime (if (file-exists? *monitor-db-path*) (file-modification-time *monitor-db-path*) -1)) (run-update-time (current-seconds)) (recalc (dashboard:recalc modtime *please-update-buttons* *last-db-update-time*))) (if (and (eq? *current-tab-number* 0) (or (> monitor-modtime *last-monitor-update-time*) (> (- run-update-time *last-monitor-update-time*) 5))) ;; update every 1/2 minute just in case (begin (set! *last-monitor-update-time* run-update-time) ;; monitor-modtime) (if dashboard:update-servers-table (dashboard:update-servers-table)))) (if recalc (begin (case *current-tab-number* ((0) (if dashboard:update-summary-tab (dashboard:update-summary-tab))) ((1) ;; The runs table is active |
︙ | ︙ |
Modified rmt.scm from [c2992f6eb2] to [bd51259062].
︙ | ︙ | |||
26 27 28 29 30 31 32 33 34 35 36 37 | ;; (require-library trace) ;; (import trace) ;; (trace ;; rmt:send-receive ;; api:execute-requests ;; ) ;;====================================================================== ;; S U P P O R T F U N C T I O N S ;;====================================================================== | > > > > | | | | | | | | | 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | ;; (require-library trace) ;; (import trace) ;; (trace ;; rmt:send-receive ;; api:execute-requests ;; ) ;; generate entries for ~/.megatestrc with the following ;; ;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u ;;====================================================================== ;; S U P P O R T F U N C T I O N S ;;====================================================================== ;; NOT USED? ;; ;; (define (rmt:call-transport run-id connection-info cmd jparams) ;; (case (server:get-transport) ;; ((rpc) ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams)) ;; ((http) (http-transport:client-api-send-receive run-id connection-info cmd jparams)) ;; ((fs) ( fs-transport:client-api-send-receive run-id connection-info cmd jparams)) ;; ((zmq) (zmq-transport:client-api-send-receive run-id connection-info cmd jparams)) ;; (else ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams)))) ;; (define (rmt:write-frequency-over-limit? cmd run-id) (and (not (member cmd api:read-only-queries)) (let* ((tmprec (hash-table-ref/default *write-frequency* run-id #f)) (record (if tmprec tmprec (let ((v (vector (current-seconds) 0))) |
︙ | ︙ | |||
61 62 63 64 65 66 67 | (if (and (> count 10) (> queries-per-second 10)) (begin (debug:print-info 1 "db write rate too high, starting a server, count=" count " start=" start " run-id=" run-id " queries-per-second=" queries-per-second) #t) #f)))) | | | | | | | | > > | | > | > > | > > | > > < < < < < < < < < < < < | < < < < | 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | (if (and (> count 10) (> queries-per-second 10)) (begin (debug:print-info 1 "db write rate too high, starting a server, count=" count " start=" start " run-id=" run-id " queries-per-second=" queries-per-second) #t) #f)))) ;; (define (rmt:get-connection-info run-id) ;; (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) ;; (if cinfo ;; cinfo ;; ;; NB// can cache the answer for server running for 10 seconds ... ;; ;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id)) ;; ;; (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) ;; ;; (begin ;; ;; (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id) ;; (client:setup run-id)))) ;; ;; #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id (define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected ;; clean out old connections (mutex-lock! *db-multi-sync-mutex*) (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin (for-each (lambda (run-id) (let ((connection (hash-table-ref/default *runremote* run-id #f))) (if (and (vector? connection) (< (http-transport:server-dat-get-last-access connection) expire-time)) (begin (debug:print-info 0 "Discarding connection to server for run-id " run-id ", too long between accesses") ;; SHOULD CLOSE THE CONNECTION HERE (case *transport-type* ((nmsg)(nn-close (http-transport:server-dat-get-socket (hash-table-ref *runremote* run-id))))) (hash-table-delete! *runremote* run-id))))) (hash-table-keys *runremote*))) (mutex-unlock! *db-multi-sync-mutex*) ;; (mutex-lock! *send-receive-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (hash-table-ref/default *runremote* run-id #f))) ;; (rmt:get-connection-info run-id))) ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) (if connection-info ;; use the server if have connection info (let* ((dat (case *transport-type* ((http)(condition-case (http-transport:client-api-send-receive run-id connection-info cmd params) ((commfail) (tasks:kill-server-run-id run-id) (vector #f "communications fail")) ((exn) (tasks:kill-server-run-id run-id) (vector #f "other fail")))) ((nmsg)(condition-case (nmsg-transport:client-api-send-receive run-id connection-info cmd params) ((timeout)(vector #f "timeout talking to server")))) (else (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info)) (if success (begin ;; (mutex-unlock! *send-receive-mutex*) ;; all is well, return the result! (case *transport-type* ((http) res) ;; (db:string->obj res)) ((nmsg) res))) ;; (vector-ref res 1))) ;; we had a connection but it is borked. clean up and reconnect (begin ;; let ((new-connection-info (client:setup run-id))) (debug:print 0 "WARNING: Communication failed, trying call to rmt:send-receive again.") ;; (case *transport-type* ;; ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info)))) (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))) ;; no connection info? try to start a server (if (and (< attemptnum 15) (member cmd api:write-queries)) (begin (hash-table-delete! *runremote* run-id) ;; (mutex-unlock! *send-receive-mutex*) (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) (hash-table-set! *runremote* run-id (client:setup run-id)) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) (begin (rmt:open-qry-close-locally cmd run-id params) ))))) (define (rmt:update-db-stats run-id rawcmd params duration) (mutex-lock! *db-stats-mutex*) (handle-exceptions exn |
︙ | ︙ | |||
206 207 208 209 210 211 212 | (if (> tot 10) (cons newmax-cmd currmax) (cons 'none 0)) (loop (car tal)(cdr tal) newmax-cmd currmax))))))) (mutex-unlock! *db-stats-mutex*) res)) | | | | | | | > | > > > > > > > > > > | | | | | | | | | | | 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | (if (> tot 10) (cons newmax-cmd currmax) (cons 'none 0)) (loop (car tal)(cdr tal) newmax-cmd currmax))))))) (mutex-unlock! *db-stats-mutex*) res)) (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) (let* ((dbstruct-local (if *dbstruct-db* *dbstruct-db* (let* ((dbdir (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) (db (make-dbr:dbstruct path: dbdir local: #t))) (set! *dbstruct-db* db) db))) (db-file-path (db:dbfile-path 0)) ;; (read-only (not (file-read-access? db-file-path))) (start (current-milliseconds)) (resdat (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))) (success (vector-ref resdat 0)) (res (vector-ref resdat 1)) (duration (- (current-milliseconds) start))) (if (not success) (if (> remretries 0) (begin (debug:print 0 "ERROR: local query failed. Trying again.") (thread-sleep! (/ (random 5000) 1000)) ;; some random delay (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1))) (begin (debug:print 0 "ERROR: too many retries in rmt:open-qry-close-locally, giving up") #f)) (begin (rmt:update-db-stats run-id cmd params duration) ;; mark this run as dirty if this was a write (if (not (member cmd api:read-only-queries)) (let ((start-time (current-seconds))) (mutex-lock! *db-multi-sync-mutex*) ;; (if (not (hash-table-ref/default *db-local-sync* run-id #f)) ;; just set it every time. Is a write more expensive than a read and does it matter? (hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write" (mutex-unlock! *db-multi-sync-mutex*))) res)))) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) ;; (jparams (db:obj->string params)) ;; (rmt:dat->json-str params)) (res (handle-exceptions exn #f |
︙ | ︙ |
Modified runs.scm from [866c85daf6] to [3bd91bccb4].
︙ | ︙ | |||
158 159 160 161 162 163 164 | (if (> (- currtime lasttime) waitval) (begin (hash-table-set! *runs:denoise* key currtime) #t) #f))) (define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) | | | | | 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | (if (> (- currtime lasttime) waitval) (begin (hash-table-set! *runs:denoise* key currtime) #t) #f))) (define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) (thread-sleep! (cond ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while (else 0))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) |
︙ | ︙ |
Modified tasks.scm from [edd9ff6647] to [4936b038b4].
︙ | ︙ | |||
359 360 361 362 363 364 365 | (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) res)) (define (tasks:need-server run-id) | | > | | | | | | | | | | | | | | | | | 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 | (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) res)) (define (tasks:need-server run-id) (configf:lookup *configdat* "server" "required")) ;; (maxqry (cdr (rmt:get-max-query-average run-id))) ;; (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) ;; (cond ;; (forced ;; (if (common:low-noise-print 60 run-id "server required is set") ;; (debug:print-info 0 "Server required is set, starting server for run-id " run-id ".")) ;; #t) ;; ((> maxqry threshold) ;; (if (common:low-noise-print 60 run-id "Max query time execeeded") ;; (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id ".")) ;; #t) ;; (else ;; #f)))) ;; try to start a server and wait for it to be available ;; (define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; ensure a server is running for this run (let loop ((server-running (tasks:server-running? (db:delay-if-busy tdbdat) run-id)) (delay-time 0)) (if (and (not server-running) (< delay-time delay-max-tries)) (begin (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) (debug:print 0 "Try starting server for run-id " run-id)) (thread-sleep! (/ (random 2000) 1000)) (server:kind-run run-id) (thread-sleep! (min delay-time 1)) (loop (tasks:server-running? (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)))))) (define (tasks:get-all-servers mdb) (let ((res '())) (sqlite3:for-each-row (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res))) |
︙ | ︙ |