Overview
Comment: | Many tweaks to improve reliability under stress |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.60 |
Files: | files | file ages | folders |
SHA1: |
0b6b35ab5b13f49ea80ac027def9815d |
User & Date: | mrwellan on 2014-11-25 16:39:28 |
Other Links: | branch diff | manifest | tags |
Context
2014-11-25
| ||
21:10 | Add big delay and take a break when system is clearly overloaded. check-in: ec50f4ac00 user: matt tags: v1.60 | |
16:39 | Many tweaks to improve reliability under stress check-in: 0b6b35ab5b user: mrwellan tags: v1.60 | |
2014-11-24
| ||
12:45 | Fixed import-megatest.db bug check-in: 398c48390d user: mrwellan tags: v1.60, v1.6006 | |
Changes
Modified db.scm from [dec2e76730] to [7251c124d5].
︙ | ︙ | |||
193 194 195 196 197 198 199 | (if (or rdb do-not-open) rdb (let* ((dbpath (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db")) (dbexists (file-exists? dbpath)) (inmem (if local #f (db:open-inmem-db))) (refdb (if local #f (db:open-inmem-db))) | | | 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | (if (or rdb do-not-open) rdb (let* ((dbpath (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db")) (dbexists (file-exists? dbpath)) (inmem (if local #f (db:open-inmem-db))) (refdb (if local #f (db:open-inmem-db))) (db (db:lock-create-open dbpath ;; this is the database physically on disk (lambda (db) (handle-exceptions exn (begin (release-dot-lock dbpath) (if (> attemptnum 2) (debug:print 0 "ERROR: tried twice, cannot create/initialize db for run-id " run-id ", at path " dbpath) |
︙ | ︙ | |||
232 233 234 235 236 237 238 239 | ;; (dbr:dbstruct-set-run-id! dbstruct run-id) (if local (begin (dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ... db) (begin (dbr:dbstruct-set-inmem! dbstruct inmem) (db:sync-tables db:sync-tests-only db inmem) | > | > > > | 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | ;; (dbr:dbstruct-set-run-id! dbstruct run-id) (if local (begin (dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ... db) (begin (dbr:dbstruct-set-inmem! dbstruct inmem) (sqlite3:execute db "DELETE FROM tests WHERE state='DELETED';") ;; they just slow us down in this context (db:sync-tables db:sync-tests-only db inmem) (db:delay-if-busy refdb) ;; dbpath: (db:dbdat-get-path refdb)) (dbr:dbstruct-set-refdb! dbstruct refdb) (db:sync-tables db:sync-tests-only db refdb) ;; sync once more to deal with delays (db:sync-tables db:sync-tests-only db inmem) (db:sync-tables db:sync-tests-only db refdb) inmem)))))) ;; This routine creates the db. It is only called if the db is not already ls opened ;; (define (db:open-main dbstruct) ;; (conc *toppath* "/megatest.db") (car *configinfo*))) (let ((mdb (dbr:dbstruct-get-main dbstruct))) (if mdb |
︙ | ︙ | |||
290 291 292 293 294 295 296 297 298 299 300 301 302 303 | (inmem (dbr:dbstruct-get-inmem dbstruct)) (maindb (dbr:dbstruct-get-main dbstruct)) (refdb (dbr:dbstruct-get-refdb dbstruct)) (olddb (dbr:dbstruct-get-olddb dbstruct)) ;; (runid (dbr:dbstruct-get-run-id dbstruct)) ) (debug:print-info 4 "Syncing for run-id: " run-id) (if (eq? run-id 0) ;; runid equal to 0 is main.db (if maindb (if (or (not (number? mtime)) (not (number? stime)) (> mtime stime) force-sync) | > | 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 | (inmem (dbr:dbstruct-get-inmem dbstruct)) (maindb (dbr:dbstruct-get-main dbstruct)) (refdb (dbr:dbstruct-get-refdb dbstruct)) (olddb (dbr:dbstruct-get-olddb dbstruct)) ;; (runid (dbr:dbstruct-get-run-id dbstruct)) ) (debug:print-info 4 "Syncing for run-id: " run-id) (mutex-lock! *http-mutex*) (if (eq? run-id 0) ;; runid equal to 0 is main.db (if maindb (if (or (not (number? mtime)) (not (number? stime)) (> mtime stime) force-sync) |
︙ | ︙ | |||
320 321 322 323 324 325 326 327 | (> mtime stime) force-sync) (begin (db:delay-if-busy rundb) (db:delay-if-busy olddb) (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb))) (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) num-synced) | > > > | | 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | (> mtime stime) force-sync) (begin (db:delay-if-busy rundb) (db:delay-if-busy olddb) (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb))) (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) (mutex-unlock! *http-mutex*) num-synced) (begin (mutex-unlock! *http-mutex*) 0)))))) (define (db:close-main dbstruct) (let ((maindb (dbr:dbstruct-get-main dbstruct))) (if maindb (begin (sqlite3:finalize! (db:dbdat-get-db maindb)) (dbr:dbstruct-set-main! dbstruct #f))))) |
︙ | ︙ |
Modified http-transport.scm from [d5c0bd2a5f] to [907ced71b2].
︙ | ︙ | |||
491 492 493 494 495 496 497 | (debug:print-info 0 "Average non-cached time " (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") | | | 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 | (debug:print-info 0 "Average non-cached time " (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete") (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) |
︙ | ︙ | |||
537 538 539 540 541 542 543 | "-") run-id server-id)) "Server run")) (th3 (make-thread (lambda () (debug:print-info 0 "Server monitor thread started") (http-transport:keep-running server-id run-id)) "Keep running"))) | < < < < < < < < > | 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 | "-") run-id server-id)) "Server run")) (th3 (make-thread (lambda () (debug:print-info 0 "Server monitor thread started") (http-transport:keep-running server-id run-id)) "Keep running"))) (thread-start! th2) (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. (thread-start! th3) (set! *didsomething* #t) (thread-join! th2) (exit)))))) (define (http-transport:server-signal-handler signum) (signal-mask! signum) |
︙ | ︙ |
Modified launch.scm from [e538ecba1a] to [34882953c1].
︙ | ︙ | |||
97 98 99 100 101 102 103 | ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") | > | > | 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (begin (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... (if (not (launch:setup-for-run force: #t)) (begin |
︙ | ︙ |
Modified rmt.scm from [a4cf4136f4] to [4e95475281].
︙ | ︙ | |||
84 85 86 87 88 89 90 | (mutex-unlock! *db-multi-sync-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (rmt:get-connection-info run-id)) (jparams (db:obj->string params))) (if connection-info ;; use the server if have connection info (let* ((dat (http-transport:client-api-send-receive run-id connection-info cmd jparams)) | | | | | 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | (mutex-unlock! *db-multi-sync-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (rmt:get-connection-info run-id)) (jparams (db:obj->string params))) (if connection-info ;; use the server if have connection info (let* ((dat (http-transport:client-api-send-receive run-id connection-info cmd jparams)) (res (if (and dat (vector? dat)) (vector-ref dat 1) #f)) (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))) (http-transport:server-dat-update-last-access connection-info) (if success (db:string->obj res) ;; (if (< attemptnum 100) ;; (begin ;; (hash-table-delete! *runremote* run-id) ;; (thread-sleep! 0.5) ;; (rmt:send-receive cmd rid params attempnum: (+ attemptnum 1))) ;; (begin ;; (print-call-chain (current-error-port)) ;; (debug:print 0 "ERROR: too many attempts to communicate have failed. Giving up. Kill your mtest processes and start over") ;; (exit 1))))) (begin ;; let ((new-connection-info (client:setup run-id))) (debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.") (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection ;; no longer killing the server in http-transport:client-api-send-receive ;; may kill it here but what are the criteria? ;; start with three calls then kill server (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) (thread-sleep! 2) (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))) (if (and (< attemptnum 10) (tasks:need-server run-id)) (begin (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) (rmt:send-receive cmd rid params (+ attemptnum 1))) (rmt:open-qry-close-locally cmd run-id params))))) |
︙ | ︙ |
Modified runs.scm from [6ed325fc14] to [7667be3cc8].
︙ | ︙ | |||
552 553 554 555 556 557 558 | (runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) reruns))) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) (null? prereq-fails) (null? non-completed)) | | | 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 | (runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) reruns))) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) (null? prereq-fails) (null? non-completed)) (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin |
︙ | ︙ | |||
671 672 673 674 675 676 677 | #f)) ;; Register tests ;; ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs | > | > | | > > > | 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 | #f)) ;; Register tests ;; ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs (let register-loop ((numtries 15)) (rmt:general-call 'register-test run-id run-id test-name item-path) (thread-sleep! 0.5) (if (rmt:get-test-id run-id test-name item-path) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) (if (> numtries 0) (register-loop (- numtries 1)) (debug:print 0 "ERROR: failed to register test " (runs:make-full-test-name test-name item-path))))) (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) (begin (rmt:general-call 'register-test run-id run-id test-name "") (if (rmt:get-test-id run-id test-name "") (hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done)))) (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) (if (and (null? tal)(null? reg)) |
︙ | ︙ |
Modified tasks.scm from [7c5174d4f3] to [af4bc3dbb1].
︙ | ︙ | |||
165 166 167 168 169 170 171 | (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) | | | 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id) (sqlite3:execute mdb |
︙ | ︙ | |||
342 343 344 345 346 347 348 349 350 351 352 353 354 355 | (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) res)) (define (tasks:need-server run-id) (let ((forced (configf:lookup *configdat* "server" "required")) (maxqry (cdr (rmt:get-max-query-average run-id))) (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) (cond (forced | > > > > > > > > > | 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) res)) (define (tasks:server-running? mdb run-id) (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) res)) (define (tasks:need-server run-id) (let ((forced (configf:lookup *configdat* "server" "required")) (maxqry (cdr (rmt:get-max-query-average run-id))) (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) (cond (forced |
︙ | ︙ | |||
403 404 405 406 407 408 409 410 411 412 413 414 415 416 | (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) | > | 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 | (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) |
︙ | ︙ |
Modified tests/fullrun/megatest.config from [79e8e68f6b] to [a6f800861f].
︙ | ︙ | |||
131 132 133 134 135 136 137 | # If the server can't be started on this port it will try the next port until # it succeeds port 8080 # This server will keep running this number of hours after last access. # Three minutes is 0.05 hours # timeout 0.025 | | | > | 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | # If the server can't be started on this port it will try the next port until # it succeeds port 8080 # This server will keep running this number of hours after last access. # Three minutes is 0.05 hours # timeout 0.025 timeout 0.1 # Server is required - slower but more resistant to Sqlite issues. # required yes # Start server when average query takes longer than this server-query-threshold 100 # 55500 # daemonize yes # hostname #{scheme (get-host-name)} ## disks are: ## name host:/path/to/area ## -or- |
︙ | ︙ |