Overview
Comment: | Many tweaks to improve reliability under stress |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.60 |
Files: | files | file ages | folders |
SHA1: |
0b6b35ab5b13f49ea80ac027def9815d |
User & Date: | mrwellan on 2014-11-25 16:39:28 |
Other Links: | branch diff | manifest | tags |
Context
2014-11-25
| ||
21:10 | Add big delay and take a break when system is clearly overloaded. check-in: ec50f4ac00 user: matt tags: v1.60 | |
16:39 | Many tweaks to improve reliability under stress check-in: 0b6b35ab5b user: mrwellan tags: v1.60 | |
2014-11-24
| ||
12:45 | Fixed import-megatest.db bug check-in: 398c48390d user: mrwellan tags: v1.60, v1.6006 | |
Changes
Modified db.scm from [dec2e76730] to [7251c124d5].
︙ | |||
193 194 195 196 197 198 199 | 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | - + | (if (or rdb do-not-open) rdb (let* ((dbpath (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db")) (dbexists (file-exists? dbpath)) (inmem (if local #f (db:open-inmem-db))) (refdb (if local #f (db:open-inmem-db))) |
︙ | |||
232 233 234 235 236 237 238 239 | 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | + - + + + + | ;; (dbr:dbstruct-set-run-id! dbstruct run-id) (if local (begin (dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ... db) (begin (dbr:dbstruct-set-inmem! dbstruct inmem) (sqlite3:execute db "DELETE FROM tests WHERE state='DELETED';") ;; they just slow us down in this context (db:sync-tables db:sync-tests-only db inmem) |
︙ | |||
290 291 292 293 294 295 296 297 298 299 300 301 302 303 | 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 | + | (inmem (dbr:dbstruct-get-inmem dbstruct)) (maindb (dbr:dbstruct-get-main dbstruct)) (refdb (dbr:dbstruct-get-refdb dbstruct)) (olddb (dbr:dbstruct-get-olddb dbstruct)) ;; (runid (dbr:dbstruct-get-run-id dbstruct)) ) (debug:print-info 4 "Syncing for run-id: " run-id) (mutex-lock! *http-mutex*) (if (eq? run-id 0) ;; runid equal to 0 is main.db (if maindb (if (or (not (number? mtime)) (not (number? stime)) (> mtime stime) force-sync) |
︙ | |||
320 321 322 323 324 325 326 327 | 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | + + + - + | (> mtime stime) force-sync) (begin (db:delay-if-busy rundb) (db:delay-if-busy olddb) (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb))) (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) (mutex-unlock! *http-mutex*) num-synced) (begin (mutex-unlock! *http-mutex*) |
︙ |
Modified http-transport.scm from [d5c0bd2a5f] to [907ced71b2].
︙ | |||
491 492 493 494 495 496 497 | 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 | - + | (debug:print-info 0 "Average non-cached time " (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") |
︙ | |||
537 538 539 540 541 542 543 | 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 | - - - - - - - - + | "-") run-id server-id)) "Server run")) (th3 (make-thread (lambda () (debug:print-info 0 "Server monitor thread started") (http-transport:keep-running server-id run-id)) "Keep running"))) |
︙ |
Modified launch.scm from [e538ecba1a] to [34882953c1].
︙ | |||
97 98 99 100 101 102 103 | 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | + - + + | ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (begin |
︙ |
Modified rmt.scm from [a4cf4136f4] to [4e95475281].
︙ | |||
84 85 86 87 88 89 90 | 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | - - + + - + | (mutex-unlock! *db-multi-sync-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (rmt:get-connection-info run-id)) (jparams (db:obj->string params))) (if connection-info ;; use the server if have connection info (let* ((dat (http-transport:client-api-send-receive run-id connection-info cmd jparams)) |
︙ |
Modified runs.scm from [6ed325fc14] to [7667be3cc8].
︙ | |||
552 553 554 555 556 557 558 | 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 | - + | (runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) reruns))) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) (null? prereq-fails) (null? non-completed)) |
︙ | |||
671 672 673 674 675 676 677 | 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 | + - - - + + + + + + + | #f)) ;; Register tests ;; ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs (let register-loop ((numtries 15)) |
︙ |
Modified tasks.scm from [7c5174d4f3] to [af4bc3dbb1].
︙ | |||
165 166 167 168 169 170 171 | 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | - + | (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) |
︙ | |||
342 343 344 345 346 347 348 349 350 351 352 353 354 355 | 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | + + + + + + + + + | (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) res)) (define (tasks:server-running? mdb run-id) (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) res)) (define (tasks:need-server run-id) (let ((forced (configf:lookup *configdat* "server" "required")) (maxqry (cdr (rmt:get-max-query-average run-id))) (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) (cond (forced |
︙ | |||
403 404 405 406 407 408 409 410 411 412 413 414 415 416 | 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 | + | (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) |
︙ |
Modified tests/fullrun/megatest.config from [79e8e68f6b] to [a6f800861f].
︙ | |||
131 132 133 134 135 136 137 | 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | - + - + + | # If the server can't be started on this port it will try the next port until # it succeeds port 8080 # This server will keep running this number of hours after last access. # Three minutes is 0.05 hours # timeout 0.025 |
︙ |