Overview
Comment: | Merged v1.60 changes into try-nanomsg |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | try-nanomsg |
Files: | files | file ages | folders |
SHA1: |
0b3a6d8aa92b0c1bee6199cd6a43aff8 |
User & Date: | matt on 2014-11-26 08:59:53 |
Other Links: | branch diff | manifest | tags |
Context
2014-11-26
| ||
16:09 | Added debug support in newdashboard check-in: b79afa463f user: matt tags: try-nanomsg | |
08:59 | Merged v1.60 changes into try-nanomsg check-in: 0b3a6d8aa9 user: matt tags: try-nanomsg | |
2014-11-25
| ||
21:10 | Add big delay and take a break when system is clearly overloaded. check-in: ec50f4ac00 user: matt tags: v1.60 | |
2014-11-24
| ||
23:36 | several little fixes check-in: 308a5d65b6 user: matt tags: try-nanomsg | |
Changes
Modified db.scm from [f9ca78a55d] to [75e6e604f1].
︙ | ︙ | |||
193 194 195 196 197 198 199 | (if (or rdb do-not-open) rdb (let* ((dbpath (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db")) (dbexists (file-exists? dbpath)) (inmem (if local #f (db:open-inmem-db))) (refdb (if local #f (db:open-inmem-db))) | | | 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | (if (or rdb do-not-open) rdb (let* ((dbpath (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db")) (dbexists (file-exists? dbpath)) (inmem (if local #f (db:open-inmem-db))) (refdb (if local #f (db:open-inmem-db))) (db (db:lock-create-open dbpath ;; this is the database physically on disk (lambda (db) (handle-exceptions exn (begin (release-dot-lock dbpath) (if (> attemptnum 2) (debug:print 0 "ERROR: tried twice, cannot create/initialize db for run-id " run-id ", at path " dbpath) |
︙ | ︙ | |||
232 233 234 235 236 237 238 239 | ;; (dbr:dbstruct-set-run-id! dbstruct run-id) (if local (begin (dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ... db) (begin (dbr:dbstruct-set-inmem! dbstruct inmem) (db:sync-tables db:sync-tests-only db inmem) | > | > > > | 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | ;; (dbr:dbstruct-set-run-id! dbstruct run-id) (if local (begin (dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ... db) (begin (dbr:dbstruct-set-inmem! dbstruct inmem) (sqlite3:execute db "DELETE FROM tests WHERE state='DELETED';") ;; they just slow us down in this context (db:sync-tables db:sync-tests-only db inmem) (db:delay-if-busy refdb) ;; dbpath: (db:dbdat-get-path refdb)) (dbr:dbstruct-set-refdb! dbstruct refdb) (db:sync-tables db:sync-tests-only db refdb) ;; sync once more to deal with delays (db:sync-tables db:sync-tests-only db inmem) (db:sync-tables db:sync-tests-only db refdb) inmem)))))) ;; This routine creates the db. It is only called if the db is not already ls opened ;; (define (db:open-main dbstruct) ;; (conc *toppath* "/megatest.db") (car *configinfo*))) (let ((mdb (dbr:dbstruct-get-main dbstruct))) (if mdb |
︙ | ︙ | |||
290 291 292 293 294 295 296 297 298 299 300 301 302 303 | (inmem (dbr:dbstruct-get-inmem dbstruct)) (maindb (dbr:dbstruct-get-main dbstruct)) (refdb (dbr:dbstruct-get-refdb dbstruct)) (olddb (dbr:dbstruct-get-olddb dbstruct)) ;; (runid (dbr:dbstruct-get-run-id dbstruct)) ) (debug:print-info 4 "Syncing for run-id: " run-id) (if (eq? run-id 0) ;; runid equal to 0 is main.db (if maindb (if (or (not (number? mtime)) (not (number? stime)) (> mtime stime) force-sync) | > | 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 | (inmem (dbr:dbstruct-get-inmem dbstruct)) (maindb (dbr:dbstruct-get-main dbstruct)) (refdb (dbr:dbstruct-get-refdb dbstruct)) (olddb (dbr:dbstruct-get-olddb dbstruct)) ;; (runid (dbr:dbstruct-get-run-id dbstruct)) ) (debug:print-info 4 "Syncing for run-id: " run-id) (mutex-lock! *http-mutex*) (if (eq? run-id 0) ;; runid equal to 0 is main.db (if maindb (if (or (not (number? mtime)) (not (number? stime)) (> mtime stime) force-sync) |
︙ | ︙ | |||
320 321 322 323 324 325 326 327 | (> mtime stime) force-sync) (begin (db:delay-if-busy rundb) (db:delay-if-busy olddb) (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb))) (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) num-synced) | > > > | | 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | (> mtime stime) force-sync) (begin (db:delay-if-busy rundb) (db:delay-if-busy olddb) (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb))) (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) (mutex-unlock! *http-mutex*) num-synced) (begin (mutex-unlock! *http-mutex*) 0)))))) (define (db:close-main dbstruct) (let ((maindb (dbr:dbstruct-get-main dbstruct))) (if maindb (begin (sqlite3:finalize! (db:dbdat-get-db maindb)) (dbr:dbstruct-set-main! dbstruct #f))))) |
︙ | ︙ | |||
649 650 651 652 653 654 655 | (for-each (lambda (run-id) (db:delay-if-busy mtdb) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)) (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))) (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db") (db:replace-test-records dbstruct run-id testrecs) | | | | 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 | (for-each (lambda (run-id) (db:delay-if-busy mtdb) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)) (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))) (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db") (db:replace-test-records dbstruct run-id testrecs) (sqlite3:finalize! (db:dbdat-get-db (dbr:dbstruct-get-rundb dbstruct))))) run-ids))) ;; now ensure all newdb data are synced to megatest.db (if (member 'new2old options) (for-each (lambda (run-id) (let* ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)) (frundb (db:dbdat-get-db (db:get-db fromdb run-id)))) ;; (db:delay-if-busy frundb) ;; (db:delay-if-busy mtdb) (if (eq? run-id 0) (db:sync-tables (db:sync-main-list dbstruct) (db:get-db fromdb #f) mtdb) (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb)))) (cons 0 run-ids))) ;; (db:close-all dbstruct) ;; (sqlite3:finalize! mdb) )) ;; keeping it around for debugging purposes only (define (open-run-close-no-exception-handling proc idb . params) (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params) |
︙ | ︙ | |||
1942 1943 1944 1945 1946 1947 1948 | (define db:test-record-qry-selector (string-intersperse db:test-record-fields ",")) ;; NOTE: Use db:test-get* to access records ;; NOTE: This needs rundir decoding? Decide, decode here or where used? For the moment decode where used. (define (db:get-all-tests-info-by-run-id dbstruct run-id) | > | > | | | 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 | (define db:test-record-qry-selector (string-intersperse db:test-record-fields ",")) ;; NOTE: Use db:test-get* to access records ;; NOTE: This needs rundir decoding? Decide, decode here or where used? For the moment decode where used. (define (db:get-all-tests-info-by-run-id dbstruct run-id) (let* ((dbdat (if (vector? dbstruct) (db:get-db dbstruct run-id) dbstruct)) ;; still settling on when to use dbstruct or dbdat (db (db:dbdat-get-db dbdat)) (res '())) (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (id run-id testname state status event-time host cpuload diskfree uname rundir item-path run-duration final-logf comment shortdir attemptnum) ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 (set! res (cons (vector id run-id testname state status event-time host cpuload diskfree uname rundir item-path run-duration final-logf comment shortdir attemptnum) res))) db |
︙ | ︙ |
Modified http-transport.scm from [8c622aa861] to [445c83d4cb].
︙ | ︙ | |||
486 487 488 489 490 491 492 | (debug:print-info 0 "Average non-cached time " (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") | | | 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 | (debug:print-info 0 "Average non-cached time " (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete") (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) |
︙ | ︙ | |||
532 533 534 535 536 537 538 | "-") run-id server-id)) "Server run")) (th3 (make-thread (lambda () (debug:print-info 0 "Server monitor thread started") (http-transport:keep-running server-id run-id)) "Keep running"))) | < < < < < < < < > | 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 | "-") run-id server-id)) "Server run")) (th3 (make-thread (lambda () (debug:print-info 0 "Server monitor thread started") (http-transport:keep-running server-id run-id)) "Keep running"))) (thread-start! th2) (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. (thread-start! th3) (set! *didsomething* #t) (thread-join! th2) (exit)))))) (define (http-transport:server-signal-handler signum) (signal-mask! signum) |
︙ | ︙ |
Modified launch.scm from [e538ecba1a] to [34882953c1].
︙ | ︙ | |||
97 98 99 100 101 102 103 | ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") | > | > | 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (begin (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... (if (not (launch:setup-for-run force: #t)) (begin |
︙ | ︙ |
Modified rmt.scm from [e6ea17e36c] to [2b271743b3].
︙ | ︙ | |||
87 88 89 90 91 92 93 | ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) (if connection-info ;; use the server if have connection info (let* ((dat (case *transport-type* ((http)(http-transport:client-api-send-receive run-id connection-info cmd jparams)) ((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info cmd params)) (else (exit)))) | | | | | 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) (if connection-info ;; use the server if have connection info (let* ((dat (case *transport-type* ((http)(http-transport:client-api-send-receive run-id connection-info cmd jparams)) ((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info cmd params)) (else (exit)))) (res (if (and dat (vector? dat)) (vector-ref dat 1) #f)) (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))) (http-transport:server-dat-update-last-access connection-info) (if success (case *transport-type* ((http)(db:string->obj res)) ((nmsg) res)) (begin ;; let ((new-connection-info (client:setup run-id))) (debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.") (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection ;; no longer killing the server in http-transport:client-api-send-receive ;; may kill it here but what are the criteria? ;; start with three calls then kill server (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) (thread-sleep! 2) (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))) (if (and (< attemptnum 10) (tasks:need-server run-id)) (begin (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) (rmt:send-receive cmd rid params (+ attemptnum 1))) (rmt:open-qry-close-locally cmd run-id params))))) |
︙ | ︙ |
Modified runs.scm from [3205d6c7db] to [f2976f1213].
︙ | ︙ | |||
552 553 554 555 556 557 558 | (runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) reruns))) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) (null? prereq-fails) (null? non-completed)) | | > > | 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 | (runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) reruns))) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) (null? prereq-fails) (null? non-completed)) (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; getting here likely means the system is way overloaded, kill a full minute before continuing (thread-sleep! 60) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while."))) |
︙ | ︙ | |||
671 672 673 674 675 676 677 | #f)) ;; Register tests ;; ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs | > | > | | > > > | 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 | #f)) ;; Register tests ;; ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs (let register-loop ((numtries 15)) (rmt:general-call 'register-test run-id run-id test-name item-path) (thread-sleep! 0.5) (if (rmt:get-test-id run-id test-name item-path) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) (if (> numtries 0) (register-loop (- numtries 1)) (debug:print 0 "ERROR: failed to register test " (runs:make-full-test-name test-name item-path))))) (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) (begin (rmt:general-call 'register-test run-id run-id test-name "") (if (rmt:get-test-id run-id test-name "") (hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done)))) (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) (if (and (null? tal)(null? reg)) |
︙ | ︙ |
Modified tasks.scm from [84082aa618] to [42971aa928].
︙ | ︙ | |||
165 166 167 168 169 170 171 | (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) | | | 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id) (sqlite3:execute mdb |
︙ | ︙ | |||
342 343 344 345 346 347 348 349 350 351 352 353 354 355 | (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) res)) (define (tasks:need-server run-id) (let ((forced (configf:lookup *configdat* "server" "required")) (maxqry (cdr (rmt:get-max-query-average run-id))) (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) (cond (forced | > > > > > > > > > | 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) res)) (define (tasks:server-running? mdb run-id) (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) res)) (define (tasks:need-server run-id) (let ((forced (configf:lookup *configdat* "server" "required")) (maxqry (cdr (rmt:get-max-query-average run-id))) (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) (cond (forced |
︙ | ︙ | |||
403 404 405 406 407 408 409 410 411 412 413 414 415 416 | (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) | > | 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 | (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) |
︙ | ︙ |
Modified tests/fullrun/megatest.config from [558689506a] to [7518de8980].
︙ | ︙ | |||
131 132 133 134 135 136 137 | # If the server can't be started on this port it will try the next port until # it succeeds port 8080 # This server will keep running this number of hours after last access. # Three minutes is 0.05 hours # timeout 0.025 | | | 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | # If the server can't be started on this port it will try the next port until # it succeeds port 8080 # This server will keep running this number of hours after last access. # Three minutes is 0.05 hours # timeout 0.025 timeout 0.1 # Server is required - slower but more resistant to Sqlite issues. required yes # Start server when average query takes longer than this # server-query-threshold 55500 server-query-threshold -1 |
︙ | ︙ |