Overview
Comment: | Run launch:setup in launch:execute - needed now due to refactoring. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.62-no-rpc |
Files: | files | file ages | folders |
SHA1: |
b19499a3eb4e0b2d9499cbbcfc5e94cf |
User & Date: | matt on 2016-11-27 11:58:42 |
Other Links: | branch diff | manifest | tags |
Context
2016-11-27
| ||
13:08 | Fixed runs cleanup where not constrained by run_id. Added missing mutex-unlock before recursively calling homehost. check-in: 8956d8d873 user: matt tags: v1.62-no-rpc | |
11:58 | Run launch:setup in launch:execute - needed now due to refactoring. check-in: b19499a3eb user: matt tags: v1.62-no-rpc | |
08:25 | Added mutex for homehost calculation check-in: 8d4f021975 user: matt tags: v1.62-no-rpc | |
Changes
Modified common.scm from [a2d76e3385] to [046ea6668a].
︙ | ︙ | |||
129 130 131 132 133 134 135 | (define *test-paths* (make-hash-table)) ;; cache test-id to test run paths here (define *test-ids* (make-hash-table)) ;; cache run-id, testname, and item-path => test-id (define *test-info* (make-hash-table)) ;; cache the test info records, update the state, status, run_duration etc. from testdat.db (define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) | | > | 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 | (define *test-paths* (make-hash-table)) ;; cache test-id to test run paths here (define *test-ids* (make-hash-table)) ;; cache run-id, testname, and item-path => test-id (define *test-info* (make-hash-table)) ;; cache the test info records, update the state, status, run_duration etc. from testdat.db (define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) ;; cache environment vars for each run here (define *env-vars-by-run-id* (make-hash-table)) ;; Testconfig and runconfig caches. (define *testconfigs* (make-hash-table)) ;; test-name => testconfig (define *runconfigs* (make-hash-table)) ;; target => runconfig ;; This is a cache of pre-reqs met, don't re-calc in cases where called with same params less than |
︙ | ︙ | |||
513 514 515 516 517 518 519 | ;;====================================================================== ;; E X I T H A N D L I N G ;;====================================================================== (define (common:run-sync?) (let ((ohh (common:on-homehost?)) (srv (args:get-arg "-server"))) | | | 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 | ;;====================================================================== ;; E X I T H A N D L I N G ;;====================================================================== (define (common:run-sync?) (let ((ohh (common:on-homehost?)) (srv (args:get-arg "-server"))) ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) (and (common:on-homehost?) (args:get-arg "-server")))) ;;;; run-ids ;; if #f use *db-local-sync* : or 'local-sync-flags ;; if #t use timestamps : or 'timestamps (define (common:sync-to-megatest.db dbstruct) |
︙ | ︙ |
Modified http-transport.scm from [1ecd29c566] to [a93ed3a08e].
︙ | ︙ | |||
369 370 371 372 373 374 375 | (debug:print-info 0 *default-log-port* "Waiting for server alive signature") (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) | > > | | > | > | < < < < | | | | | | | | | > | | > | | | | > > > > > > > > | | 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 | (debug:print-info 0 *default-log-port* "Waiting for server alive signature") (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) (begin (debug:print-info 0 *default-log-port* "Received server alive signature") sdat) (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes (begin (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server " server-id " for run " run-id) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature") (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) (server-timeout (server:get-timeout)) (server-going #f)) (let loop ((count 0) (server-state 'available) (bad-sync-count 0) (start-time (current-milliseconds))) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-db* ;; Removed code is pasted below (keeping it around until we are clear it is not needed). ;; no *dbstruct-db* yet, set running after our first pass through and start the db (if (eq? server-state 'available) (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers (if (equal? new-server-id server-id) (begin (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") (server:write-dotserver *toppath* (conc iface ":" port)) (delete-file* (conc *toppath* "/.starting-server"))) (begin ;; gotta exit nicely (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") (http-transport:server-shutdown server-id port)))))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) (if (and (<= rem-time 4) (> rem-time 0)) (thread-sleep! rem-time))) (if (< count 1) ;; 3x3 = 9 secs aprox (loop (+ count 1) 'running bad-sync-count (current-milliseconds))) ;; Check that iface and port have not changed (can happen if server port collides) (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (or (not (equal? sdat (list iface port))) |
︙ | ︙ | |||
455 456 457 458 459 460 461 | ;; ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; | | | 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 | ;; ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state bad-sync-count (current-milliseconds))) (http-transport:server-shutdown server-id port)))))) ;; code cut out from above ;; ;; (condition-case ;; ;; (if (and (member (mutex-state *db-sync-mutex*) '(abandoned not-abandoned)) ;; ;; (> (- (current-seconds) *db-last-sync*) 5)) ;; if not currently being synced nor recently synced |
︙ | ︙ | |||
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 | (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) (let* ((tdbdat (tasks:open-db))) (set! *run-id* run-id) (if (args:get-arg "-daemonize") (begin (daemon:ize) (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it (begin (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") | > > > > | > > > | 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 | (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) (with-output-to-file (conc *toppath* "/.starting-server") (lambda () (print (current-process-id) " on " (get-host-name)))) (let* ((tdbdat (tasks:open-db))) (set! *run-id* run-id) (if (args:get-arg "-daemonize") (begin (daemon:ize) (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it (begin (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0)) (begin ;; ok, no server detected, clean out any lingering records (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin (thread-sleep! 2) (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") (delete-file* (conc *toppath* "/.starting-server")) )) (let* ((th2 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server run thread started") (http-transport:run (if (args:get-arg "-server") (args:get-arg "-server") "-") |
︙ | ︙ |
Modified launch.scm from [df256f9067] to [4e784cfd15].
︙ | ︙ | |||
386 387 388 389 390 391 392 | (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) | | | | 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 | (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed) ;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1)) (let* ((testpath (assoc/default 'testpath cmdinfo)) ;; testpath is the test spec area (top-path (assoc/default 'toppath cmdinfo)) (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area (test-name (assoc/default 'test-name cmdinfo)) |
︙ | ︙ | |||
434 435 436 437 438 439 440 | (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) (begin (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found") (thread-sleep! 10) (loop (+ count 1))))) | | > | 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 | (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) (begin (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found") (thread-sleep! 10) (loop (+ count 1))))) (launch:setup) ;; should be properly in the top-path now (set! tconfigreg (tests:get-all)) (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) (set! *time-to-exit* #t) (print "Received signal " signum ", cleaning up before exit. Please wait...") (let ((th1 (make-thread (lambda () |
︙ | ︙ | |||
701 702 703 704 705 706 707 | ;; side effects: ;; sets; *configdat* (megatest.config info) ;; *runconfigdat* (runconfigs.config info) ;; *configstatus* (status of the read data) ;; (define (launch:setup #!key (force #f)) (mutex-lock! *launch-setup-mutex*) | | > > | 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 | ;; side effects: ;; sets; *configdat* (megatest.config info) ;; *runconfigdat* (runconfigs.config info) ;; *configstatus* (status of the read data) ;; (define (launch:setup #!key (force #f)) (mutex-lock! *launch-setup-mutex*) (if (and *toppath* (eq? *configstatus* 'fulldata)) ;; got it all (begin (debug:print 0 *default-log-port* "NOTE: skipping launch:setup-body call since we have fulldata") (mutex-unlock! *launch-setup-mutex*) *toppath*) (let ((res (launch:setup-body force: force))) (mutex-unlock! *launch-setup-mutex*) res))) (define (launch:setup-body #!key (force #f)) |
︙ | ︙ |
Modified megatest.scm from [a8a0fb352a] to [60075c013d].
︙ | ︙ | |||
695 696 697 698 699 700 701 | ;;====================================================================== (if (args:get-arg "-server") ;; Server? Start up here. ;; (let ((tl (launch:setup)) | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 | ;;====================================================================== (if (args:get-arg "-server") ;; Server? Start up here. ;; (let ((tl (launch:setup)) ;; (run-id (and (args:get-arg "-run-id") ;; (string->number (args:get-arg "-run-id")))) (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) ;; (if run-id ;; (begin (server:launch 0 transport-type) (set! *didsomething* #t))) ;; ;; (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) ;; ;; ;; Not a server? This section will decide how to communicate ;; ;; ;; ;; Setup client for all expect listed here ;; (if (null? (lset-intersection ;; equal? ;; (hash-table-keys args:arg-hash) ;; '("-list-servers" ;; "-stop-server" ;; "-kill-server" ;; "-show-cmdinfo" ;; "-list-runs" ;; "-ping"))) ;; (if (launch:setup) ;; (let ((run-id (and (args:get-arg "-run-id") ;; (string->number (args:get-arg "-run-id"))))) ;; ;; (set! *fdb* (filedb:open-db (conc *toppath* "/db/paths.db"))) ;; ;; if not list or kill then start a client (if appropriate) ;; (if (or (args-defined? "-h" "-version" "-create-megatest-area" "-create-test") ;; (eq? (length (hash-table-keys args:arg-hash)) 0)) ;; (debug:print-info 1 *default-log-port* "Server connection not needed") ;; (begin ;; ;; (if run-id ;; ;; (client:launch run-id) ;; ;; (client:launch 0) ;; without run-id we'll start a server for "0" ;; #t ;; )))))) (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) (if tl (let* ((tdbdat (tasks:open-db)) |
︙ | ︙ |
Modified rmt.scm from [80d13e1dc6] to [1582aa56da].
︙ | ︙ | |||
57 58 59 60 61 62 63 | ;; clean out old connections ;; (mutex-lock! *db-multi-sync-mutex*) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in *runremote* ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; | | > | | 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 | ;; clean out old connections ;; (mutex-lock! *db-multi-sync-mutex*) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in *runremote* ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; (if (and ;; #f ;; FORCE NO GO FOR RIGHT NOW (not *runremote*) ;; we trust *runremote* to reflect that a server was found previously (not (member cmd api:read-only-queries))) ;; we don't trust so much the list of write queries (let ((serverconn (server:check-if-running *toppath*))) (if serverconn (set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed (if (not (server:start-attempted? *toppath*)) (server:kind-run *toppath*))))) (rmt:open-qry-close-locally cmd (if rid rid 0) params)) ;; (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin ;; (for-each ;; (lambda (run-id) ;; (let ((connection (hash-table-ref/default *runremote* run-id #f))) |
︙ | ︙ |
Modified server.scm from [185590173a] to [15a5983a03].
︙ | ︙ | |||
105 106 107 108 109 110 111 | ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is ignored for now. (let* ((curr-host (get-host-name)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) | | > | | | 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is ignored for now. (let* ((curr-host (get-host-name)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) (logfile (conc *toppath* "/logs/server.log")) (cmdln (conc (common:get-megatest-exe) " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") (conc " -daemonize -log " logfile) "") " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there (push-directory *toppath*) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) ;; host.domain.tld match host? (if (and target-host ;; look at target host, is it host.domain.tld or ip address and does it ;; match current ip or hostname (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) |
︙ | ︙ | |||
155 156 157 158 159 160 161 | (> (- (current-seconds) last-run-time) 30)) (begin (server:run areapath) (hash-table-set! *server-kind-run* areapath (current-seconds)))))) ;; The generic run a server command. Dispatches the call to server 0 if run-id != 0 ;; | | | | | > > > > > > > > | 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | (> (- (current-seconds) last-run-time) 30)) (begin (server:run areapath) (hash-table-set! *server-kind-run* areapath (current-seconds)))))) ;; The generic run a server command. Dispatches the call to server 0 if run-id != 0 ;; ;; (define (server:try-running run-id) ;; (if (eq? run-id 0) ;; (server:run run-id) ;; (rmt:start-server run-id))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. (define (server:start-attempted? areapath) (let ((flagfile (conc areapath "/.starting-server"))) (and (file-exists? flagfile) (< (- (current-seconds) (file-modification-time flagfile)) 15)))) ;; exists and less than 15 seconds old (define (server:read-dotserver areapath) (let ((dotfile (conc areapath "/.server"))) (if (and (file-exists? dotfile) (file-read-access? dotfile)) (with-input-from-file dotfile (lambda () |
︙ | ︙ |