Overview
Comment: | Improved lock out on running tests. Added weak work-around for issue with preclean |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.60 |
Files: | files | file ages | folders |
SHA1: |
ef4d6581250ee9656b764ea2ac028384 |
User & Date: | matt on 2014-10-13 01:09:51 |
Other Links: | branch diff | manifest | tags |
Context
2014-10-13
| ||
10:43 | Wrapped db init with transaction. Dunno why I didn't think of that a long time ago. check-in: d494a8975c user: mrwellan tags: v1.60 | |
01:09 | Improved lock out on running tests. Added weak work-around for issue with preclean check-in: ef4d658125 user: matt tags: v1.60 | |
2014-10-10
| ||
10:49 | bumped version check-in: 69da7e37ab user: mrwellan tags: v1.60 | |
Changes
Modified http-transport.scm from [6063d55ba0] to [6e6db43f0a].
︙ | ︙ | |||
413 414 415 416 417 418 419 420 421 422 423 424 425 426 | (mutex-lock! *heartbeat-mutex*) (set! last-access *last-db-access*) (mutex-unlock! *heartbeat-mutex*) ;; (debug:print 11 "last-access=" last-access ", server-timeout=" server-timeout) ;; ;; no_traffic, no running tests, if server 0, no running servers ;; (if (and *server-run* (or (> (+ last-access server-timeout) (current-seconds)) (and (eq? run-id 0) (> (tasks:num-servers-non-zero-running tdb) 0)) (and (not (eq? run-id 0)) ;; only makes sense in non-zero run-id servers | > > | 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 | (mutex-lock! *heartbeat-mutex*) (set! last-access *last-db-access*) (mutex-unlock! *heartbeat-mutex*) ;; (debug:print 11 "last-access=" last-access ", server-timeout=" server-timeout) ;; ;; no_traffic, no running tests, if server 0, no running servers ;; ;; (let ((wait-on-running (configf:lookup *configdat* "server" "wait-on-running"))) ;; wait on running tasks (if not true then exit on time out) ;; (if (and *server-run* (or (> (+ last-access server-timeout) (current-seconds)) (and (eq? run-id 0) (> (tasks:num-servers-non-zero-running tdb) 0)) (and (not (eq? run-id 0)) ;; only makes sense in non-zero run-id servers |
︙ | ︙ |
Modified launch.scm from [8269db9adc] to [15a97345bd].
︙ | ︙ | |||
89 90 91 92 93 94 95 | (if (and (file-exists? fulln) (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path (rollup-status 0)) (change-directory top-path) | | | | | | | | | | 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | (if (and (file-exists? fulln) (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path (rollup-status 0)) (change-directory top-path) ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed"))) (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... (if (not (launch:setup-for-run force: #t)) (begin (debug:print 0 "Failed to setup, exiting") |
︙ | ︙ | |||
171 172 173 174 175 176 177 | (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) ;; (tests:set-full-meta-info test-id run-id 0 work-area) (tests:set-full-meta-info #f test-id run-id 0 work-area 10) | < < | 171 172 173 174 175 176 177 178 179 180 181 182 183 184 | (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) ;; (tests:set-full-meta-info test-id run-id 0 work-area) (tests:set-full-meta-info #f test-id run-id 0 work-area 10) (thread-sleep! 0.3) ;; NFS slowness has caused grief here (if (args:get-arg "-xterm") (set! fullrunscript "xterm") (if (and fullrunscript (not (file-execute-access? fullrunscript))) (system (conc "chmod ug+x " fullrunscript)))) ;; We are about to actually kick off the test |
︙ | ︙ | |||
522 523 524 525 526 527 528 | ;; ;; <linkdir> - <target> - <testname> [ - <itempath> ] ;; ;; All log file links should be stored relative to the top of link path ;; ;; <target> - <testname> [ - <itempath> ] ;; | | | 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 | ;; ;; <linkdir> - <target> - <testname> [ - <itempath> ] ;; ;; All log file links should be stored relative to the top of link path ;; ;; <target> - <testname> [ - <itempath> ] ;; (define (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat #!key (remtries 2)) (let* ((item-path (item-list->path itemdat)) (runname (db:get-value-by-header (db:get-rows run-info) (db:get-header run-info) "runname")) ;; convert back to db: from rdb: - this is always run at server end (target (string-intersperse (map cadr keyvals) "/")) |
︙ | ︙ | |||
544 545 546 547 548 549 550 | (toptest-path (conc disk-path "/" testtop-base)) (test-path (conc disk-path "/" test-base)) ;; ensure this exists first as links to subtests must be created there (linktree (let ((rd (config-lookup *configdat* "setup" "linktree"))) (if rd rd (conc *toppath* "/runs")))) | | | | > | 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 | (toptest-path (conc disk-path "/" testtop-base)) (test-path (conc disk-path "/" test-base)) ;; ensure this exists first as links to subtests must be created there (linktree (let ((rd (config-lookup *configdat* "setup" "linktree"))) (if rd rd (conc *toppath* "/runs")))) (lnkbase (conc linktree "/" target "/" runname)) (lnkpath (conc lnkbase "/" testname)) (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path)) (lnktarget (conc lnkpath "/" item-path))) ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path) (debug:print 2 "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) (if (not (file-exists? linktree)) |
︙ | ︙ | |||
629 630 631 632 633 634 635 | (debug:print-info 2 "Creating " toptest-path " and link " lnkpath) (create-directory toptest-path #t) (hash-table-set! *toptest-paths* testname toptest-path))))) ;; The toptest path has been created, the link to the test in the linktree has ;; been created. Now, if this is an iterated test the real test dir must be created (if (not not-iterated) ;; this is an iterated test | | | 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 | (debug:print-info 2 "Creating " toptest-path " and link " lnkpath) (create-directory toptest-path #t) (hash-table-set! *toptest-paths* testname toptest-path))))) ;; The toptest path has been created, the link to the test in the linktree has ;; been created. Now, if this is an iterated test the real test dir must be created (if (not not-iterated) ;; this is an iterated test (begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) (debug:print 2 "Setting up sub test run area") (debug:print 2 " - creating run area in " test-path) (handle-exceptions exn (begin (debug:print 0 "ERROR: Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) |
︙ | ︙ | |||
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 | exn (begin (debug:print 0 "ERROR: Failed to re-create link " linktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit)) (if (symbolic-link? lnktarget) (delete-file lnktarget)) (if (not (file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) (if (directory? test-path) (begin (let* ((ovrcmd (let ((cmd (config-lookup *configdat* "setup" "testcopycmd"))) (if cmd ;; substitute the TEST_SRC_PATH and TEST_TARG_PATH (string-substitute "TEST_TARG_PATH" test-path (string-substitute "TEST_SRC_PATH" test-src-path cmd #t) #t) #f))) (cmd (if ovrcmd ovrcmd (conc "rsync -av" (if (debug:debug-mode 1) "" "q") " " test-src-path "/ " test-path "/" " >> " test-path "/mt_launch.log 2>> " test-path "/mt_launch.log"))) (status (system cmd))) (if (not (eq? status 0)) (debug:print 2 "ERROR: problem with running \"" cmd "\""))) (list lnkpathf lnkpath )) | > > > > > > > > | | 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 | exn (begin (debug:print 0 "ERROR: Failed to re-create link " linktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit)) (if (symbolic-link? lnktarget) (delete-file lnktarget)) (if (not (file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) (if (not (directory? test-path)) (create-directory test-path #t)) ;; this is a hack, I don't know why out of the blue this path does not exist sometimes (if (directory? test-path) (begin (let* ((ovrcmd (let ((cmd (config-lookup *configdat* "setup" "testcopycmd"))) (if cmd ;; substitute the TEST_SRC_PATH and TEST_TARG_PATH (string-substitute "TEST_TARG_PATH" test-path (string-substitute "TEST_SRC_PATH" test-src-path cmd #t) #t) #f))) (cmd (if ovrcmd ovrcmd (conc "rsync -av" (if (debug:debug-mode 1) "" "q") " " test-src-path "/ " test-path "/" " >> " test-path "/mt_launch.log 2>> " test-path "/mt_launch.log"))) (status (system cmd))) (if (not (eq? status 0)) (debug:print 2 "ERROR: problem with running \"" cmd "\""))) (list lnkpathf lnkpath )) (if (> remtries 0) (begin (debug:print 0 "ERROR: Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) (list #f #f))))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch |
︙ | ︙ | |||
739 740 741 742 743 744 745 746 747 748 749 750 751 752 | (if launcher (set! launcher (string-split launcher))) ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory #f testinfo #t))) ;; remove data only, do not perturb the record (set! diskpath (get-best-disk *configdat*)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 "Using work area " work-area)) (begin | > > > > > | 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 | (if launcher (set! launcher (string-split launcher))) ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory #f testinfo #t))) ;; remove data only, do not perturb the record ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:roll-up-pass-fail-counts run-id test-name item-path "LAUNCHED") (set! diskpath (get-best-disk *configdat*)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 "Using work area " work-area)) (begin |
︙ | ︙ | |||
772 773 774 775 776 777 778 779 780 781 | (list 'ezsteps ezsteps) (list 'target mt_target) (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) (change-directory work-area) ;; so that log files from the launch process don't clutter the test dir | > < < | 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 | (list 'ezsteps ezsteps) (list 'target mt_target) (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) (change-directory work-area) ;; so that log files from the launch process don't clutter the test dir (cond ((and launcher hosts) ;; must be using ssh hostname (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms) debug-param))) ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) (launcher (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms) debug-param))) ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms)))) |
︙ | ︙ |
Modified runs.scm from [8478824409] to [93a7d492d9].
︙ | ︙ | |||
1068 1069 1070 1071 1072 1073 1074 | (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (begin (debug:print-info 0 "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) | | | 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 | (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (begin (debug:print-info 0 "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) (rmt:find-and-mark-incomplete run-id #f))) (if (not (eq? num-running prev-num-running)) (debug:print-info 0 "run-wait specified, waiting on " num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds))))) (thread-sleep! 15) ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! |
︙ | ︙ | |||
1503 1504 1505 1506 1507 1508 1509 | #t) (define (runs:remove-test-directory db test remove-data-only) (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree (real-dir (if (file-exists? run-dir) (resolve-pathname run-dir) #f))) | | > | 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 | #t) (define (runs:remove-test-directory db test remove-data-only) (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree (real-dir (if (file-exists? run-dir) (resolve-pathname run-dir) #f))) (if remove-data-only (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) (if (and real-dir (> (string-length real-dir) 5) (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. (begin ;; let* ((realpath (resolve-pathname run-dir))) (debug:print-info 1 "Recursively removing " real-dir) |
︙ | ︙ | |||
1537 1538 1539 1540 1541 1542 1543 | (delete-directory run-dir))) (if (and run-dir (not (member run-dir (list "n/a" "/tmp/badname")))) (debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") (debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) )) ;; Only delete the records *after* removing the directory. If things fail we have a record | | > | 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 | (delete-directory run-dir))) (if (and run-dir (not (member run-dir (list "n/a" "/tmp/badname")))) (debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") (debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) )) ;; Only delete the records *after* removing the directory. If things fail we have a record (if remove-data-only (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "NOT_STARTED" "n/a" #f) (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test))))) ;;====================================================================== ;; Routines for manipulating runs ;;====================================================================== ;; Since many calls to a run require pretty much the same setup |
︙ | ︙ |