Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -817,14 +817,14 @@ (define *common:ended-states* ;; states which indicate the test is stopped and will not proceed '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" )) (define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked - '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD")) + '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD" "CHECK")) (define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed - '("PASS" "WARN" "CHECK" "WAIVED" "SKIP")) + '("PASS" "WARN" "WAIVED" "SKIP")) ;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items (define *common:running-states* ;; test is either running or can be run '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED")) @@ -2208,37 +2208,31 @@ ;; (define (common:wait-for-cpuload maxnormload numcpus-in #!key (count 1000) (msg #f)(remote-host #f)(num-tries 5)) (let* ((loadavg (common:get-cpu-load remote-host)) - ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (numcpus (if (<= 1 numcpus-in) - (common:get-num-cpus remote-host) - numcpus-in)) - (first (car loadavg)) - (next (cadr loadavg)) - (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug - ;; where numcpus - ;; (or could be - ;; maxload) is - ;; zero, crude - ;; fallback is to - ;; at least use 1 - ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit - ;; etc. - (effective-load (common:get-intercept first next)) - (recommended-delay (common:get-delay effective-load numcpus)) - (effective-host (or remote-host "localhost")) - (normalized-effective-load (/ effective-load numcpus)) - (will-wait (> normalized-effective-load maxnormload))) - (if (> recommended-delay 1) - (let* ((actual-delay (min recommended-delay 30))) - (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) - (debug:print-info 0 *default-log-port* "Load control, delaying " + ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (numcpus (if (<= 1 numcpus-in) + (common:get-num-cpus remote-host) numcpus-in)) + (first (car loadavg)) + (next (cadr loadavg)) + (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude + ;; fallback is to at least use 1 + ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit + ;; etc. + (effective-load (common:get-intercept first next)) + (recommended-delay (common:get-delay effective-load numcpus)) + (effective-host (or remote-host "localhost")) + (normalized-effective-load (/ effective-load numcpus)) + (will-wait (> normalized-effective-load maxnormload))) + (if (and will-wait (> recommended-delay 1)) + (let* ((actual-delay (min recommended-delay 30))) + (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) + (debug:print-info 0 *default-log-port* "Load control, delaying " actual-delay " seconds to maintain safe load. current normalized effective load is " normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load)) - (thread-sleep! actual-delay))) + (thread-sleep! actual-delay))) (cond ;; bad data, try again to get the data ((not will-wait) (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -52,10 +52,14 @@ (let ((newalist (filter (lambda (x)(not (equal? key (car x)))) alist))) (append newalist (list (if metadata (list key val metadata) (list key val)))))) +;; this is used in megatestqa/ext.scm. +;; remove it from here and there by 12/31/21 +;; (define config:assoc-safe-add configf:assoc-safe-add) + (define (configf:section-var-set! cfgdat section-name var value #!key (metadata #f)) (hash-table-set! cfgdat section-name (configf:assoc-safe-add (hash-table-ref/default cfgdat section-name '()) var value metadata: metadata))) @@ -144,11 +148,11 @@ (set! result (conc "#{(" cmdtype ") " cmd "}")))) (case cmdsym ((system shell scheme) (let ((delta (- (current-seconds) start-time))) (if (> delta 2) - (debug:print-info 0 *default-log-port* "for line \"" l "\"\n command: " cmd " took " delta " seconds to run with output:\n " result) + (debug:print-info 2 *default-log-port* "for line \"" l "\"\n command: " cmd " took " delta " seconds to run with output:\n " result) (debug:print-info 9 *default-log-port* "for line \"" l "\"\n command: " cmd " took " delta " seconds to run with output:\n " result))))) (loop (conc prestr result poststr))) res)) res))) @@ -531,12 +535,12 @@ (define configf:read-file read-config) ;; safely look up a value that is expected to be a number, return ;; a default (#f unless provided) ;; -(define (configf:lookup-number cfdat section varname #!key (default #f)) - (let* ((val (configf:lookup *configdat* section varname)) +(define (configf:lookup-number cfgdat section varname #!key (default #f)) + (let* ((val (configf:lookup cfgdat section varname)) (res (if val (string->number (string-substitute "\\s+" "" val #t)) #f))) (cond (res res) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -4659,11 +4659,11 @@ ;; the new prereqs calculation, looks also at itempath if specified ;; all prereqs must be met -;; if prereq test with itempath='' is COMPLETED and PASS, WARN, CHECK, or WAIVED then prereq is met +;; if prereq test with itempath='' is in common:well-ended-states, then prereq is met ;; if prereq test with itempath=ref-item-path and COMPLETED with PASS, WARN, CHECK, or WAIVED then prereq is met ;; ;; Note: mode 'normal means that tests must be COMPLETED and ok (i.e. PASS, WARN, CHECK, SKIP or WAIVED) ;; mode 'toplevel means that tests must be COMPLETED only ;; mode 'itemmatch or 'itemwait means that tests items must be COMPLETED and (PASS|WARN|WAIVED|CHECK) [[ NB// NOT IMPLEMENTED YET ]] @@ -4674,10 +4674,11 @@ ;; 2. any NOT completed and good? if yes => return those as prereqs not met, if no => return null list ;; ;; (define (db:get-prereqs-not-met dbstruct run-id waitons ref-item-path mode) (define (db:get-prereqs-not-met dbstruct run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmap #f)) ;; BBnote - rollup of an itemized test's overall state/status done in db:set-state-status-and-roll-up-items + (debug:print 4 *default-log-port* "db:get-prereqs-not-met: " waitons) (append (if (member 'exclusive mode) (let ((running-tests (db:get-tests-for-run dbstruct #f ;; run-id of #f means for all runs. (if (string=? ref-item-path "") ;; testpatt @@ -4700,10 +4701,12 @@ ;; (conc (db:test-get-testname testdat) ;; "/" ;; (db:test-get-item-path testdat)))) running-tests) ;; calling functions want the entire data '()) + + ;; collection of: for each waiton - ;; if this ref-test-name is an item in an itemized test and mode is itemwait/itemmatch: ;; if waiton is not itemized - if waiton is not both completed and in ok status, add as unmet prerequisite ;; if waiton is itemized: Index: docs/manual/Makefile ================================================================== --- docs/manual/Makefile +++ docs/manual/Makefile @@ -30,15 +30,15 @@ # asciidoc -b html5 -a icons -a iconsdir=$(DISPATH)/images/icons -a toc2 design_spec.txt # all : server.ps megatest_manual.html client.ps complex-itemmap.png megatest_manual.pdf -megatest_manual.html : megatest_manual.txt *.txt installation.txt *png +megatest_manual.html : megatest_manual.txt *.txt installation.txt *png *.dot asciidoc -b html5 -a icons -a iconsdir=$(DISPATH)/images/icons -a toc2 megatest_manual.txt # dos2unix megatest_manual.html -megatest_manual.pdf : megatest_manual.txt *.txt *png +megatest_manual.pdf : megatest_manual.txt *.txt *png *.dot a2x -a toc -f pdf megatest_manual.txt server.ps : server.dot dot -Tps server.dot > server.ps ADDED docs/manual/bisecting.dot Index: docs/manual/bisecting.dot ================================================================== --- /dev/null +++ docs/manual/bisecting.dot @@ -0,0 +1,31 @@ +// Copyright 2021, Matthew Welland. +// +// This file is part of Megatest. +// +// Megatest is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Megatest is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Megatest. If not, see . +// +digraph G { + rankdir=LR + subgraph cluster_1 { + node [style=filled,shape=box]; + + B [label="B\nProblem is here"]; + E [label="E\nProblem manifests here"]; + A -> B; + B -> C; + C -> D; + D -> E; + } + +} ADDED docs/manual/bisecting.png Index: docs/manual/bisecting.png ================================================================== --- /dev/null +++ docs/manual/bisecting.png cannot compute difference between binary files ADDED docs/manual/debugging.txt Index: docs/manual/debugging.txt ================================================================== --- /dev/null +++ docs/manual/debugging.txt @@ -0,0 +1,314 @@ +// Copyright 2021, Matthew Welland. +// +// This file is part of Megatest. +// +// Megatest is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Megatest is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Megatest. If not, see . + +Debugging +--------- + +Well Written Tests +~~~~~~~~~~~~~~~~~~ + +Test Design and Surfacing Errors +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Design your tests to surface errors. Ensure that all logs are +processed by logpro (or a custom log processing tool) and can be +reached by a mouse click or two from the test control panel. + +To illustrate, here is a set of scripts with nested calls where script1.sh calls script2.sh which calls script3.sh which finally calls the Cadence EDA tool virtuoso: + +.script1.sh +.............................. +#!/bin/bash +code ... +script2.sh some parameters > script2.log +more code ... +.............................. + +.script2.sh +.............................. +#!/bin/bash +code ... +script3.sh some more parameters > script3.log +more code ... +.............................. + +.script3.sh +.............................. +#!/bin/bash +code ... +virtuoso params and switches ... +more code ... +.............................. + +The log files script2.log, script3.log and the log output from +virtuoso are not accessible from the test control panel. It would be +much better for future users of your automation to use steps more +fully. One easy option would be to post process the logs in downstream +additional steps: + +.testconfig +.............................. +[ezsteps] +step1 script1.sh +step2 cat script2.log +step3 cat script3.log + +[logpro] +step1 ;; some logpro rules + (expect:required in "LogFileBody" > 0 "Expect this output" #/something expected/) +step2 ;; some logpro rules for script2.sh +step3 ;; some logpro rules for script3.sh + +[scripts] +script1.sh #!/bin/bash + code ... + +... +.............................. + +With the above testconfig the logs for every critical part of the +automation are fully surfaced and rules can be created to flag errors, +warnings, aborts and to ignore false errors. A user of your automation +will be able to see the important error with two mouse clicks from the +runs view. + +An even better would be to eliminate the nesting if possible. As a +general statement with layers - less is usually more. By flattening +the automation into a sequence of steps you can use the test control +panel to re-run a step with a single click or from the test xterm run +only the errant step from the command line. + +The message here is make debugging and maintenace easy for future +users (and yourself) by keeping clicks-to-error in mind. + +Examining The Test Logs and Environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Test Control Panel - xterm +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +From the dashboard click on a test PASS/FAIL button. This brings up a +test control panel. Aproximately near the center left of the window +there is a button "Start Xterm". Push this to get an xterm with the +full context and environment loaded for that test. You can run scripts +or ezsteps by copying from the testconfig (hint, load up the +testconfig in a separate text editor window). + +With more recent versions of Megatest you can step through your test +from the test control panel. Click on the cell labeled "rerun this +step" to only rerun the step or click on "restart from here" to rerun +that step and downstream steps. + +NOTE 1: visual feedback can take some time, give it a few seconds and +you will see the step change color to blue as it starts running. + +NOTE 2: steping through only works if you are using ezsteps. + +A word on Bisecting +~~~~~~~~~~~~~~~~~~~ + +Bisecting is a debug strategy intended to speed up finding the root +cause of some bug. + +.A complex process with a problem found in stage "E" +["graphviz", "bisecting.png"] +---------------------------------------------------------------------- +include::bisecting.dot[] +---------------------------------------------------------------------- + +It is common to start debugging where the problem was observed and +then work back. However by inspecting the output at stage "C" in the +example above you would potentially save a lot of debug effort, this +is similar to the feature in source control tools like git and fossil +called biseceting. + +Tough Bugs +~~~~~~~~~~ + +Most bugs in Megatest based automation will be in the scripts called +in your test steps and if you utilize the good design practice +described above should be fairly easy for you to reproduce, isolate +and find. + +Some bugs however will come from subtle and hard to detect +interactions between Megatest and your OS and Unix environment. This +includes things like constructed variables that are legal in one +context (e.g. tcsh) but illegal in another context (e.g. bash), +variables that come from your login scripts and access and permissions +issues (e.g. a script that silently fails due to no access to needed +data). Other bugs might be due to Megatest itself. + +To isolate bugs like this you may need to look at the log files at +various stages in the execution process of your run and tests. + +.A simplified diagram of the stages Megatest goes through to run a test. +["graphviz", "megatest-test-stages.png"] +---------------------------------------------------------------------- +include::megatest-test-stages.dot[] +---------------------------------------------------------------------- + +.How to check variable values and inspect logs at each stage +[width="80%",cols="<,2m,2m",frame="topbot",options="header"] +|====================== +|Stage | How to inspect | Watch for or try ... +|A: post config processing | megatest -show-config -target your/target | #f (failed var processing) +|B: post runconfig | megatest -show-runconfig -target your/target | Add -debug 0,9 to see which file your settings come from +|C: processing testconfigs | inspect output from "megatest -run ..." | Messages indicating issues process configs, dependency problems +|D: process testconfig for test launch | inspect output from megatest runner | Zero items (items expansion yielded no items) +|E,F: launching test | start test xterm, look at mt_launch.log | Did your batch system accept the job? Has the job landed on a machine? +|G: starting test | look at your batch systems logs for the process | Did the megatest -execute process start and run? Extract the "megatest -execute ..." command and run it from your xterm. +|H,H1,H2: step exectution | look at .log, .html and your own internal logs | Do you have sufficiently tight logpro rules? You must always have a "required" rule! +|====================== + +Bisecting megatest.csh/sh +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sometimes finding the environment variable that is causing the problem +can be very difficult. Bisection can be applied. + +Edit the megatest.csh or megatest.sh file and comment out 50% per +round, source in fresh xterm and run the test. + +This idea can also be applied to your .cshrc, .bashrc, .aliases and +other similar files. + +csh and -f +^^^^^^^^^^ + +A common issue when tcsh or csh shells are used for scripting is to +forget or choose to not use -f in your #! line. + +.Not good +.............................. +#!/bin/tcsh +... +.............................. + +.Good +.............................. +#!/bin/tcsh -f +... +.............................. + + +Config File Processing +^^^^^^^^^^^^^^^^^^^^^^ + +As described above it is often helpful to know the content of +variables in various contexts as Megatest works through the actions +needed to run your tests. A handy technique is to force the startup of +an xterm in the context being examined. + +For example, if an item list is not being generated as expected you +can inject the startup of an xterm as if it were an item: + +.Original items table +----------------- +[items] +CELLNAME [system getcellname.sh] +----------------- + +.Items table modified for debug +----------------- +[items] +DEBUG [system xterm] +CELLNAME [system getcellnames.sh] +----------------- + +When this test is run an xterm will pop up. In that xterm the +environment is exactly that in which the script "getcellnames.sh" +would run. You can now debug the script to find out why it isn't +working as expected. + +Similarly in a script just call the xterm. +NOTE: This technique can be very helpful in debugging running of EDA tools in Perl, Ruby, Python or tcl scripts: + +.Perl example +.............................. +some_code(); +$cmdline="virtuoso -some-switches and params ..."; +print "$cmdline"; # print the command line so you can paste it into the xterm that pops up +system("xterm"); # this line is added for the debug and removed when done +system($cmdline); +more_code(); +.............................. + +Misc Other Debugging Hints +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Annotating scripts and config files +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Run the "env" command to record the environment: + +env | sort > stagename.log + +In a config file: + +.megatest.config, runconfigs.config and testconfig +................... +#{shell env | sort > stagename.log} + +# or + +[system env | sort > stagename.log] +................... + +In scripts just insert the commands, this example helps you identify +if "some commands ..." changed any environment variables.: + +.myscript.sh +.............................. +env | sort > somefile-before.log +some commands ... +env | sort > somefile-after.log +.............................. + +.Use meld to examine the differences +.............................. +meld somefile-before.log somefile-after.log +.............................. + +Oneshot Modifying a Variable +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To try various values for a variable without mutating the current value + +.within a bash shell +.............................. +SOMEVAR=123 runcmd.sh +.............................. + +.within csh +.............................. +(setenv SOMEVAR 123;runcmd.sh) + +# OR + +env SOMEVAR=123 runcmd.sh +.............................. + +Capturing output from a command +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.Use the "script" utility +.............................. +script -c "virtuoso -params and switches ..." +.............................. + + + Index: docs/manual/howto.txt ================================================================== --- docs/manual/howto.txt +++ docs/manual/howto.txt @@ -110,15 +110,15 @@ # if defined and not "no" flexi-launcher will bypass launcher unless there is no # match. flexi-launcher yes ------------------------ -Tricks ------- +Tricks and Tips +--------------- -This section is a compendium of a various useful tricks for debugging, -configuring and generally getting the most out of Megatest. +This section is a collection of a various useful tricks for that +didn't quite fit elsewhere. Limiting your running jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~ The following example will limit a test in the jobgroup "group1" to no more than 10 tests simultaneously. @@ -136,50 +136,10 @@ [jobgroups] group1 10 custdes 4 --------------- -Debugging Tricks ----------------- - -Examining The Environment -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Test Control Panel - xterm -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -From the dashboard click on a test PASS/FAIL button. This brings up a test control panel. Aproximately near the center left of the -window there is a button "Start Xterm". Push this to get an xterm with the full context and environment loaded for that test. You can run -scripts or ezsteps by copying from the testconfig (hint, load up the testconfig in a separate gvim or emacs window). This is the easiest way -to debug your tests. - -During Config File Processing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -It is often helpful to know the content of variables in various -contexts as Megatest does the actions needed to run your tests. A handy technique is to force the startup of an xterm in the context being examined. - -For example, if an item list is not being generated as expected you -can inject the startup of an xterm as if it were an item: - -.Original items table ------------------ -[items] -CELLNAME [system getcellname.sh] ------------------ - -.Items table modified for debug ------------------ -[items] -DEBUG [system xterm] -CELLNAME [system getcellnames.sh] ------------------ - -When this test is run an xterm will pop up. In that xterm the -environment is exactly that in which the script "getcellnames.sh" -would run. You can now debug the script to find out why it isn't -working as expected. Organising Your Tests and Tasks ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The default location "tests" for storing tests can be extended by ADDED docs/manual/megatest-test-stages.dot Index: docs/manual/megatest-test-stages.dot ================================================================== --- /dev/null +++ docs/manual/megatest-test-stages.dot @@ -0,0 +1,42 @@ +// Copyright 2021, Matthew Welland. +// +// This file is part of Megatest. +// +// Megatest is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Megatest is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Megatest. If not, see . +// +digraph G { + // rankdir=LR + subgraph cluster_1 { + node [style=filled,shape=box]; + A [label="A: Process megatest.config"] + B [label="B: Process runconfig.config"] + A -> B [label="resolve vars"] + B2 [label="B2: Resolve variables"] + B -> B2 -> A + C [label="C: Process testconfigs (find tests to run)"] + D [label="D: Process testconfig for test of interest"] + E [label="E: Set vars for launching test"] + F [label="F: Launch into batch system, ssh, batch system\nand different hosts can all \nimpact variable values"] + G [label="G: Process testconfig again with all available variables"] + H [label="H: Start test"] + H1 [label="H1: Start step"] + H2 [label="H2: Execute step script"] + I [label="I: End step, process logfile with logpro"] + I -> H [label="Run remaining steps"] + H-> H1 -> H2 -> I + B->C->D->E->F->G->H + I->D + } + +} ADDED docs/manual/megatest-test-stages.png Index: docs/manual/megatest-test-stages.png ================================================================== --- /dev/null +++ docs/manual/megatest-test-stages.png cannot compute difference between binary files Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -773,13 +773,13 @@

The Megatest Users Manual

Matt Welland
<matt@kiatoa.com>
version 1.5, June 2020 -
-
Table of Contents
- +
+
Table of Contents
+

Preface

@@ -808,22 +808,17 @@

Why Megatest?

-

The Megatest project was started for two reasons, the first was an -immediate and pressing need for a generalized tool to manage a suite -of regression tests and the second was the fact that I had written or -maintained several such tools at different companies over the years. I -thought a single open source tool, flexible enough to meet the needs -of any team doing continuous integration and or running a complex -suite of tests for release qualification would solve some problems for -me and for others.

-
-
-
-- Matt Welland, original author of the Megatest tool suite.
-
+

Megatest was created to provide a generalized tool for managing suites +of regression tests and to provide a multi-host, distributed +alternative to "make". The EDA world is littered with proprietory, +company-specific tools for this purpose and by going open source and +keeping the tool flexible the hope is that Megatest could be useful to +any team at any company for continuous integration and almost any +other general automation tasks.

Megatest Design Philosophy

@@ -959,15 +954,10 @@ Static
- -
-

Road Map

-
-

TODO / Road Map

Note: This road-map is a wish list and not a formal plan. Items are in @@ -1941,10 +1931,310 @@

This test runs a single step called "stepname1" which runs a script "stepname.sh". Note that although it is common to put the actions needed for a test step into a script it is not necessary.

+
+
+

Debugging

+
+
+

Well Written Tests

+
+

Test Design and Surfacing Errors

+

Design your tests to surface errors. Ensure that all logs are +processed by logpro (or a custom log processing tool) and can be +reached by a mouse click or two from the test control panel.

+

To illustrate, here is a set of scripts with nested calls where script1.sh calls script2.sh which calls script3.sh which finally calls the Cadence EDA tool virtuoso:

+
+
script1.sh
+
+
#!/bin/bash
+code ...
+script2.sh some parameters > script2.log
+more code ...
+
+
+
script2.sh
+
+
#!/bin/bash
+code ...
+script3.sh some more parameters > script3.log
+more code ...
+
+
+
script3.sh
+
+
#!/bin/bash
+code ...
+virtuoso params and switches ...
+more code ...
+
+

The log files script2.log, script3.log and the log output from +virtuoso are not accessible from the test control panel. It would be +much better for future users of your automation to use steps more +fully. One easy option would be to post process the logs in downstream +additional steps:

+
+
testconfig
+
+
[ezsteps]
+step1 script1.sh
+step2 cat script2.log
+step3 cat script3.log
+
+[logpro]
+step1 ;; some logpro rules
+  (expect:required in "LogFileBody" > 0 "Expect this output" #/something expected/)
+step2 ;; some logpro rules for script2.sh
+step3 ;; some logpro rules for script3.sh
+
+[scripts]
+script1.sh #!/bin/bash
+ code ...
+
+...
+
+

With the above testconfig the logs for every critical part of the +automation are fully surfaced and rules can be created to flag errors, +warnings, aborts and to ignore false errors. A user of your automation +will be able to see the important error with two mouse clicks from the +runs view.

+

An even better would be to eliminate the nesting if possible. As a +general statement with layers - less is usually more. By flattening +the automation into a sequence of steps you can use the test control +panel to re-run a step with a single click or from the test xterm run +only the errant step from the command line.

+

The message here is make debugging and maintenace easy for future +users (and yourself) by keeping clicks-to-error in mind.

+
+
+
+

Examining The Test Logs and Environment

+
+

Test Control Panel - xterm

+

From the dashboard click on a test PASS/FAIL button. This brings up a +test control panel. Aproximately near the center left of the window +there is a button "Start Xterm". Push this to get an xterm with the +full context and environment loaded for that test. You can run scripts +or ezsteps by copying from the testconfig (hint, load up the +testconfig in a separate text editor window).

+

With more recent versions of Megatest you can step through your test +from the test control panel. Click on the cell labeled "rerun this +step" to only rerun the step or click on "restart from here" to rerun +that step and downstream steps.

+

NOTE 1: visual feedback can take some time, give it a few seconds and +you will see the step change color to blue as it starts running.

+

NOTE 2: steping through only works if you are using ezsteps.

+
+
+
+

A word on Bisecting

+

Bisecting is a debug strategy intended to speed up finding the root +cause of some bug.

+
+
+bisecting.png +
+
Figure 1. A complex process with a problem found in stage "E"
+
+

It is common to start debugging where the problem was observed and +then work back. However by inspecting the output at stage "C" in the +example above you would potentially save a lot of debug effort, this +is similar to the feature in source control tools like git and fossil +called biseceting.

+
+
+

Tough Bugs

+

Most bugs in Megatest based automation will be in the scripts called +in your test steps and if you utilize the good design practice +described above should be fairly easy for you to reproduce, isolate +and find.

+

Some bugs however will come from subtle and hard to detect +interactions between Megatest and your OS and Unix environment. This +includes things like constructed variables that are legal in one +context (e.g. tcsh) but illegal in another context (e.g. bash), +variables that come from your login scripts and access and permissions +issues (e.g. a script that silently fails due to no access to needed +data). Other bugs might be due to Megatest itself.

+

To isolate bugs like this you may need to look at the log files at +various stages in the execution process of your run and tests.

+
+
+megatest-test-stages.png +
+
Figure 2. A simplified diagram of the stages Megatest goes through to run a test.
+
+ + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Table 2. How to check variable values and inspect logs at each stage
Stage How to inspect Watch for or try …

A: post config processing

megatest -show-config -target your/target

#f (failed var processing)

B: post runconfig

megatest -show-runconfig -target your/target

Add -debug 0,9 to see which file your settings come from

C: processing testconfigs

inspect output from "megatest -run …"

Messages indicating issues process configs, dependency problems

D: process testconfig for test launch

inspect output from megatest runner

Zero items (items expansion yielded no items)

E,F: launching test

start test xterm, look at mt_launch.log

Did your batch system accept the job? Has the job landed on a machine?

G: starting test

look at your batch systems logs for the process

Did the megatest -execute process start and run? Extract the "megatest -execute …" command and run it from your xterm.

H,H1,H2: step exectution

look at <stepname>.log, <stepname>.html and your own internal logs

Do you have sufficiently tight logpro rules? You must always have a "required" rule!

+
+

Bisecting megatest.csh/sh

+

Sometimes finding the environment variable that is causing the problem +can be very difficult. Bisection can be applied.

+

Edit the megatest.csh or megatest.sh file and comment out 50% per +round, source in fresh xterm and run the test.

+

This idea can also be applied to your .cshrc, .bashrc, .aliases and +other similar files.

+
+
+

csh and -f

+

A common issue when tcsh or csh shells are used for scripting is to +forget or choose to not use -f in your #! line.

+
+
Not good
+
+
#!/bin/tcsh
+...
+
+
+
Good
+
+
#!/bin/tcsh -f
+...
+
+
+
+

Config File Processing

+

As described above it is often helpful to know the content of +variables in various contexts as Megatest works through the actions +needed to run your tests. A handy technique is to force the startup of +an xterm in the context being examined.

+

For example, if an item list is not being generated as expected you +can inject the startup of an xterm as if it were an item:

+
+
Original items table
+
+
[items]
+CELLNAME [system getcellname.sh]
+
+
+
Items table modified for debug
+
+
[items]
+DEBUG [system xterm]
+CELLNAME [system getcellnames.sh]
+
+

When this test is run an xterm will pop up. In that xterm the +environment is exactly that in which the script "getcellnames.sh" +would run. You can now debug the script to find out why it isn’t +working as expected.

+

Similarly in a script just call the xterm. +NOTE: This technique can be very helpful in debugging running of EDA tools in Perl, Ruby, Python or tcl scripts:

+
+
Perl example
+
+
some_code();
+$cmdline="virtuoso -some-switches and params ...";
+print "$cmdline"; # print the command line so you can paste it into the xterm that pops up
+system("xterm");  # this line is added for the debug and removed when done
+system($cmdline);
+more_code();
+
+
+
+
+

Misc Other Debugging Hints

+
+

Annotating scripts and config files

+

Run the "env" command to record the environment:

+

env | sort > stagename.log

+

In a config file:

+
+
megatest.config, runconfigs.config and testconfig
+
+
#{shell env | sort > stagename.log}
+
+# or
+
+[system env | sort > stagename.log]
+
+

In scripts just insert the commands, this example helps you identify +if "some commands …" changed any environment variables.:

+
+
myscript.sh
+
+
env | sort > somefile-before.log
+some commands ...
+env | sort > somefile-after.log
+
+
+
Use meld to examine the differences
+
+
meld somefile-before.log somefile-after.log
+
+
+
+

Oneshot Modifying a Variable

+

To try various values for a variable without mutating the current value

+
+
within a bash shell
+
+
SOMEVAR=123 runcmd.sh
+
+
+
within csh
+
+
(setenv SOMEVAR 123;runcmd.sh)
+
+# OR
+
+env SOMEVAR=123 runcmd.sh
+
+
+
+

How To Do Things

@@ -2036,14 +2326,14 @@
-

Tricks

+

Tricks and Tips

-

This section is a compendium of a various useful tricks for debugging, -configuring and generally getting the most out of Megatest.

+

This section is a collection of a various useful tricks for that +didn’t quite fit elsewhere.

Limiting your running jobs

The following example will limit a test in the jobgroup "group1" to no more than 10 tests simultaneously.

In your testconfig:

@@ -2056,49 +2346,10 @@
[jobgroups]
 group1 10
 custdes 4
-
-
-
-
-

Debugging Tricks

-
-
-

Examining The Environment

-
-

Test Control Panel - xterm

-

From the dashboard click on a test PASS/FAIL button. This brings up a test control panel. Aproximately near the center left of the -window there is a button "Start Xterm". Push this to get an xterm with the full context and environment loaded for that test. You can run -scripts or ezsteps by copying from the testconfig (hint, load up the testconfig in a separate gvim or emacs window). This is the easiest way -to debug your tests.

-
-
-

During Config File Processing

-

It is often helpful to know the content of variables in various -contexts as Megatest does the actions needed to run your tests. A handy technique is to force the startup of an xterm in the context being examined.

-

For example, if an item list is not being generated as expected you -can inject the startup of an xterm as if it were an item:

-
-
Original items table
-
-
[items]
-CELLNAME [system getcellname.sh]
-
-
-
Items table modified for debug
-
-
[items]
-DEBUG [system xterm]
-CELLNAME [system getcellnames.sh]
-
-

When this test is run an xterm will pop up. In that xterm the -environment is exactly that in which the script "getcellnames.sh" -would run. You can now debug the script to find out why it isn’t -working as expected.

-

Organising Your Tests and Tasks

The default location "tests" for storing tests can be extended by adding to your tests-paths section.

@@ -2148,11 +2399,11 @@

Megatest Use Modes

- + @@ -2210,11 +2461,11 @@

Various helpers for more advanced config files.

Table 2. Base commandsTable 3. Base commands
- + @@ -2495,11 +2746,11 @@

Database settings

Table 3. HelpersTable 4. Helpers
- + @@ -3008,10 +3259,14 @@ lookittmp ;; Note: config file format supports multi-line entries where leading whitespace is removed from each line ;; a blank line indicates the end of the block of text (expect:required in "LogFileBody" > 0 "A file name that should never exist!" #/This is a awfully stupid file name that should never be found in the temp dir/)

To transfer the environment to the next step you can do the following:

+# if your upstream file is csh you can force csh like this +# if your upstream is bash +loadenv source $REF/ourenviron.sh +
Propagate environment to next step
$MT_MEGATEST -env2file .ezsteps/${stepname}
@@ -3160,11 +3415,11 @@ fail gracefully if it doesn’t exist.

Table 4. Database config settings in [setup] section of megatest.configTable 5. Database config settings in [setup] section of megatest.config
- + @@ -3394,11 +3649,11 @@

These routines can be called from the megatest repl.

Table 5. Environment variables visible to the trigger scriptTable 6. Environment variables visible to the trigger script
Variable
- + ADDED docs/manual/megatest_manual.pdf Index: docs/manual/megatest_manual.pdf ================================================================== --- /dev/null +++ docs/manual/megatest_manual.pdf cannot compute difference between binary files Index: docs/manual/megatest_manual.txt ================================================================== --- docs/manual/megatest_manual.txt +++ docs/manual/megatest_manual.txt @@ -33,20 +33,17 @@ ---------------------------- Why Megatest? ------------- -The Megatest project was started for two reasons, the first was an -immediate and pressing need for a generalized tool to manage a suite -of regression tests and the second was the fact that I had written or -maintained several such tools at different companies over the years. I -thought a single open source tool, flexible enough to meet the needs -of any team doing continuous integration and or running a complex -suite of tests for release qualification would solve some problems for -me and for others. - - -- Matt Welland, original author of the Megatest tool suite. +Megatest was created to provide a generalized tool for managing suites +of regression tests and to provide a multi-host, distributed +alternative to "make". The EDA world is littered with proprietory, +company-specific tools for this purpose and by going open source and +keeping the tool flexible the hope is that Megatest could be useful to +any team at any company for continuous integration and almost any +other general automation tasks. Megatest Design Philosophy -------------------------- Megatest is a distributed system intended to provide the minimum needed @@ -98,24 +95,26 @@ which can launch jobs on local and remote Linux hosts. Currently megatest uses the network filesystem to call home to your master sqlite3 database. Megatest has been used with the Intel Netbatch and lsf (also known as openlava) batch systems and it should be straightforward to use it with other similar systems. + +// :leveloffset: 0 include::overview.txt[] include::plan.txt[] - + include::installation.txt[] include::getting_started.txt[] include::study_plan.txt[] -// :leveloffset: 0 - include::writing_tests.txt[] + +include::debugging.txt[] include::howto.txt[] include::reference.txt[] Index: docs/manual/plan.txt ================================================================== --- docs/manual/plan.txt +++ docs/manual/plan.txt @@ -1,8 +1,5 @@ -Road Map --------- - // This file is part of Megatest. // // Megatest is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or Index: docs/manual/reference.txt ================================================================== --- docs/manual/reference.txt +++ docs/manual/reference.txt @@ -205,10 +205,27 @@ [setup] # this will automatically kill the test if it runs for more than 1h 2m and 3s runtimelim 1h 2m 3s ----------------- +Post Run Hook ++++++++++++++ + +This runs script to-run.sh after all tests have been completed. It is +not necessary to use -run-wait as each test will check for other +running tests on completion and if there are none it will call the +post run hook. + +Note that the output from the script call will be placed in a log file +in the logs directory with a file name derived by replacing / with _ +in post-hook--.log. + +------------------- +[runs] +post-hook /path/to/script/to-run.sh +------------------- + Tests browser view ~~~~~~~~~~~~~~~~~~ The tests browser (see the Run Control tab on the dashboard) has two views for displaying the tests. @@ -720,10 +737,14 @@ (expect:required in "LogFileBody" > 0 "A file name that should never exist!" #/This is a awfully stupid file name that should never be found in the temp dir/) ----------------- To transfer the environment to the next step you can do the following: +# if your upstream file is csh you can force csh like this +# if your upstream is bash +loadenv source $REF/ourenviron.sh + .Propagate environment to next step ---------------------------- $MT_MEGATEST -env2file .ezsteps/${stepname} ---------------------------- ADDED docs/megatest-debug-tutorial.odp Index: docs/megatest-debug-tutorial.odp ================================================================== --- /dev/null +++ docs/megatest-debug-tutorial.odp cannot compute difference between binary files Index: ezsteps.scm ================================================================== --- ezsteps.scm +++ ezsteps.scm @@ -104,11 +104,11 @@ (pid #f)) (let ((proc (lambda () (set! pid (process-run "/bin/bash" (list "-c" cmd)))))) (if subrun (begin - (debug:print-info 0 *default-log-port* "Running without MT_.* environment variables.") + (debug:print-info 0 *default-log-port* "Running "cmd" without MT_.* environment variables.") (common:without-vars proc "^MT_.*")) (proc))) (with-output-to-file "Makefile.ezsteps" (lambda () Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -159,11 +159,11 @@ ;; 5. fix testpatt or calculate it from contour ;; 6. launch the run ;; 7. roll up the run result and or roll up the logpro processed result (when (configf:lookup testconfig "subrun" "runwait") ;; we use runwait as the flag that a subrun is requested (subrun:initialize-toprun-test testconfig test-run-dir) - (let* ((mt-cmd (subrun:launch-cmd test-run-dir))) + (let* ((mt-cmd (subrun:launch-cmd test-run-dir (configf:lookup testconfig "subrun" "runwait")))) (debug:print-info 0 *default-log-port* "Subrun command is \"" mt-cmd "\"") (set! ezsteps #t) ;; set the needed flag (set! ezstepslst (append (or ezstepslst '()) (list (list "subrun" (conc "{subrun=true} " mt-cmd))))))) @@ -173,11 +173,13 @@ (let* ((all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; where 'params is the params list (add other stuff as needed) (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) ;; if ezsteps was defined then we are sure to have at least one step but check anyway (if (not (> (length ezstepslst) 0)) (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length") - (let loop ((ezstep (car ezstepslst)) + (let ((all-step-names (map car ezstepslst))) + (setenv "MT_STEP_NAMES" (string-intersperse all-step-names " ")) + (let loop ((ezstep (car ezstepslst)) (tal (cdr ezstepslst)) (prevstep #f)) (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"") ;; check exit-info (vector-ref exit-info 1) (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) @@ -191,11 +193,11 @@ (launch:load-logpro-dat run-id test-id stepname)) (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms) (if (not (null? tal)) (loop (car tal) (cdr tal) stepname)) (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) - (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))))))))) + (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))))))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30"))) (start-seconds (current-seconds)) (calc-minutes (lambda () @@ -476,21 +478,23 @@ (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) (if (process:alive-on-host? test-host test-pid) (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") - (exit))) + (exit 1))) ((member (db:test-get-state test-info) '("COMPLETED")) ;; we do NOT want to re-run COMPLETED jobs. Mark as NOT_STARTED to run! - (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") - (exit)) + (debug:print 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") + (debug:print 0 *default-log-port* "exiting with status 1") + (exit 1)) ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) - (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") - (exit)))) + (debug:print 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") + (debug:print 0 *default-log-port* "exiting with status 1") + (exit 1)))) ;; cleanup prior execution's steps (rmt:delete-steps-for-test! run-id test-id) (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name)) @@ -643,11 +647,12 @@ (print content) (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))))) (else (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\"")))) scripts)) - ;; + ;; + (let* ((m (make-mutex)) (kill-job? #f) (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status (job-thread #f) ;; (keep-going #t) @@ -657,29 +662,33 @@ (runit (lambda () (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m))) (monitorjob (lambda () (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags))) (th1 (make-thread monitorjob "monitor job")) - (th2 (make-thread runit "run job"))) + (th2 (make-thread runit "run job")) + (tconfig (tests:get-testconfig test-name item-path tconfigreg #t)) + (propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code")) + (propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED")) + (test-status "not set") + ) (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) - (debug:print-info 0 *default-log-port* "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") + (debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...") + (debug:print-info 2 *default-log-port* "exit-info = " exit-info) (hash-table-set! misc-flags 'keep-going #f) (thread-join! th1) (thread-sleep! 1) ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) ;; only state and status needed - use lazy routine (testinfo (rmt:get-testinfo-state-status run-id test-id))) ;; Am I completed? - (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) - (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status - ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test - ) - (new-status (cond + (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) + (let ((new-state (if kill-job? "KILLED" "COMPLETED")) + (new-status (cond ((not (launch:einf-exit-status exit-info)) "FAIL") ;; job failed to run ... (vector-ref exit-info 1) ((eq? (launch:einf-rollup-status exit-info) 0) ;; (vector-ref exit-info 3) ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO) (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS")) ((eq? (launch:einf-rollup-status exit-info) 1) "FAIL") ;; (vector-ref exit-info 3) @@ -688,35 +697,53 @@ (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN")) ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK") ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED") ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT") ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP") - (else "FAIL")))) ;; (db:test-get-status testinfo))) - (debug:print-info 1 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) + (else "FAIL"))) + ) ;; (db:test-get-status testinfo))) + (debug:print-info 0 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) - ;; Leave a .final-status file for each sub-test - (tests:save-final-status run-id test-id) + ;; Leave a .final-status file for each sub-test + (tests:save-final-status run-id test-id) (tests:test-set-status! run-id test-id new-state new-status (args:get-arg "-m") #f) ;; need to update the top test record if PASS or FAIL and this is a subtest ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status! - )) + ) + ) + + ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) (tests:summarize-items run-id test-id test-name #f)) (tests:summarize-test run-id test-id) ;; don't force - just update if no - ;; Leave a .final-status file for the top level test - (tests:save-final-status run-id test-id) - (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) + ;; Leave a .final-status file for the top level test + (tests:save-final-status run-id test-id) + (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) ;; end of let* + (mutex-unlock! m) (launch:end-of-run-check run-id ) (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") + + + (set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id))) + + ;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1. + + (if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list)) + (begin + (debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status) + (set! *globalexitstatus* 1) + ) + ) + (if (not (launch:einf-exit-status exit-info)) (exit 4)))) ))) ;; Spec for End of test @@ -768,30 +795,46 @@ (tal (cdr not-completed-tests))) (let* ((test-name (vector-ref running-test 2)) (item-path (vector-ref running-test 11))) (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") (if (not (null? tal)) - (loop (car tal) (cdr tal))))))))))) - -(define (launch:is-test-alive host pid) + (loop (car tal) (cdr tal))))))))))) + +;; replaced below with version that does not ssh if checking on localhost +#;(define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) (let* ((cmd (conc "ssh " host " pstree -A " pid)) (output (with-input-from-pipe cmd read-lines))) (debug:print 2 *default-log-port* "Running " cmd " received " output) (if (eq? (length output) 0) #f #t)) #t)) - + +(define (launch:is-test-alive host pid) + (let* ((same-host (equal? host (get-host-name))) + (cmd (conc + (if same-host "" (conc "ssh "host" ")) + "pstree -A "pid))) + (if (and host pid + (not (equal? host "n/a"))) + + (let* ((output (with-input-from-pipe cmd read-lines))) + (debug:print 2 *default-log-port* "Running " cmd " received " output) + (if (eq? (length output) 0) + #f + #t)) + #t))) ;; assuming bad query is about a live test is likely not the right thing to do? + (define (launch:kill-tests-if-dead run-id) (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) (let loop ((running-test (car running-tests)) (tal (cdr running-tests)) (kill-cnt 0)) (let* ((test-name (vector-ref running-test 2)) (item-path (vector-ref running-test 11)) - (test-id (vector-ref running-test 0)) + (test-id (vector-ref running-test 0)) (host (vector-ref running-test 6)) (pid (rmt:test-get-top-process-pid run-id test-id)) (event-time (vector-ref running-test 5)) (duration (vector-ref running-test 12)) (flag 0) @@ -1513,10 +1556,12 @@ (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) + + (setenv "MT_CMDINFO" cmdparms) ;; setting this for use in nblauncher ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (common:file-exists? work-area) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.6584) +(define megatest-version 1.6589) Index: mtut.scm ================================================================== --- mtut.scm +++ mtut.scm @@ -154,10 +154,11 @@ show [areas|contours... ] : show areas, contours or other section from megatest.config gendot : generate a graphviz dot file from pkts. Contour actions: process : runs import, rungen and dispatch + go : runs import, rungen and dispatch every five minutes forever Trigger propagation actions: tsend a=b,c=d... : send trigger info to all recpients in the [listeners] section tlisten -port N : listen for trigger info on port N @@ -773,11 +774,11 @@ (begin (print-call-chain) (print "FAILED TO RUN RUNNAME MAPPER " callname " FOR AREA " area) (print " message: " ((condition-property-accessor 'exn 'message) exn)) runname) - (print "(mapper " (string-intersperse (list runkey runname area area-path reason contour mode-patt) ", ") ")") + (print "(mapper " (string-intersperse (map conc (list runkey runname area area-path reason contour mode-patt)) ", ") ")") (mapper runkey runname area area-path reason contour mode-patt)) (case callname ((auto #f) runname) (else runtrans))))) (new-target target) ;; I believe we will want target manipulation here .. (map-targets xlatr-key runkey area contour)) @@ -1612,23 +1613,70 @@ ;; (hash-table-keys adjargs)) (let-values (((uuid pkt) (command-line->pkt *action* adjargs #f area-path: area-path new-ss: new-ss))) (print "run log @ " (conc (current-directory) "/" uuid "-" *action* ".log")) (write-pkt pktsdir uuid pkt)))) - ((dispatch import rungen process) + ((dispatch import rungen process go) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) - (toppath (configf:lookup mtconf "scratchdat" "toppath"))) + (toppath (configf:lookup mtconf "scratchdat" "toppath")) + (period (configf:lookup-number mtconf "mtutil" "autorun-period" default: 300)) + (rest-time (configf:lookup-number mtconf "mtutil" "autorun-rest" default: 30))) + (print "Using period="period" and rest time="rest-time) (case (string->symbol *action*) ((process) (begin (common:load-pkts-to-db mtconf) (generate-run-pkts mtconf toppath) (common:load-pkts-to-db mtconf) (dispatch-commands mtconf toppath))) ((import) (common:load-pkts-to-db mtconf)) ;; import pkts ((rungen) (generate-run-pkts mtconf toppath)) - ((dispatch) (dispatch-commands mtconf toppath))))) + ((dispatch) (dispatch-commands mtconf toppath)) + ;; [mtutil] + ;; # approximate interval between run processing in mtutil (seconds) + ;; autorun-period 300 + ;; # minimal rest period between processing + ;; autorun-rest 30 + ((go) + ;; determine if I'm the boss + (if (file-exists? "mtutil-go.pid") + (begin + (print "ERROR: mtutil go is already running under host and pid " (with-input-from-file "mtutil-go.pid" read-line) + ". Please kill that process and remove the file \"mutil-go.pid\" and try again.") + (exit))) + (with-output-to-file "mtutil-go.pid" (lambda ()(print (get-host-name) " " (current-process-id)))) + (print "Starting long running import, rungen, and process loop") + (if (file-exists? "do-not-run-mtutil-go") + (begin + (print "NOTE: Removing flag file "(current-directory)"/do-not-run-mtutil-go") + (delete-file* "do-not-run-mtutil-go"))) + (let loop ((last-run (- (current-seconds) (+ period 10))) ;; fake out first time in + (this-run (current-seconds))) + (if (file-exists? "do-not-run-mtutil-go") + (begin + (print "File do-not-run-mtutil-go exists, exiting.") + (delete-file* "mtutil-go.pid") + (exit))) + (let ((delta (- this-run last-run))) + (if (>= delta period) + (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) + (mtconf (car mtconfdat))) + (print "Running import at " (current-seconds)) + (common:load-pkts-to-db mtconf) + (print "Running generate run pkts at " (current-seconds)) + (generate-run-pkts mtconf toppath) + (print "Running run dispatch at " (current-seconds)) + (common:load-pkts-to-db mtconf) + (dispatch-commands mtconf toppath) + (print "Done running import, generate, and dispatch done in " (- (current-seconds) this-run)) + (print "NOTE: touch " (current-directory) "/do-not-run-mtutil-go to kill this runner.") + (loop this-run (current-seconds))) + (let ((now (current-seconds))) + (print "Sleeping " rest-time " seconds, next run in aproximately " (- period (- now last-run)) " seconds") + (thread-sleep! rest-time) + (loop last-run (current-seconds)))))) + (delete-file* "mtutil-go.pid"))))) ;; misc ((show) (if (> (length remargs) 0) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) @@ -1807,46 +1855,52 @@ ) ))) (loop (nn-recv rep)))) (print "ERROR: Port " portnum " already in use. Try another port"))))))) - - - - - ((tlisten) - (if (null? remargs) - (print "ERROR: useage for tlisten is \"mtutil tlisten portnum\"") - (let ((portnum (string->number (car remargs)))) - - (if (not portnum) - (print "ERROR: the portnumber parameter must be a number, you gave: " (car remargs)) - (begin - (if (not (is-port-in-use portnum)) - (let* ((rep (start-nn-server portnum)) - (mtconfdat (simple-setup (args:get-arg "-start-dir"))) - (mtconf (car mtconfdat)) - (contact (configf:lookup mtconf "listener" "owner")) - (script (configf:lookup mtconf "listener" "script"))) - (print "Listening on port " portnum " for messages.") - (set-signal-handler! signal/int (lambda (signum) - (set! *time-to-exit* #t) - (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting !!") - (let ((email-body (mtut:stml->string (s:body - (s:p (conc "Received signal " signum ". Lister has been terminated on host " (get-environment-variable "HOST") ". ")))))) - (sendmail contact "Listner has been terminated." email-body use_html: #t)) - (exit))) - (set-signal-handler! signal/term (lambda (signum) - (set! *time-to-exit* #t) - (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting !!") - (let ((email-body (mtut:stml->string (s:body - (s:p (conc "Received signal " signum ". Lister has been terminated on host " (get-environment-variable "HOST") ". ")))))) - (sendmail contact "Listner has been terminated." email-body use_html: #t)) - (exit))) - - ;(set-signal-handler! signal/term special-signal-handler) - + + ((tlisten) + (if (null? remargs) + (print "ERROR: useage for tlisten is \"mtutil tlisten portnum\"") + (let ((portnum (string->number (car remargs)))) + + (if (not portnum) + (print "ERROR: the portnumber parameter must be a number, you gave: " (car remargs)) + (begin + (if (not (is-port-in-use portnum)) + (let* ((rep (start-nn-server portnum)) + (mtconfdat (simple-setup (args:get-arg "-start-dir"))) + (mtconf (car mtconfdat)) + (contact (configf:lookup mtconf "listener" "owner")) + (script (configf:lookup mtconf "listener" "script"))) + (print "Listening on port " portnum " for messages.") + (set-signal-handler! signal/int + (lambda (signum) + (set! *time-to-exit* #t) + (debug:print-error 0 *default-log-port* "Received signal " signum + " sending email befor exiting !!") + (let ((email-body (mtut:stml->string + (s:body + (s:p (conc "Received signal " signum + ". Lister has been terminated on host " + (get-environment-variable "HOST") ". ")))))) + (sendmail contact "Listner has been terminated." email-body use_html: #t)) + (exit))) + (set-signal-handler! signal/term (lambda (signum) + (set! *time-to-exit* #t) + (debug:print-error 0 *default-log-port* "Received signal " + signum " sending email befor exiting !!") + (let ((email-body (mtut:stml->string + (s:body + (s:p (conc "Received signal " signum + ". Lister has been terminated on host " + (get-environment-variable "HOST") ". ")))))) + (sendmail contact "Listner has been terminated." email-body use_html: #t)) + (exit))) + + ;; (set-signal-handler! signal/term special-signal-handler) + (let loop ((instr (nn-recv rep))) (nn-send rep "ok") (let ((ctime (date->string (current-date)))) (if (equal? instr "time-to-die") (begin Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -501,10 +501,11 @@ ;; ;; test-names: Comma separated patterns same as test-patts but used in selection ;; of tests to run. The item portions are not respected. ;; FIXME: error out if /patt specified ;; +;; run-count is passed from megatest.scm as configf:lookup *configdat* "setup" "reruns", or defaults to 1. (define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names (let* ((keys (keys:config-get-fields *configdat*)) (keyvals (keys:target->keyval keys target)) (run-id (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour"))) ;; test-name))) ;; (deferred '()) ;; delay running these since they have a waiton clause @@ -624,11 +625,11 @@ ;; look up all tests matching the comma separated list of globs in ;; test-patts (using % as wildcard) ;; (set! test-names (delete-duplicates (tests:get-valid-tests *toppath* test-patts))) (debug:print-info 0 *default-log-port* "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " ")) - (debug:print-info 0 *default-log-port* "all tests: " (string-intersperse (sort all-test-names string<) " ")) + (debug:print-info 2 *default-log-port* "all tests: " (string-intersperse (sort all-test-names string<) " ")) (debug:print-info 0 *default-log-port* "test names: " (string-intersperse (sort test-names string<) " ")) (debug:print-info 0 *default-log-port* "required tests: " (string-intersperse (sort required-tests string<) " ")) ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if ;; -keepgoing is specified @@ -738,11 +739,11 @@ (waiton-itemized (and waiton-tconfig (or (hash-table-ref/default waiton-tconfig "items" #f) (hash-table-ref/default waiton-tconfig "itemstable" #f)))) (itemmaps (tests:get-itemmaps config)) ;; (configf:lookup config "requirements" "itemmap")) (new-test-patts (tests:extend-test-patts test-patts hed waiton itemmaps hed-itemized-waiton))) - (debug:print-info 0 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items") + (debug:print-info 2 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items") ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%" ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt ;; is this satisfied by merely appending "/" to the waiton name added to the list? ;; ;; This approach causes all of the items in an upstream test to be run @@ -760,11 +761,11 @@ (set! test-patts new-test-patts)) (begin (debug:print-info 0 *default-log-port* "Waitor(s) not yet on testpatt for " waiton ", setting up to re-process it") (set! tal (append (cons waiton tal)(list hed))))) (begin - (debug:print-info 0 *default-log-port* "Adding non-itemized test " waiton " to required-tests") + (debug:print-info 2 *default-log-port* "Adding non-itemized test " waiton " to required-tests") (set! required-tests (cons waiton required-tests)) (set! test-patts new-test-patts))) (begin (debug:print-info 0 *default-log-port* "No testconfig info yet for " waiton ", setting up to re-process it") (set! tal (append (cons waiton tal)(list hed))))) ;; (cons (conc waiton "/") required-tests)) @@ -897,10 +898,11 @@ ;; (tal (cdr sorted-test-names)) ;; (reg '()) ;; registered, put these at the head of tal ;; (reruns '())) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps) (let* ((loop-list (list hed tal reg reruns)) + (junk (debug:print-info 4 *default-log-port* "expand-items calling rmt:get-prereqs-not-met")) (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))) (if (list? res) res (begin (debug:print 0 *default-log-port* @@ -1090,11 +1092,11 @@ ((null? runnables) (debug:print-info 4 *default-log-port* "cond branch - " "ei-7") #f) ;; if we get here and non-completed is null then it is all over. (else (debug:print-info 4 *default-log-port* "cond branch - " "ei-8") - (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now") + (debug:print 2 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now") (list (car newtal)(cdr newtal) reg reruns))))) (define (runs:mixed-list-testname-and-testrec->list-of-strings inlst) (if (null? inlst) '() @@ -1667,11 +1669,10 @@ (loop (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns)))) ;; (loop (car tal)(cdr tal) reg reruns)))) - (runs:incremental-print-results run-id) (debug:print 4 *default-log-port* "TOP OF LOOP => " "test-name: " test-name "\n hed: " hed "\n tal: " (runs:pretty-long-list tal) @@ -1851,16 +1852,18 @@ (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed) ) ) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) - + + ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-5") (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this") (exit 1)) + ((not (null? reruns)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-6") (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED, (junked (lset-difference equal? tal newlst))) (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal) @@ -1869,10 +1872,11 @@ (set! num-retries (+ num-retries 1)) ;; (thread-sleep! (+ 1 *global-delta*)) (if (not (null? newlst)) ;; since reruns have been tacked on to newlst create new reruns from junked (loop (car newlst)(cdr newlst) reg (delete-duplicates junked))))) + ((not (null? tal)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-7") (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here.")) ((not (null? reg)) ;; could we get here with leftovers? (debug:print-info 4 *default-log-port* "cond branch - " "rtq-8") @@ -2108,11 +2112,11 @@ (else (set! runflag #f))) (debug:print 4 *default-log-port* "RUNNING => runflag: " runflag " STATE: " (test:get-state testdat) " STATUS: " (test:get-status testdat)) (if (not runflag) (if (not parent-test) (if (runs:lownoise (conc "not starting test" full-test-name) 60) - (debug:print 1 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) + (debug:print 3 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) "\" and status \"" (test:get-status testdat) "\", use -rerun \"" (test:get-status testdat) "\" or -force to override"))) ;; NOTE: No longer be checking prerequisites here! Will never get here unless prereqs are ;; already met. ;; This would be a great place to do the process-fork @@ -2393,11 +2397,11 @@ (tasks:kill-runner target run-name testpatt) ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) - (debug:print 1 *default-log-port* "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) + (debug:print 2 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) ((run-wait) (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete")) @@ -2534,11 +2538,11 @@ ) ; end let ); end cond has-subrun (else ;; BB - TODO - consider backgrounding to threads to delete tests (work below) - (debug:print-info 0 *default-log-port* "test: " test-name " itest-state: " test-state) + (debug:print-info 2 *default-log-port* "test: " test-name " itest-state: " test-state) (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) (begin (if (not (hash-table-ref/default test-retry-time test-fulln #f)) (begin ;; want to set to REMOVING BUT CANNOT do it here? @@ -2729,11 +2733,11 @@ ) (case clean-mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f)) ((remove-all) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f))) - (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) + (debug:print-info 2 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) (if (and real-dir (> (string-length real-dir) 5) (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. (let* ((realpath (resolve-pathname run-dir))) (debug:print-info 1 *default-log-port* "Recursively removing " realpath) @@ -3057,17 +3061,17 @@ (files (if (common:file-exists? runtop) (append (glob (conc runtop "/.megatest*")) (glob (conc runtop "/.runconfig*"))) '()))) (if (null? files) - (debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.") + (debug:print-info 2 *default-log-port* "No cached megatest or runconfigs files found. None removed.") (begin - (debug:print-info 0 *default-log-port* "Removing cached files:\n " (string-intersperse files "\n ")) + (debug:print-info 2 *default-log-port* "Removing cached files:\n " (string-intersperse files "\n ")) (for-each (lambda (f) (handle-exceptions exn (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn) (delete-file f))) files)))) (debug:print-error 0 *default-log-port* "-clean-cache requires -runname.")) (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg"))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -111,12 +111,12 @@ ((fs) result) (else (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) result))) -;; Given a run id start a server process ### NOTE ### > file 2>&1 -;; if the run-id is zero and the target-host is set +;; Given an area path, start a server process ### NOTE ### > file 2>&1 +;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area (let* ((curr-host (get-host-name)) @@ -154,21 +154,22 @@ (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) - (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time - ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever - #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) + (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time + (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) -;; given a path to a server log return: host port startseconds -;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let +;; given a path to a server log return: host port startseconds server-id +;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let +;; example of what it's looking for in the log file: +;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 (define (server:logf-get-start-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0)) @@ -201,18 +202,18 @@ (string->number (caddr dat)) (cadr (cddr dat)))))) (begin (if dbprep-found (begin - (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds)) - (thread-sleep! 25) + (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) + (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting? ) - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds)) + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))) ) (list #f #f #f #f))))))))) -;; get a list of servers with all relevant data +;; get a list of servers from the log files, with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) (day-seconds (* 24 60 60))) @@ -228,17 +229,19 @@ (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) (directory-exists? (conc areapath "/logs"))) '())) - ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited. - (let* ((server-logs-cmd (conc "grep -iL exiting " areapath "/logs/server-*-*.log")) - (server-logs (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all)))) + ;; Get the list of server logs. First remove logs for servers that have exited. + (let* ( + ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. + ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) + (server-logs (glob (conc areapath "/logs/server-*-*.log"))) (num-serv-logs (length server-logs))) (if (or (null? server-logs) (= num-serv-logs 0)) (let () - (debug:print 1 *default-log-port* "There are no servers running") + (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) '() ) (let loop ((hed (string-chomp (car server-logs))) (tal (cdr server-logs)) (res '())) @@ -365,66 +368,70 @@ (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) -;; wait for server=start-last to be three seconds old + +;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. +;; if it is old enough, overwrite it and wait 0.25 seconds. +;; if it then has the wrong server key, wait + 1 and call this function recursively. ;; (define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) - (reftime (configf:lookup-number *configdat* "server" "idletime" default: 4)) + (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) - (all-go (> delta reftime))) - (if (and all-go + (old-enough (> delta idletime)) + (new-server-key "") + ) + + ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. + ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. + (if (and old-enough (begin - (debug:print-info 0 *default-log-port* "Writing " start-flag) - (with-output-to-file start-flag - (lambda () - (print server-key))) + (debug:print-info 2 *default-log-port* "Writing " start-flag) + (with-output-to-file start-flag (lambda () (print server-key))) (thread-sleep! 0.25) - (let ((res (with-input-from-file start-flag - (lambda () - (read-line))))) - (equal? server-key res)))) - #t ;; (system (conc "touch " start-flag)) ;; lazy but safe + (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) + (equal? server-key new-server-key)) + ) + #t + + ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. (begin (debug:print-info 0 *default-log-port* "Gating server start, last start: " - fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) - (thread-sleep! reftime) + (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) + + (thread-sleep! ( + 1 idletime)) (server:wait-for-server-start-last-flag areapath))))))) -;; kind start up of servers, wait 40 seconds before allowing another server for a given -;; run-id to be launched + + +;; kind start up of server, wait before allowing another server for a given +;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last - ;; and wait for it to be at least 3 seconds old + ;; and wait for it to be at least seconds old (server:wait-for-server-start-last-flag areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? - (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun - (call-num (car last-run-dat)) - (when-run (cadr last-run-dat)) - (run-delay (+ (case call-num - ((0) 0) - ((1) 20) - ((2) 300) - (else 600)) - (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously + (let* ( (lock-file (conc areapath "/logs/server-start.lock"))) - (if (> (- (current-seconds) when-run) run-delay) - (let* ((start-flag (conc areapath "/logs/server-start-last"))) - (common:simple-file-lock-and-wait lock-file expire-time: 15) - (debug:print-info 0 *default-log-port* "server:kind-run: touching " start-flag) - (system (conc "touch " start-flag)) ;; lazy but safe - (server:run areapath) - (thread-sleep! 2) ;; don't release the lock for at least a few seconds - (common:simple-file-release-lock lock-file))) - (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) + (let* ((start-flag (conc areapath "/logs/server-start-last"))) + (common:simple-file-lock-and-wait lock-file expire-time: 25) + (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) + (system (conc "touch " start-flag)) ;; lazy but safe + (server:run areapath) + (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". + (common:simple-file-release-lock lock-file))) + + (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.") + ) +) ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) @@ -449,17 +456,17 @@ (or ns numservers))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) - (let* ((ns (server:get-num-servers)) + (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed (servers (server:get-best (server:get-list areapath)))) (if (or (and servers (null? servers)) (not servers) (and (list? servers) - (< (length servers) (random ns)))) ;; somewhere between 0 and numservers + (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res Index: subrun.scm ================================================================== --- subrun.scm +++ subrun.scm @@ -128,17 +128,17 @@ (kill-result (subrun:exec-sub-megatest test-run-dir action-switches-str "kill"))) kill-result) #t)) -(define (subrun:launch-cmd test-run-dir #!optional (sub-cmd "-run")) ;; BUG: "-run" should be changed to "-rerun-clean" but current doesn't work +(define (subrun:launch-cmd test-run-dir run-mode #!optional (sub-cmd "-run")) ;; BUG: "-run" should be changed to "-rerun-clean" but current doesn't work (if (subrun:subrun-removed? test-run-dir) (subrun:unset-subrun-removed test-run-dir)) (let* ((log-prefix "run") (switches (subrun:selector+log-switches test-run-dir log-prefix)) - (run-wait #t) + (run-wait (equal? run-mode "yes")) (cmd (conc "megatest " sub-cmd " " switches" " (if run-wait "-run-wait " "")))) cmd)) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -53,10 +53,11 @@ ;; gets paths from configs and finds valid tests ;; returns hash of testname --> fullpath ;; (define (tests:get-all) (let* ((test-search-path (tests:get-tests-search-path *configdat*))) + (debug:print 8 *default-log-port* "test-search-path: " test-search-path) (tests:get-valid-tests (make-hash-table) test-search-path))) (define (tests:get-tests-search-path cfgdat) (let ((paths (let ((section (if cfgdat (configf:get-section cfgdat "tests-paths") @@ -909,11 +910,11 @@ ; (set! page (+ 1 page)) (if (> total-runs (* (+ 1 page) pg-size)) (loop (+ 1 page))))) (common:simple-file-release-lock lockfile)) (begin - (debug-print 0 *default-log-port* "Failed to get lock on file outf, lockfile: " lockfile) #f)))) + (debug:print 0 *default-log-port* "Failed to get lock on file outf, lockfile: " lockfile) #f)))) (define (tests:readlines filename) (call-with-input-file filename (lambda (p) @@ -1607,11 +1608,11 @@ (and wait-a-minute (> tries-left 0)) (thread-sleep! 10) (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf" will retry in 10 seconds. Tries left: "tries-left) ;; BB: this fires (loopa (sub1 tries-left))) (else - (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf) ;; BB: this fires + (debug:print 2 *default-log-port* "WARNING: testconfig file does not exist: "test-configf) ;; BB: this fires #f)))) (tcfg (if testexists (read-config test-configf #f system-allowed environ-patt: (if system-allowed "pre-launch-env-vars" @@ -1842,30 +1843,43 @@ ;; refactoring this block into tests:get-full-data from line 263 of runs.scm ;;====================================================================== ;; hed is the test name ;; test-records is a hash of test-name => test record (define (tests:get-full-data test-names test-records required-tests all-tests-registry) - (if (not (null? test-names)) + (let ((missing-waitons (make-hash-table))) + (if (not (null? test-names)) (let loop ((hed (car test-names)) (tal (cdr test-names))) ;; 'return-procs tells the config reader to prep running system but return a proc (debug:print-info 4 *default-log-port* "hed=" hed " at top of loop") ;; don't know item-path at this time, let the testconfig get the top level testconfig (let* ((config (tests:get-testconfig hed #f all-tests-registry 'return-procs)) (waitons (let ((instr (if config (configf:lookup config "requirements" "waiton") - (begin ;; No config means this is a non-existant test - (debug:print-error 0 *default-log-port* "non-existent required test \"" hed "\", grep through your testconfigs to find and remove or create the test. Discarding and continuing.") - "")))) + (begin ;; No config means this is a non-existent test + (let ((waiters '())) + ;; find the waiter(s) for this waiton. + (for-each + (lambda(waiter) + ;; (print "test-record = " (hash-table-ref test-records waiter)) + ;; (print "waitons = " (vector-ref (hash-table-ref test-records waiter) 2)) + (if (member hed (vector-ref (hash-table-ref test-records waiter) 2)) + (set! waiters (cons waiter waiters)) + ) + ) + (hash-table-keys test-records)) + (hash-table-set! missing-waitons hed waiters) + ) + "")))) (debug:print-info 8 *default-log-port* "waitons string is " instr) (string-split (cond ((procedure? instr) (let ((res (instr))) (debug:print-info 8 *default-log-port* "waiton procedure results in string " res " for test " hed) res)) ((string? instr) instr) (else - ;; NOTE: This is actually the case of *no* waitons! ;; (debug:print-error 0 *default-log-port* "something went wrong in processing waitons for test " hed) + ;; NOTE: This is actually the case of *no* waitons! ;; "")))))) (if (not config) ;; this is a non-existant test called in a waiton. (if (null? tal) test-records (loop (car tal)(cdr tal))) @@ -1910,21 +1924,28 @@ (items:get-items-from-config config)) (else #f))) ;; not iterated #f ;; itemsdat 5 #f ;; spare - used for item-path ))) - (for-each + (for-each (lambda (waiton) - (if (and waiton (not (member waiton test-names))) + (if (and waiton (not (string= "#f" waiton)) (not (member waiton test-names))) (begin (set! required-tests (cons waiton required-tests)) (set! test-names (cons waiton test-names))))) ;; was an append, now a cons waitons) (let ((remtests (delete-duplicates (append waitons tal)))) (if (not (null? remtests)) (loop (car remtests)(cdr remtests)) - test-records)))))))) + test-records))))))) + (for-each + (lambda (missing-waiton) + (debug:print-error 0 *default-log-port* "non-existent test \"" missing-waiton "\" is a waiton for tests " (hash-table-ref missing-waitons missing-waiton)) + ) + (hash-table-keys missing-waitons) + ) +)) ;;====================================================================== ;; test steps ;;====================================================================== Index: tests/mintest/megatest.config ================================================================== --- tests/mintest/megatest.config +++ tests/mintest/megatest.config @@ -19,16 +19,15 @@ X TEXT [setup] max_concurrent_jobs 50 linktree #{getenv MT_RUN_AREA_HOME}/linktree -transport http [server] port 8090 [jobtools] useshell yes -launcher nbfind +launcher nbfake [disks] disk0 #{getenv PWD}/runs Index: tests/mintest/tests/a/testconfig ================================================================== --- tests/mintest/tests/a/testconfig +++ tests/mintest/tests/a/testconfig @@ -16,8 +16,15 @@ # along with Megatest. If not, see . # Add steps here. Format is "stepname script" [ezsteps] step1 echo SUCCESS +# step1 ((xterm&);echo SUCCESS) +# step2 xterm_and_success + +[scripts] +xterm_and_success #!/bin/bash + xterm -T step2 & + echo SUCCESS [requirements] waiton b Index: utils/mk_wrapper ================================================================== --- utils/mk_wrapper +++ utils/mk_wrapper @@ -44,10 +44,19 @@ else export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$libdir fi export MT_SQLITE3_EXE=$sqlite3_exe + +http_vars="http_proxy https_proxy HTTP_PROXY HTTPS_PROXY" +for i in \$http_vars +do +j=\${!i} +if [ "\$j" != "" ]; then + unset \$i +fi +done __EOF ) > $cfgfile echo else echo "INFO: LD_LIBRARY_PATH not set" >&2 @@ -93,11 +102,9 @@ # echo "#!/bin/bash" > $target # echo "exec $prefix/bin/.\$(lsb_release -sr)/$cmd \"\$@\"" >> $target echo "lsbr=\$(lsb_release -sr)" >> $target -if [ "$LD_LIBRARY_PATH" != "" ];then - echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target -fi +echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target # echo "if [[ -e \$lsbr ]];then source \$lsbr;fi" >> $target echo "exec $prefix/bin/.\$lsbr/$cmd \"\$@\"" >> $target
Table 6. API Keys Related CallsTable 7. API Keys Related Calls