Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -817,14 +817,14 @@ (define *common:ended-states* ;; states which indicate the test is stopped and will not proceed '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" )) (define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked - '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD")) + '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD" "CHECK")) (define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed - '("PASS" "WARN" "CHECK" "WAIVED" "SKIP")) + '("PASS" "WARN" "WAIVED" "SKIP")) ;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items (define *common:running-states* ;; test is either running or can be run '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED")) @@ -2208,37 +2208,31 @@ ;; (define (common:wait-for-cpuload maxnormload numcpus-in #!key (count 1000) (msg #f)(remote-host #f)(num-tries 5)) (let* ((loadavg (common:get-cpu-load remote-host)) - ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (numcpus (if (<= 1 numcpus-in) - (common:get-num-cpus remote-host) - numcpus-in)) - (first (car loadavg)) - (next (cadr loadavg)) - (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug - ;; where numcpus - ;; (or could be - ;; maxload) is - ;; zero, crude - ;; fallback is to - ;; at least use 1 - ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit - ;; etc. - (effective-load (common:get-intercept first next)) - (recommended-delay (common:get-delay effective-load numcpus)) - (effective-host (or remote-host "localhost")) - (normalized-effective-load (/ effective-load numcpus)) - (will-wait (> normalized-effective-load maxnormload))) - (if (> recommended-delay 1) - (let* ((actual-delay (min recommended-delay 30))) - (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) - (debug:print-info 0 *default-log-port* "Load control, delaying " + ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (numcpus (if (<= 1 numcpus-in) + (common:get-num-cpus remote-host) numcpus-in)) + (first (car loadavg)) + (next (cadr loadavg)) + (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude + ;; fallback is to at least use 1 + ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit + ;; etc. + (effective-load (common:get-intercept first next)) + (recommended-delay (common:get-delay effective-load numcpus)) + (effective-host (or remote-host "localhost")) + (normalized-effective-load (/ effective-load numcpus)) + (will-wait (> normalized-effective-load maxnormload))) + (if (and will-wait (> recommended-delay 1)) + (let* ((actual-delay (min recommended-delay 30))) + (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) + (debug:print-info 0 *default-log-port* "Load control, delaying " actual-delay " seconds to maintain safe load. current normalized effective load is " normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load)) - (thread-sleep! actual-delay))) + (thread-sleep! actual-delay))) (cond ;; bad data, try again to get the data ((not will-wait) (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -52,10 +52,14 @@ (let ((newalist (filter (lambda (x)(not (equal? key (car x)))) alist))) (append newalist (list (if metadata (list key val metadata) (list key val)))))) +;; this is used in megatestqa/ext.scm. +;; remove it from here and there by 12/31/21 +;; (define config:assoc-safe-add configf:assoc-safe-add) + (define (configf:section-var-set! cfgdat section-name var value #!key (metadata #f)) (hash-table-set! cfgdat section-name (configf:assoc-safe-add (hash-table-ref/default cfgdat section-name '()) var value metadata: metadata))) @@ -144,11 +148,11 @@ (set! result (conc "#{(" cmdtype ") " cmd "}")))) (case cmdsym ((system shell scheme) (let ((delta (- (current-seconds) start-time))) (if (> delta 2) - (debug:print-info 0 *default-log-port* "for line \"" l "\"\n command: " cmd " took " delta " seconds to run with output:\n " result) + (debug:print-info 2 *default-log-port* "for line \"" l "\"\n command: " cmd " took " delta " seconds to run with output:\n " result) (debug:print-info 9 *default-log-port* "for line \"" l "\"\n command: " cmd " took " delta " seconds to run with output:\n " result))))) (loop (conc prestr result poststr))) res)) res))) @@ -531,12 +535,12 @@ (define configf:read-file read-config) ;; safely look up a value that is expected to be a number, return ;; a default (#f unless provided) ;; -(define (configf:lookup-number cfdat section varname #!key (default #f)) - (let* ((val (configf:lookup *configdat* section varname)) +(define (configf:lookup-number cfgdat section varname #!key (default #f)) + (let* ((val (configf:lookup cfgdat section varname)) (res (if val (string->number (string-substitute "\\s+" "" val #t)) #f))) (cond (res res) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -4659,11 +4659,11 @@ ;; the new prereqs calculation, looks also at itempath if specified ;; all prereqs must be met -;; if prereq test with itempath='' is COMPLETED and PASS, WARN, CHECK, or WAIVED then prereq is met +;; if prereq test with itempath='' is in common:well-ended-states, then prereq is met ;; if prereq test with itempath=ref-item-path and COMPLETED with PASS, WARN, CHECK, or WAIVED then prereq is met ;; ;; Note: mode 'normal means that tests must be COMPLETED and ok (i.e. PASS, WARN, CHECK, SKIP or WAIVED) ;; mode 'toplevel means that tests must be COMPLETED only ;; mode 'itemmatch or 'itemwait means that tests items must be COMPLETED and (PASS|WARN|WAIVED|CHECK) [[ NB// NOT IMPLEMENTED YET ]] @@ -4674,10 +4674,11 @@ ;; 2. any NOT completed and good? if yes => return those as prereqs not met, if no => return null list ;; ;; (define (db:get-prereqs-not-met dbstruct run-id waitons ref-item-path mode) (define (db:get-prereqs-not-met dbstruct run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmap #f)) ;; BBnote - rollup of an itemized test's overall state/status done in db:set-state-status-and-roll-up-items + (debug:print 4 *default-log-port* "db:get-prereqs-not-met: " waitons) (append (if (member 'exclusive mode) (let ((running-tests (db:get-tests-for-run dbstruct #f ;; run-id of #f means for all runs. (if (string=? ref-item-path "") ;; testpatt @@ -4700,10 +4701,12 @@ ;; (conc (db:test-get-testname testdat) ;; "/" ;; (db:test-get-item-path testdat)))) running-tests) ;; calling functions want the entire data '()) + + ;; collection of: for each waiton - ;; if this ref-test-name is an item in an itemized test and mode is itemwait/itemmatch: ;; if waiton is not itemized - if waiton is not both completed and in ok status, add as unmet prerequisite ;; if waiton is itemized: Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -476,21 +476,23 @@ (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) (if (process:alive-on-host? test-host test-pid) (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") - (exit))) + (exit 1))) ((member (db:test-get-state test-info) '("COMPLETED")) ;; we do NOT want to re-run COMPLETED jobs. Mark as NOT_STARTED to run! - (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") - (exit)) + (debug:print 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") + (debug:print 0 *default-log-port* "exiting with status 1") + (exit 1)) ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) - (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") - (exit)))) + (debug:print 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") + (debug:print 0 *default-log-port* "exiting with status 1") + (exit 1)))) ;; cleanup prior execution's steps (rmt:delete-steps-for-test! run-id test-id) (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name)) @@ -643,11 +645,12 @@ (print content) (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))))) (else (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\"")))) scripts)) - ;; + ;; + (let* ((m (make-mutex)) (kill-job? #f) (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status (job-thread #f) ;; (keep-going #t) @@ -657,29 +660,33 @@ (runit (lambda () (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m))) (monitorjob (lambda () (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags))) (th1 (make-thread monitorjob "monitor job")) - (th2 (make-thread runit "run job"))) + (th2 (make-thread runit "run job")) + (tconfig (tests:get-testconfig test-name item-path tconfigreg #t)) + (propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code")) + (propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED")) + (test-status "not set") + ) (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) - (debug:print-info 0 *default-log-port* "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") + (debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...") + (debug:print-info 2 *default-log-port* "exit-info = " exit-info) (hash-table-set! misc-flags 'keep-going #f) (thread-join! th1) (thread-sleep! 1) ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) ;; only state and status needed - use lazy routine (testinfo (rmt:get-testinfo-state-status run-id test-id))) ;; Am I completed? - (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) - (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status - ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test - ) - (new-status (cond + (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) + (let ((new-state (if kill-job? "KILLED" "COMPLETED")) + (new-status (cond ((not (launch:einf-exit-status exit-info)) "FAIL") ;; job failed to run ... (vector-ref exit-info 1) ((eq? (launch:einf-rollup-status exit-info) 0) ;; (vector-ref exit-info 3) ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO) (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS")) ((eq? (launch:einf-rollup-status exit-info) 1) "FAIL") ;; (vector-ref exit-info 3) @@ -688,35 +695,53 @@ (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN")) ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK") ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED") ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT") ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP") - (else "FAIL")))) ;; (db:test-get-status testinfo))) - (debug:print-info 1 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) + (else "FAIL"))) + ) ;; (db:test-get-status testinfo))) + (debug:print-info 0 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) - ;; Leave a .final-status file for each sub-test - (tests:save-final-status run-id test-id) + ;; Leave a .final-status file for each sub-test + (tests:save-final-status run-id test-id) (tests:test-set-status! run-id test-id new-state new-status (args:get-arg "-m") #f) ;; need to update the top test record if PASS or FAIL and this is a subtest ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status! - )) + ) + ) + + ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) (tests:summarize-items run-id test-id test-name #f)) (tests:summarize-test run-id test-id) ;; don't force - just update if no - ;; Leave a .final-status file for the top level test - (tests:save-final-status run-id test-id) - (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) + ;; Leave a .final-status file for the top level test + (tests:save-final-status run-id test-id) + (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) ;; end of let* + (mutex-unlock! m) (launch:end-of-run-check run-id ) (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") + + + (set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id))) + + ;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1. + + (if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list)) + (begin + (debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status) + (set! *globalexitstatus* 1) + ) + ) + (if (not (launch:einf-exit-status exit-info)) (exit 4)))) ))) ;; Spec for End of test Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.6584) +(define megatest-version 1.6587) Index: mtut.scm ================================================================== --- mtut.scm +++ mtut.scm @@ -154,10 +154,11 @@ show [areas|contours... ] : show areas, contours or other section from megatest.config gendot : generate a graphviz dot file from pkts. Contour actions: process : runs import, rungen and dispatch + go : runs import, rungen and dispatch every five minutes forever Trigger propagation actions: tsend a=b,c=d... : send trigger info to all recpients in the [listeners] section tlisten -port N : listen for trigger info on port N @@ -773,11 +774,11 @@ (begin (print-call-chain) (print "FAILED TO RUN RUNNAME MAPPER " callname " FOR AREA " area) (print " message: " ((condition-property-accessor 'exn 'message) exn)) runname) - (print "(mapper " (string-intersperse (list runkey runname area area-path reason contour mode-patt) ", ") ")") + (print "(mapper " (string-intersperse (map conc (list runkey runname area area-path reason contour mode-patt)) ", ") ")") (mapper runkey runname area area-path reason contour mode-patt)) (case callname ((auto #f) runname) (else runtrans))))) (new-target target) ;; I believe we will want target manipulation here .. (map-targets xlatr-key runkey area contour)) @@ -1612,23 +1613,70 @@ ;; (hash-table-keys adjargs)) (let-values (((uuid pkt) (command-line->pkt *action* adjargs #f area-path: area-path new-ss: new-ss))) (print "run log @ " (conc (current-directory) "/" uuid "-" *action* ".log")) (write-pkt pktsdir uuid pkt)))) - ((dispatch import rungen process) + ((dispatch import rungen process go) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) - (toppath (configf:lookup mtconf "scratchdat" "toppath"))) + (toppath (configf:lookup mtconf "scratchdat" "toppath")) + (period (configf:lookup-number mtconf "mtutil" "autorun-period" default: 300)) + (rest-time (configf:lookup-number mtconf "mtutil" "autorun-rest" default: 30))) + (print "Using period="period" and rest time="rest-time) (case (string->symbol *action*) ((process) (begin (common:load-pkts-to-db mtconf) (generate-run-pkts mtconf toppath) (common:load-pkts-to-db mtconf) (dispatch-commands mtconf toppath))) ((import) (common:load-pkts-to-db mtconf)) ;; import pkts ((rungen) (generate-run-pkts mtconf toppath)) - ((dispatch) (dispatch-commands mtconf toppath))))) + ((dispatch) (dispatch-commands mtconf toppath)) + ;; [mtutil] + ;; # approximate interval between run processing in mtutil (seconds) + ;; autorun-period 300 + ;; # minimal rest period between processing + ;; autorun-rest 30 + ((go) + ;; determine if I'm the boss + (if (file-exists? "mtutil-go.pid") + (begin + (print "ERROR: mtutil go is already running under host and pid " (with-input-from-file "mtutil-go.pid" read-line) + ". Please kill that process and remove the file \"mutil-go.pid\" and try again.") + (exit))) + (with-output-to-file "mtutil-go.pid" (lambda ()(print (get-host-name) " " (current-process-id)))) + (print "Starting long running import, rungen, and process loop") + (if (file-exists? "do-not-run-mtutil-go") + (begin + (print "NOTE: Removing flag file "(current-directory)"/do-not-run-mtutil-go") + (delete-file* "do-not-run-mtutil-go"))) + (let loop ((last-run (- (current-seconds) (+ period 10))) ;; fake out first time in + (this-run (current-seconds))) + (if (file-exists? "do-not-run-mtutil-go") + (begin + (print "File do-not-run-mtutil-go exists, exiting.") + (delete-file* "mtutil-go.pid") + (exit))) + (let ((delta (- this-run last-run))) + (if (>= delta period) + (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) + (mtconf (car mtconfdat))) + (print "Running import at " (current-seconds)) + (common:load-pkts-to-db mtconf) + (print "Running generate run pkts at " (current-seconds)) + (generate-run-pkts mtconf toppath) + (print "Running run dispatch at " (current-seconds)) + (common:load-pkts-to-db mtconf) + (dispatch-commands mtconf toppath) + (print "Done running import, generate, and dispatch done in " (- (current-seconds) this-run)) + (print "NOTE: touch " (current-directory) "/do-not-run-mtutil-go to kill this runner.") + (loop this-run (current-seconds))) + (let ((now (current-seconds))) + (print "Sleeping " rest-time " seconds, next run in aproximately " (- period (- now last-run)) " seconds") + (thread-sleep! rest-time) + (loop last-run (current-seconds)))))) + (delete-file* "mtutil-go.pid"))))) ;; misc ((show) (if (> (length remargs) 0) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) @@ -1807,46 +1855,52 @@ ) ))) (loop (nn-recv rep)))) (print "ERROR: Port " portnum " already in use. Try another port"))))))) - - - - - ((tlisten) - (if (null? remargs) - (print "ERROR: useage for tlisten is \"mtutil tlisten portnum\"") - (let ((portnum (string->number (car remargs)))) - - (if (not portnum) - (print "ERROR: the portnumber parameter must be a number, you gave: " (car remargs)) - (begin - (if (not (is-port-in-use portnum)) - (let* ((rep (start-nn-server portnum)) - (mtconfdat (simple-setup (args:get-arg "-start-dir"))) - (mtconf (car mtconfdat)) - (contact (configf:lookup mtconf "listener" "owner")) - (script (configf:lookup mtconf "listener" "script"))) - (print "Listening on port " portnum " for messages.") - (set-signal-handler! signal/int (lambda (signum) - (set! *time-to-exit* #t) - (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting !!") - (let ((email-body (mtut:stml->string (s:body - (s:p (conc "Received signal " signum ". Lister has been terminated on host " (get-environment-variable "HOST") ". ")))))) - (sendmail contact "Listner has been terminated." email-body use_html: #t)) - (exit))) - (set-signal-handler! signal/term (lambda (signum) - (set! *time-to-exit* #t) - (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting !!") - (let ((email-body (mtut:stml->string (s:body - (s:p (conc "Received signal " signum ". Lister has been terminated on host " (get-environment-variable "HOST") ". ")))))) - (sendmail contact "Listner has been terminated." email-body use_html: #t)) - (exit))) - - ;(set-signal-handler! signal/term special-signal-handler) - + + ((tlisten) + (if (null? remargs) + (print "ERROR: useage for tlisten is \"mtutil tlisten portnum\"") + (let ((portnum (string->number (car remargs)))) + + (if (not portnum) + (print "ERROR: the portnumber parameter must be a number, you gave: " (car remargs)) + (begin + (if (not (is-port-in-use portnum)) + (let* ((rep (start-nn-server portnum)) + (mtconfdat (simple-setup (args:get-arg "-start-dir"))) + (mtconf (car mtconfdat)) + (contact (configf:lookup mtconf "listener" "owner")) + (script (configf:lookup mtconf "listener" "script"))) + (print "Listening on port " portnum " for messages.") + (set-signal-handler! signal/int + (lambda (signum) + (set! *time-to-exit* #t) + (debug:print-error 0 *default-log-port* "Received signal " signum + " sending email befor exiting !!") + (let ((email-body (mtut:stml->string + (s:body + (s:p (conc "Received signal " signum + ". Lister has been terminated on host " + (get-environment-variable "HOST") ". ")))))) + (sendmail contact "Listner has been terminated." email-body use_html: #t)) + (exit))) + (set-signal-handler! signal/term (lambda (signum) + (set! *time-to-exit* #t) + (debug:print-error 0 *default-log-port* "Received signal " + signum " sending email befor exiting !!") + (let ((email-body (mtut:stml->string + (s:body + (s:p (conc "Received signal " signum + ". Lister has been terminated on host " + (get-environment-variable "HOST") ". ")))))) + (sendmail contact "Listner has been terminated." email-body use_html: #t)) + (exit))) + + ;; (set-signal-handler! signal/term special-signal-handler) + (let loop ((instr (nn-recv rep))) (nn-send rep "ok") (let ((ctime (date->string (current-date)))) (if (equal? instr "time-to-die") (begin Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -501,10 +501,11 @@ ;; ;; test-names: Comma separated patterns same as test-patts but used in selection ;; of tests to run. The item portions are not respected. ;; FIXME: error out if /patt specified ;; +;; run-count is passed from megatest.scm as configf:lookup *configdat* "setup" "reruns", or defaults to 1. (define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names (let* ((keys (keys:config-get-fields *configdat*)) (keyvals (keys:target->keyval keys target)) (run-id (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour"))) ;; test-name))) ;; (deferred '()) ;; delay running these since they have a waiton clause @@ -624,11 +625,11 @@ ;; look up all tests matching the comma separated list of globs in ;; test-patts (using % as wildcard) ;; (set! test-names (delete-duplicates (tests:get-valid-tests *toppath* test-patts))) (debug:print-info 0 *default-log-port* "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " ")) - (debug:print-info 0 *default-log-port* "all tests: " (string-intersperse (sort all-test-names string<) " ")) + (debug:print-info 2 *default-log-port* "all tests: " (string-intersperse (sort all-test-names string<) " ")) (debug:print-info 0 *default-log-port* "test names: " (string-intersperse (sort test-names string<) " ")) (debug:print-info 0 *default-log-port* "required tests: " (string-intersperse (sort required-tests string<) " ")) ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if ;; -keepgoing is specified @@ -738,11 +739,11 @@ (waiton-itemized (and waiton-tconfig (or (hash-table-ref/default waiton-tconfig "items" #f) (hash-table-ref/default waiton-tconfig "itemstable" #f)))) (itemmaps (tests:get-itemmaps config)) ;; (configf:lookup config "requirements" "itemmap")) (new-test-patts (tests:extend-test-patts test-patts hed waiton itemmaps hed-itemized-waiton))) - (debug:print-info 0 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items") + (debug:print-info 2 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items") ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%" ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt ;; is this satisfied by merely appending "/" to the waiton name added to the list? ;; ;; This approach causes all of the items in an upstream test to be run @@ -760,11 +761,11 @@ (set! test-patts new-test-patts)) (begin (debug:print-info 0 *default-log-port* "Waitor(s) not yet on testpatt for " waiton ", setting up to re-process it") (set! tal (append (cons waiton tal)(list hed))))) (begin - (debug:print-info 0 *default-log-port* "Adding non-itemized test " waiton " to required-tests") + (debug:print-info 2 *default-log-port* "Adding non-itemized test " waiton " to required-tests") (set! required-tests (cons waiton required-tests)) (set! test-patts new-test-patts))) (begin (debug:print-info 0 *default-log-port* "No testconfig info yet for " waiton ", setting up to re-process it") (set! tal (append (cons waiton tal)(list hed))))) ;; (cons (conc waiton "/") required-tests)) @@ -897,10 +898,11 @@ ;; (tal (cdr sorted-test-names)) ;; (reg '()) ;; registered, put these at the head of tal ;; (reruns '())) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps) (let* ((loop-list (list hed tal reg reruns)) + (junk (debug:print-info 4 *default-log-port* "expand-items calling rmt:get-prereqs-not-met")) (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))) (if (list? res) res (begin (debug:print 0 *default-log-port* @@ -1090,11 +1092,11 @@ ((null? runnables) (debug:print-info 4 *default-log-port* "cond branch - " "ei-7") #f) ;; if we get here and non-completed is null then it is all over. (else (debug:print-info 4 *default-log-port* "cond branch - " "ei-8") - (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now") + (debug:print 2 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now") (list (car newtal)(cdr newtal) reg reruns))))) (define (runs:mixed-list-testname-and-testrec->list-of-strings inlst) (if (null? inlst) '() @@ -1667,11 +1669,10 @@ (loop (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns)))) ;; (loop (car tal)(cdr tal) reg reruns)))) - (runs:incremental-print-results run-id) (debug:print 4 *default-log-port* "TOP OF LOOP => " "test-name: " test-name "\n hed: " hed "\n tal: " (runs:pretty-long-list tal) @@ -1851,16 +1852,18 @@ (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed) ) ) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) - + + ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-5") (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this") (exit 1)) + ((not (null? reruns)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-6") (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED, (junked (lset-difference equal? tal newlst))) (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal) @@ -1869,10 +1872,11 @@ (set! num-retries (+ num-retries 1)) ;; (thread-sleep! (+ 1 *global-delta*)) (if (not (null? newlst)) ;; since reruns have been tacked on to newlst create new reruns from junked (loop (car newlst)(cdr newlst) reg (delete-duplicates junked))))) + ((not (null? tal)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-7") (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here.")) ((not (null? reg)) ;; could we get here with leftovers? (debug:print-info 4 *default-log-port* "cond branch - " "rtq-8") @@ -2108,11 +2112,11 @@ (else (set! runflag #f))) (debug:print 4 *default-log-port* "RUNNING => runflag: " runflag " STATE: " (test:get-state testdat) " STATUS: " (test:get-status testdat)) (if (not runflag) (if (not parent-test) (if (runs:lownoise (conc "not starting test" full-test-name) 60) - (debug:print 1 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) + (debug:print 3 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) "\" and status \"" (test:get-status testdat) "\", use -rerun \"" (test:get-status testdat) "\" or -force to override"))) ;; NOTE: No longer be checking prerequisites here! Will never get here unless prereqs are ;; already met. ;; This would be a great place to do the process-fork @@ -2393,11 +2397,11 @@ (tasks:kill-runner target run-name testpatt) ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) - (debug:print 1 *default-log-port* "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) + (debug:print 2 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) ((run-wait) (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete")) @@ -2534,11 +2538,11 @@ ) ; end let ); end cond has-subrun (else ;; BB - TODO - consider backgrounding to threads to delete tests (work below) - (debug:print-info 0 *default-log-port* "test: " test-name " itest-state: " test-state) + (debug:print-info 2 *default-log-port* "test: " test-name " itest-state: " test-state) (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) (begin (if (not (hash-table-ref/default test-retry-time test-fulln #f)) (begin ;; want to set to REMOVING BUT CANNOT do it here? @@ -2729,11 +2733,11 @@ ) (case clean-mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f)) ((remove-all) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f))) - (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) + (debug:print-info 2 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) (if (and real-dir (> (string-length real-dir) 5) (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. (let* ((realpath (resolve-pathname run-dir))) (debug:print-info 1 *default-log-port* "Recursively removing " realpath) @@ -3057,17 +3061,17 @@ (files (if (common:file-exists? runtop) (append (glob (conc runtop "/.megatest*")) (glob (conc runtop "/.runconfig*"))) '()))) (if (null? files) - (debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.") + (debug:print-info 2 *default-log-port* "No cached megatest or runconfigs files found. None removed.") (begin - (debug:print-info 0 *default-log-port* "Removing cached files:\n " (string-intersperse files "\n ")) + (debug:print-info 2 *default-log-port* "Removing cached files:\n " (string-intersperse files "\n ")) (for-each (lambda (f) (handle-exceptions exn (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn) (delete-file f))) files)))) (debug:print-error 0 *default-log-port* "-clean-cache requires -runname.")) (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg"))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -111,12 +111,12 @@ ((fs) result) (else (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) result))) -;; Given a run id start a server process ### NOTE ### > file 2>&1 -;; if the run-id is zero and the target-host is set +;; Given an area path, start a server process ### NOTE ### > file 2>&1 +;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area (let* ((curr-host (get-host-name)) @@ -154,21 +154,22 @@ (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) - (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time - ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever - #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) + (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time + (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) -;; given a path to a server log return: host port startseconds -;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let +;; given a path to a server log return: host port startseconds server-id +;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let +;; example of what it's looking for in the log file: +;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 (define (server:logf-get-start-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0)) @@ -201,18 +202,18 @@ (string->number (caddr dat)) (cadr (cddr dat)))))) (begin (if dbprep-found (begin - (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds)) - (thread-sleep! 25) + (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) + (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting? ) - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds)) + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))) ) (list #f #f #f #f))))))))) -;; get a list of servers with all relevant data +;; get a list of servers from the log files, with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) (day-seconds (* 24 60 60))) @@ -228,17 +229,19 @@ (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) (directory-exists? (conc areapath "/logs"))) '())) - ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited. - (let* ((server-logs-cmd (conc "grep -iL exiting " areapath "/logs/server-*-*.log")) - (server-logs (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all)))) + ;; Get the list of server logs. First remove logs for servers that have exited. + (let* ( + ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. + ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) + (server-logs (glob (conc areapath "/logs/server-*-*.log"))) (num-serv-logs (length server-logs))) (if (or (null? server-logs) (= num-serv-logs 0)) (let () - (debug:print 1 *default-log-port* "There are no servers running") + (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) '() ) (let loop ((hed (string-chomp (car server-logs))) (tal (cdr server-logs)) (res '())) @@ -365,66 +368,70 @@ (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) -;; wait for server=start-last to be three seconds old + +;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. +;; if it is old enough, overwrite it and wait 0.25 seconds. +;; if it then has the wrong server key, wait + 1 and call this function recursively. ;; (define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) - (reftime (configf:lookup-number *configdat* "server" "idletime" default: 4)) + (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) - (all-go (> delta reftime))) - (if (and all-go + (old-enough (> delta idletime)) + (new-server-key "") + ) + + ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. + ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. + (if (and old-enough (begin - (debug:print-info 0 *default-log-port* "Writing " start-flag) - (with-output-to-file start-flag - (lambda () - (print server-key))) + (debug:print-info 2 *default-log-port* "Writing " start-flag) + (with-output-to-file start-flag (lambda () (print server-key))) (thread-sleep! 0.25) - (let ((res (with-input-from-file start-flag - (lambda () - (read-line))))) - (equal? server-key res)))) - #t ;; (system (conc "touch " start-flag)) ;; lazy but safe + (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) + (equal? server-key new-server-key)) + ) + #t + + ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. (begin (debug:print-info 0 *default-log-port* "Gating server start, last start: " - fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) - (thread-sleep! reftime) + (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) + + (thread-sleep! ( + 1 idletime)) (server:wait-for-server-start-last-flag areapath))))))) -;; kind start up of servers, wait 40 seconds before allowing another server for a given -;; run-id to be launched + + +;; kind start up of server, wait before allowing another server for a given +;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last - ;; and wait for it to be at least 3 seconds old + ;; and wait for it to be at least seconds old (server:wait-for-server-start-last-flag areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? - (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun - (call-num (car last-run-dat)) - (when-run (cadr last-run-dat)) - (run-delay (+ (case call-num - ((0) 0) - ((1) 20) - ((2) 300) - (else 600)) - (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously + (let* ( (lock-file (conc areapath "/logs/server-start.lock"))) - (if (> (- (current-seconds) when-run) run-delay) - (let* ((start-flag (conc areapath "/logs/server-start-last"))) - (common:simple-file-lock-and-wait lock-file expire-time: 15) - (debug:print-info 0 *default-log-port* "server:kind-run: touching " start-flag) - (system (conc "touch " start-flag)) ;; lazy but safe - (server:run areapath) - (thread-sleep! 2) ;; don't release the lock for at least a few seconds - (common:simple-file-release-lock lock-file))) - (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) + (let* ((start-flag (conc areapath "/logs/server-start-last"))) + (common:simple-file-lock-and-wait lock-file expire-time: 25) + (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) + (system (conc "touch " start-flag)) ;; lazy but safe + (server:run areapath) + (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". + (common:simple-file-release-lock lock-file))) + + (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.") + ) +) ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) @@ -449,17 +456,17 @@ (or ns numservers))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) - (let* ((ns (server:get-num-servers)) + (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed (servers (server:get-best (server:get-list areapath)))) (if (or (and servers (null? servers)) (not servers) (and (list? servers) - (< (length servers) (random ns)))) ;; somewhere between 0 and numservers + (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res Index: utils/mk_wrapper ================================================================== --- utils/mk_wrapper +++ utils/mk_wrapper @@ -44,10 +44,19 @@ else export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$libdir fi export MT_SQLITE3_EXE=$sqlite3_exe + +http_vars="http_proxy https_proxy HTTP_PROXY HTTPS_PROXY" +for i in \$http_vars +do +j=\${!i} +if [ "\$j" != "" ]; then + unset \$i +fi +done __EOF ) > $cfgfile echo else echo "INFO: LD_LIBRARY_PATH not set" >&2 @@ -93,11 +102,9 @@ # echo "#!/bin/bash" > $target # echo "exec $prefix/bin/.\$(lsb_release -sr)/$cmd \"\$@\"" >> $target echo "lsbr=\$(lsb_release -sr)" >> $target -if [ "$LD_LIBRARY_PATH" != "" ];then - echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target -fi +echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target # echo "if [[ -e \$lsbr ]];then source \$lsbr;fi" >> $target echo "exec $prefix/bin/.\$lsbr/$cmd \"\$@\"" >> $target