Megatest

Changes On Branch a853e6c4834abd16
Login

Changes In Branch v1.65 Through [a853e6c483] Excluding Merge-Ins

This is equivalent to a diff from 57b5fb07d6 to a853e6c483

2022-01-07
12:10
made missing waiton detection messages more useful check-in: 8b8b1f63e7 user: mmgraham tags: v1.65
2021-10-21
13:15
Updated megatest version to 1.6588 check-in: a853e6c483 user: mmgraham tags: v1.65, v1.6588
2021-10-20
20:31
Fixed bug where subrun would never end due to runwait where the subrun is part of the current run (i.e. it waits forever for itself to end). check-in: 9ecc1c4e98 user: matt tags: v1.65
2021-04-06
10:31
Fixed typo Closed-Leaf check-in: 8283307ec9 user: matt tags: v1.65-real
2021-03-14
05:41
Redo inter-test-delay. Leaf check-in: 262ea69800 user: matt tags: v1.65-real-redo-inter-test-delay
2021-03-11
17:52
updated delay to kind-run check-in: dd2cea12eb user: pjhatwal tags: v1.65, THIS-IS-REAL-1.65
2021-03-09
21:03
Very odd, missing egg in server.scm, util. check-in: 57b5fb07d6 user: matt tags: v1.65-real
18:48
changed version to 1.6584 check-in: b6403cb822 user: mmgraham tags: v1.65-real

Modified common.scm from [82673dacdb] to [526a2263d9].

815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
    (11 "PREQ_DISCARDED")
    (12 "ABORT")))

(define *common:ended-states*       ;; states which indicate the test is stopped and will not proceed
  '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" ))

(define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked
  '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD"))

(define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed
  '("PASS" "WARN" "CHECK" "WAIVED" "SKIP"))

;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items
(define *common:running-states*     ;; test is either running or can be run
  '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED"))

(define *common:cant-run-states*    ;; These are stopping conditions that prevent a test from being run
  '("COMPLETED" "KILLED" "UNKNOWN" "INCOMPLETE" "ARCHIVED"))







|


|







815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
    (11 "PREQ_DISCARDED")
    (12 "ABORT")))

(define *common:ended-states*       ;; states which indicate the test is stopped and will not proceed
  '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" ))

(define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked
  '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD" "CHECK"))

(define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed
  '("PASS" "WARN" "WAIVED" "SKIP"))

;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items
(define *common:running-states*     ;; test is either running or can be run
  '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED"))

(define *common:cant-run-states*    ;; These are stopping conditions that prevent a test from being run
  '("COMPLETED" "KILLED" "UNKNOWN" "INCOMPLETE" "ARCHIVED"))
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
;;   count     - count down to zero, at some point we'd give up if the load never drops
;;   num-tries - count down to zero number tries to get numcpus
;;
(define (common:wait-for-cpuload maxnormload numcpus-in
				 #!key (count 1000)
				 (msg #f)(remote-host #f)(num-tries 5))
  (let* ((loadavg     (common:get-cpu-load remote-host))
	  ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
	 (numcpus     (if (<= 1 numcpus-in)
			  (common:get-num-cpus remote-host)
			  numcpus-in))
	 (first       (car loadavg))
	 (next        (cadr loadavg))
	 (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug
						   ;; where numcpus
						   ;; (or could be
						   ;; maxload) is
						   ;; zero, crude
						   ;; fallback is to
						   ;; at least use 1
	 ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
	 ;; etc.
	 (effective-load    (common:get-intercept first next))
	 (recommended-delay (common:get-delay effective-load numcpus))
	 (effective-host    (or remote-host "localhost"))
	 (normalized-effective-load (/ effective-load numcpus))
	 (will-wait                 (> normalized-effective-load maxnormload)))
    (if (> recommended-delay 1) 
	(let* ((actual-delay (min recommended-delay 30)))
	  (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
	      (debug:print-info 0 *default-log-port* "Load control, delaying "
				actual-delay " seconds to maintain safe load. current normalized effective load is "
				normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load))
	  (thread-sleep! actual-delay)))
    
    (cond
     ;; bad data, try again to get the data
     ((not will-wait)
      (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
      	  (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))








|
|
|
<
|
|
|
<
<
<
<
|
<
|
|
|
|
|
|
|
|
|
|
|


|







2206
2207
2208
2209
2210
2211
2212
2213
2214
2215

2216
2217
2218




2219

2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
;;   count     - count down to zero, at some point we'd give up if the load never drops
;;   num-tries - count down to zero number tries to get numcpus
;;
(define (common:wait-for-cpuload maxnormload numcpus-in
				 #!key (count 1000)
				 (msg #f)(remote-host #f)(num-tries 5))
  (let* ((loadavg     (common:get-cpu-load remote-host))
    ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
    (numcpus     (if (<= 1 numcpus-in)
    (common:get-num-cpus remote-host) numcpus-in))

    (first       (car loadavg))
    (next        (cadr loadavg))
    (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude




						  ;; fallback is to at least use 1

    ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
    ;; etc.
    (effective-load    (common:get-intercept first next))
    (recommended-delay (common:get-delay effective-load numcpus))
    (effective-host    (or remote-host "localhost"))
    (normalized-effective-load (/ effective-load numcpus))
    (will-wait                 (> normalized-effective-load maxnormload)))
    (if (and will-wait (> recommended-delay 1)) 
      (let* ((actual-delay (min recommended-delay 30)))
        (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
          (debug:print-info 0 *default-log-port* "Load control, delaying "
				actual-delay " seconds to maintain safe load. current normalized effective load is "
				normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load))
	(thread-sleep! actual-delay)))
    
    (cond
     ;; bad data, try again to get the data
     ((not will-wait)
      (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
      	  (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))

Modified configf.scm from [15f0835800] to [b768bf346e].

50
51
52
53
54
55
56




57
58
59
60
61
62
63

(define (configf:assoc-safe-add alist key val #!key (metadata #f))
  (let ((newalist (filter (lambda (x)(not (equal? key (car x)))) alist)))
    (append newalist (list (if metadata
			       (list key val metadata)
			       (list key val))))))





(define (configf:section-var-set! cfgdat section-name var value #!key (metadata #f))
  (hash-table-set! cfgdat section-name
		   (configf:assoc-safe-add
		    (hash-table-ref/default cfgdat section-name '())
		    var value metadata: metadata)))

(define (configf:eval-string-in-environment str)







>
>
>
>







50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

(define (configf:assoc-safe-add alist key val #!key (metadata #f))
  (let ((newalist (filter (lambda (x)(not (equal? key (car x)))) alist)))
    (append newalist (list (if metadata
			       (list key val metadata)
			       (list key val))))))

;; this is used in megatestqa/ext.scm.
;; remove it from here and there by 12/31/21
;; (define config:assoc-safe-add configf:assoc-safe-add)

(define (configf:section-var-set! cfgdat section-name var value #!key (metadata #f))
  (hash-table-set! cfgdat section-name
		   (configf:assoc-safe-add
		    (hash-table-ref/default cfgdat section-name '())
		    var value metadata: metadata)))

(define (configf:eval-string-in-environment str)
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
		       (lambda ()
			 (set! result ((eval (read)) ht))))
		     (set! result (conc "#{(" cmdtype ") "  cmd "}"))))
		(case cmdsym
		  ((system shell scheme)
		   (let ((delta (- (current-seconds) start-time)))
		     (if (> delta 2)
			 (debug:print-info 0 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)
			 (debug:print-info 9 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)))))
		(loop (conc prestr result poststr)))
	      res))
	res)))

;; Run a shell command and return the output as a string
(define (shell cmd)







|







146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
		       (lambda ()
			 (set! result ((eval (read)) ht))))
		     (set! result (conc "#{(" cmdtype ") "  cmd "}"))))
		(case cmdsym
		  ((system shell scheme)
		   (let ((delta (- (current-seconds) start-time)))
		     (if (> delta 2)
			 (debug:print-info 2 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)
			 (debug:print-info 9 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)))))
		(loop (conc prestr result poststr)))
	      res))
	res)))

;; Run a shell command and return the output as a string
(define (shell cmd)
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
;; redefines
(define config-lookup configf:lookup)
(define configf:read-file read-config)

;; safely look up a value that is expected to be a number, return
;; a default (#f unless provided)
;;
(define (configf:lookup-number cfdat section varname #!key (default #f))
  (let* ((val (configf:lookup *configdat* section varname))
         (res (if val
                  (string->number (string-substitute "\\s+" "" val #t))
                  #f)))
    (cond
     (res  res)
     (val  (debug:print 0 *default-log-port* "ERROR: no number found for [" section "], " varname ", got: " val))
     (else default))))







|
|







533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
;; redefines
(define config-lookup configf:lookup)
(define configf:read-file read-config)

;; safely look up a value that is expected to be a number, return
;; a default (#f unless provided)
;;
(define (configf:lookup-number cfgdat section varname #!key (default #f))
  (let* ((val (configf:lookup cfgdat section varname))
         (res (if val
                  (string->number (string-substitute "\\s+" "" val #t))
                  #f)))
    (cond
     (res  res)
     (val  (debug:print 0 *default-log-port* "ERROR: no number found for [" section "], " varname ", got: " val))
     (else default))))

Modified db.scm from [ed256dd44f] to [65246b91b8].

4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678

4679
4680
4681
4682
4683
4684
4685
		(loop (car tal)(cdr tal) newr)))))))




;; the new prereqs calculation, looks also at itempath if specified
;; all prereqs must be met
;;    if prereq test with itempath='' is COMPLETED and PASS, WARN, CHECK, or WAIVED then prereq is met
;;    if prereq test with itempath=ref-item-path and COMPLETED with PASS, WARN, CHECK, or WAIVED then prereq is met
;;
;; Note: mode 'normal means that tests must be COMPLETED and ok (i.e. PASS, WARN, CHECK, SKIP or WAIVED)
;;       mode 'toplevel means that tests must be COMPLETED only
;;       mode 'itemmatch or 'itemwait means that tests items must be COMPLETED and (PASS|WARN|WAIVED|CHECK) [[ NB// NOT IMPLEMENTED YET ]]
;;       mode 'exclusive means this test/item cannot run if the same test/item is LAUNCHED,REMOTEHOSTSTART or RUNNING
;;
;; IDEA for consideration:
;;    1. collect all tests "upstream"
;;    2. any NOT completed and good? if yes => return those as prereqs not met, if no => return null list
;; 
;; (define (db:get-prereqs-not-met dbstruct run-id waitons ref-item-path mode)
(define (db:get-prereqs-not-met dbstruct run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmap #f))
  ;; BBnote - rollup of an itemized test's overall state/status done in db:set-state-status-and-roll-up-items

  (append
   (if (member 'exclusive mode)
       (let ((running-tests (db:get-tests-for-run dbstruct
						  #f  ;; run-id of #f means for all runs. 
						  (if (string=? ref-item-path "")   ;; testpatt
						      ref-test-name
						      (conc ref-test-name "/" ref-item-path))







|














>







4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
		(loop (car tal)(cdr tal) newr)))))))




;; the new prereqs calculation, looks also at itempath if specified
;; all prereqs must be met
;;    if prereq test with itempath='' is in common:well-ended-states, then prereq is met
;;    if prereq test with itempath=ref-item-path and COMPLETED with PASS, WARN, CHECK, or WAIVED then prereq is met
;;
;; Note: mode 'normal means that tests must be COMPLETED and ok (i.e. PASS, WARN, CHECK, SKIP or WAIVED)
;;       mode 'toplevel means that tests must be COMPLETED only
;;       mode 'itemmatch or 'itemwait means that tests items must be COMPLETED and (PASS|WARN|WAIVED|CHECK) [[ NB// NOT IMPLEMENTED YET ]]
;;       mode 'exclusive means this test/item cannot run if the same test/item is LAUNCHED,REMOTEHOSTSTART or RUNNING
;;
;; IDEA for consideration:
;;    1. collect all tests "upstream"
;;    2. any NOT completed and good? if yes => return those as prereqs not met, if no => return null list
;; 
;; (define (db:get-prereqs-not-met dbstruct run-id waitons ref-item-path mode)
(define (db:get-prereqs-not-met dbstruct run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmap #f))
  ;; BBnote - rollup of an itemized test's overall state/status done in db:set-state-status-and-roll-up-items
  (debug:print 4 *default-log-port* "db:get-prereqs-not-met: " waitons)
  (append
   (if (member 'exclusive mode)
       (let ((running-tests (db:get-tests-for-run dbstruct
						  #f  ;; run-id of #f means for all runs. 
						  (if (string=? ref-item-path "")   ;; testpatt
						      ref-test-name
						      (conc ref-test-name "/" ref-item-path))
4698
4699
4700
4701
4702
4703
4704


4705
4706
4707
4708
4709
4710
4711
	;;	(if (equal? (db:test-get-item-path testdat) "")
	;;	    (db:test-get-testname testdat)
	;;	    (conc (db:test-get-testname testdat)
	;;		  "/"
	;;		  (db:test-get-item-path testdat))))
	 running-tests) ;; calling functions want the entire data
       '())



   ;; collection of: for each waiton -
   ;;   if this ref-test-name is an item in an itemized test and mode is itemwait/itemmatch:
   ;;     if waiton is not itemized - if waiton is not both completed and in ok status, add as unmet prerequisite
   ;;     if waiton is itemized:
   ;;           and waiton's items are not expanded, add as unmet prerequisite
   ;;           else if matching waiton item is not both completed and in an ok status, add as unmet prerequisite







>
>







4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
	;;	(if (equal? (db:test-get-item-path testdat) "")
	;;	    (db:test-get-testname testdat)
	;;	    (conc (db:test-get-testname testdat)
	;;		  "/"
	;;		  (db:test-get-item-path testdat))))
	 running-tests) ;; calling functions want the entire data
       '())



   ;; collection of: for each waiton -
   ;;   if this ref-test-name is an item in an itemized test and mode is itemwait/itemmatch:
   ;;     if waiton is not itemized - if waiton is not both completed and in ok status, add as unmet prerequisite
   ;;     if waiton is itemized:
   ;;           and waiton's items are not expanded, add as unmet prerequisite
   ;;           else if matching waiton item is not both completed and in an ok status, add as unmet prerequisite

Modified ezsteps.scm from [5de5d166c7] to [0c12b39167].

102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
     (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1")
       (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 
	      (pid #f))
	 (let ((proc (lambda ()
		       (set! pid (process-run "/bin/bash" (list "-c" cmd))))))
	   (if subrun
               (begin
                 (debug:print-info 0 *default-log-port* "Running without MT_.* environment variables.")
                 (common:without-vars proc "^MT_.*"))
	       (proc)))
	 
         (with-output-to-file "Makefile.ezsteps"
           (lambda ()
             (print stepname ".log :")
             (print "\t" cmd)







|







102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
     (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1")
       (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 
	      (pid #f))
	 (let ((proc (lambda ()
		       (set! pid (process-run "/bin/bash" (list "-c" cmd))))))
	   (if subrun
               (begin
                 (debug:print-info 0 *default-log-port* "Running "cmd" without MT_.* environment variables.")
                 (common:without-vars proc "^MT_.*"))
	       (proc)))
	 
         (with-output-to-file "Makefile.ezsteps"
           (lambda ()
             (print stepname ".log :")
             (print "\t" cmd)

Modified launch.scm from [7e65ac64d4] to [b0b1044b95].

157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
	;; 3. fix target
	;; 4. fix runname
	;; 5. fix testpatt or calculate it from contour
	;; 6. launch the run
	;; 7. roll up the run result and or roll up the logpro processed result
	(when (configf:lookup testconfig "subrun" "runwait") ;; we use runwait as the flag that a subrun is requested
            (subrun:initialize-toprun-test testconfig test-run-dir)
	    (let* ((mt-cmd (subrun:launch-cmd test-run-dir)))
              (debug:print-info 0 *default-log-port* "Subrun command is \"" mt-cmd "\"")
              (set! ezsteps #t) ;; set the needed flag
	      (set! ezstepslst
                    (append (or ezstepslst '())
                            (list (list "subrun" (conc "{subrun=true} " mt-cmd)))))))

	;; process the ezsteps







|







157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
	;; 3. fix target
	;; 4. fix runname
	;; 5. fix testpatt or calculate it from contour
	;; 6. launch the run
	;; 7. roll up the run result and or roll up the logpro processed result
	(when (configf:lookup testconfig "subrun" "runwait") ;; we use runwait as the flag that a subrun is requested
            (subrun:initialize-toprun-test testconfig test-run-dir)
	    (let* ((mt-cmd (subrun:launch-cmd test-run-dir (configf:lookup testconfig "subrun" "runwait"))))
              (debug:print-info 0 *default-log-port* "Subrun command is \"" mt-cmd "\"")
              (set! ezsteps #t) ;; set the needed flag
	      (set! ezstepslst
                    (append (or ezstepslst '())
                            (list (list "subrun" (conc "{subrun=true} " mt-cmd)))))))

	;; process the ezsteps
474
475
476
477
478
479
480
481
482
483

484
485
486
487
488
489
490

491
492
493
494
495
496
497
498

              (rmt:general-call 'set-test-start-time #f test-id)
              (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      ) ;; prime it for running
	     ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART"))
	      (if (process:alive-on-host? test-host test-pid)
		  (debug:print-error 0 *default-log-port* "test state is "  (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed")
		  (exit)))
	     ((member (db:test-get-state test-info) '("COMPLETED"))  ;; we do NOT want to re-run COMPLETED jobs. Mark as NOT_STARTED to run!
	      (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed")

	      (exit))
	     ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
              (rmt:general-call 'set-test-start-time #f test-id)
	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f))
	     (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))
	      (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed")

	      (exit))))

          ;; cleanup prior execution's steps
          (rmt:delete-steps-for-test! run-id test-id)
          
	  (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name))
	  (set! keys       (rmt:get-keys))
	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process







|

|
>
|





|
>
|







474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500

              (rmt:general-call 'set-test-start-time #f test-id)
              (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      ) ;; prime it for running
	     ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART"))
	      (if (process:alive-on-host? test-host test-pid)
		  (debug:print-error 0 *default-log-port* "test state is "  (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed")
		  (exit 1)))
	     ((member (db:test-get-state test-info) '("COMPLETED"))  ;; we do NOT want to re-run COMPLETED jobs. Mark as NOT_STARTED to run!
	      (debug:print 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed")
	      (debug:print 0 *default-log-port* "exiting with status 1")
	      (exit 1))
	     ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
              (rmt:general-call 'set-test-start-time #f test-id)
	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f))
	     (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))
	      (debug:print 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed")
	      (debug:print 0 *default-log-port* "exiting with status 1")
	      (exit 1))))

          ;; cleanup prior execution's steps
          (rmt:delete-steps-for-test! run-id test-id)
          
	  (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name))
	  (set! keys       (rmt:get-keys))
	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
641
642
643
644
645
646
647
648

649
650
651
652
653
654
655
656
657
658
659
660
661
662





663
664
665
666
667

668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692

693
694
695
696
697
698
699
700
701
702
703
704
705


706

707
708
709
710
711
712
713

714
715
716
717













718
719
720
721
722
723
724
		       (with-output-to-file name
			 (lambda ()
			   (print content)
			   (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)))))
		      (else
		       (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
	     scripts))
	  ;; 

	  (let* ((m            (make-mutex))
		 (kill-job?    #f)
		 (exit-info    (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status
		 (job-thread   #f)
		 ;; (keep-going   #t)
		 (misc-flags   (let ((ht (make-hash-table)))
				 (hash-table-set! ht 'keep-going #t)
				 ht))
		 (runit        (lambda ()
				 (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m)))
		 (monitorjob   (lambda ()
				 (launch:monitor-job  run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)))
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job")))





	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)
	    (debug:print-info 0 *default-log-port* "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...")

	    (hash-table-set! misc-flags 'keep-going #f)
	    (thread-join! th1)
	    (thread-sleep! 1)       ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
	    (mutex-lock! m)
	    (let* ((item-path (item-list->path itemdat))
		   ;; only state and status needed - use lazy routine
		   (testinfo  (rmt:get-testinfo-state-status run-id test-id)))
	      ;; Am I completed?
	      (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED"))
		  (let ((new-state  (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status
				                                        ;; "COMPLETED"							                ;; (db:test-get-state testinfo)))   ;; else preseve the state as set within the test
				    )
			(new-status (cond
				     ((not (launch:einf-exit-status exit-info)) "FAIL") ;; job failed to run ... (vector-ref exit-info 1)
				     ((eq? (launch:einf-rollup-status exit-info) 0)     ;; (vector-ref exit-info 3)
				      ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO)
				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS"))
				     ((eq? (launch:einf-rollup-status exit-info) 1) "FAIL")  ;; (vector-ref exit-info 3)
				     ((eq? (launch:einf-rollup-status exit-info) 2)	     ;;	(vector-ref exit-info 3)
				      ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN)
				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN"))
				     ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK")
				     ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED")
				     ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT")
				     ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP")

				     (else "FAIL")))) ;; (db:test-get-status testinfo)))
		    (debug:print-info 1 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info))
   
        ;; Leave a .final-status file for each sub-test
        (tests:save-final-status run-id test-id)

		    (tests:test-set-status! run-id 
					    test-id 
					    new-state
					    new-status
					    (args:get-arg "-m") #f)
		    ;; need to update the top test record if PASS or FAIL and this is a subtest
		    ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status!


		    ))

	      ;; for automated creation of the rollup html file this is a good place...
	      (if (not (equal? item-path ""))
		      (tests:summarize-items run-id test-id test-name #f))
	      (tests:summarize-test run-id test-id)  ;; don't force - just update if no
        ;; Leave a .final-status file for the top level test
        (tests:save-final-status run-id test-id)
	      (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id)))

	    (mutex-unlock! m)
            (launch:end-of-run-check run-id )
	    (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " 
			 work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n")













	    (if (not (launch:einf-exit-status exit-info))
		(exit 4))))
        )))

;; Spec for End of test
;; At end of each test call, after marking self as COMPLETED do run-state-status-rollup
;; At transition to run COMPLETED/X do hooks







|
>













|
>
>
>
>
>




|
>








|
|
<
<
|












>
|
|

|
|








>
>
|
>




|
|
|
>




>
>
>
>
>
>
>
>
>
>
>
>
>







643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686


687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
		       (with-output-to-file name
			 (lambda ()
			   (print content)
			   (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)))))
		      (else
		       (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
	     scripts))
	  ;;

	  (let* ((m            (make-mutex))
		 (kill-job?    #f)
		 (exit-info    (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status
		 (job-thread   #f)
		 ;; (keep-going   #t)
		 (misc-flags   (let ((ht (make-hash-table)))
				 (hash-table-set! ht 'keep-going #t)
				 ht))
		 (runit        (lambda ()
				 (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m)))
		 (monitorjob   (lambda ()
				 (launch:monitor-job  run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)))
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job"))
                 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t))
                 (propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code"))
                 (propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED"))
                 (test-status "not set")
                 )
	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)
	    (debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...")
            (debug:print-info 2 *default-log-port* "exit-info = " exit-info)
	    (hash-table-set! misc-flags 'keep-going #f)
	    (thread-join! th1)
	    (thread-sleep! 1)       ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
	    (mutex-lock! m)
	    (let* ((item-path (item-list->path itemdat))
		   ;; only state and status needed - use lazy routine
		   (testinfo  (rmt:get-testinfo-state-status run-id test-id)))
	      ;; Am I completed?
	      (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING"))
                 (let ((new-state  (if kill-job? "KILLED" "COMPLETED"))


		        (new-status (cond
				     ((not (launch:einf-exit-status exit-info)) "FAIL") ;; job failed to run ... (vector-ref exit-info 1)
				     ((eq? (launch:einf-rollup-status exit-info) 0)     ;; (vector-ref exit-info 3)
				      ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO)
				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS"))
				     ((eq? (launch:einf-rollup-status exit-info) 1) "FAIL")  ;; (vector-ref exit-info 3)
				     ((eq? (launch:einf-rollup-status exit-info) 2)	     ;;	(vector-ref exit-info 3)
				      ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN)
				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN"))
				     ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK")
				     ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED")
				     ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT")
				     ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP")
				     (else "FAIL")))
                        ) ;; (db:test-get-status testinfo)))
		    (debug:print-info 0 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info))
   
                    ;; Leave a .final-status file for each sub-test
                    (tests:save-final-status run-id test-id)

		    (tests:test-set-status! run-id 
					    test-id 
					    new-state
					    new-status
					    (args:get-arg "-m") #f)
		    ;; need to update the top test record if PASS or FAIL and this is a subtest
		    ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status!
		 )
              )


	      ;; for automated creation of the rollup html file this is a good place...
	      (if (not (equal? item-path ""))
		      (tests:summarize-items run-id test-id test-name #f))
	      (tests:summarize-test run-id test-id)  ;; don't force - just update if no
              ;; Leave a .final-status file for the top level test
              (tests:save-final-status run-id test-id)
	      (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) ;; end of let*

	    (mutex-unlock! m)
            (launch:end-of-run-check run-id )
	    (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " 
			 work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n")


            (set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id)))

            ;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1.

            (if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list))
               (begin
                (debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status) 
                (set! *globalexitstatus* 1)
               )
            )

	    (if (not (launch:einf-exit-status exit-info))
		(exit 4))))
        )))

;; Spec for End of test
;; At end of each test call, after marking self as COMPLETED do run-state-status-rollup
;; At transition to run COMPLETED/X do hooks

Modified megatest-version.scm from [f92dc46346] to [84fab1de9e].

16
17
18
19
20
21
22
23
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..

;; (declare (unit megatest-version))

(define megatest-version 1.6584)







|
16
17
18
19
20
21
22
23
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..

;; (declare (unit megatest-version))

(define megatest-version 1.6588)

Modified mtut.scm from [ead30f316f] to [413cf26858].

152
153
154
155
156
157
158

159
160
161
162
163
164
165

Queries:
   show [areas|contours... ] : show areas, contours or other section from megatest.config
   gendot                    : generate a graphviz dot file from pkts.

Contour actions:
   process                   : runs import, rungen and dispatch 

			     
Trigger propagation actions:
   tsend a=b,c=d...          : send trigger info to all recpients in the [listeners] section
   tlisten -port N           : listen for trigger info on port N
			     
Selectors 		     
  -immediate                 : apply this action immediately, default is to queue up actions







>







152
153
154
155
156
157
158
159
160
161
162
163
164
165
166

Queries:
   show [areas|contours... ] : show areas, contours or other section from megatest.config
   gendot                    : generate a graphviz dot file from pkts.

Contour actions:
   process                   : runs import, rungen and dispatch 
   go                        : runs import, rungen and dispatch every five minutes forever
			     
Trigger propagation actions:
   tsend a=b,c=d...          : send trigger info to all recpients in the [listeners] section
   tlisten -port N           : listen for trigger info on port N
			     
Selectors 		     
  -immediate                 : apply this action immediately, default is to queue up actions
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
			    (handle-exceptions
				exn
				(begin
				  (print-call-chain)
				  (print "FAILED TO RUN RUNNAME MAPPER " callname " FOR AREA " area)
				  (print " message: " ((condition-property-accessor 'exn 'message) exn))
				  runname)
			      (print "(mapper " (string-intersperse (list runkey runname area area-path reason contour mode-patt) ", ") ")")
			      (mapper runkey runname area area-path reason contour mode-patt))
			    (case callname
			      ((auto #f) runname)
			      (else   runtrans)))))
	 (new-target     target) ;; I believe we will want target manipulation here .. (map-targets xlatr-key runkey area contour))
	 (actual-action  (if action
			     (if (equal? action "sync-prepend")







|







772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
			    (handle-exceptions
				exn
				(begin
				  (print-call-chain)
				  (print "FAILED TO RUN RUNNAME MAPPER " callname " FOR AREA " area)
				  (print " message: " ((condition-property-accessor 'exn 'message) exn))
				  runname)
			      (print "(mapper " (string-intersperse (map conc (list runkey runname area area-path reason contour mode-patt)) ", ") ")")
			      (mapper runkey runname area area-path reason contour mode-patt))
			    (case callname
			      ((auto #f) runname)
			      (else   runtrans)))))
	 (new-target     target) ;; I believe we will want target manipulation here .. (map-targets xlatr-key runkey area contour))
	 (actual-action  (if action
			     (if (equal? action "sync-prepend")
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620



1621
1622
1623
1624
1625
1626
1627
1628
1629












































1630
1631
1632
1633
1634
1635
1636
	 ;;    (if (not (member key *legal-params*))
	 ;; 	(hash-table-delete! adjargs key))) ;; we need to delete any params intended for mtutil
	 ;;  (hash-table-keys adjargs))
	 (let-values (((uuid pkt)
		       (command-line->pkt *action* adjargs #f area-path: area-path new-ss: new-ss)))
           (print "run log @ " (conc (current-directory) "/" uuid "-" *action* ".log"))
	   (write-pkt pktsdir uuid pkt))))
      ((dispatch import rungen process)
       (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir")))
	      (mtconf    (car mtconfdat))
	      (toppath   (configf:lookup mtconf "scratchdat" "toppath")))



	 (case (string->symbol *action*)
	   ((process)  (begin
			 (common:load-pkts-to-db mtconf)
			 (generate-run-pkts mtconf toppath)
			 (common:load-pkts-to-db mtconf)
			 (dispatch-commands mtconf toppath)))
	   ((import)   (common:load-pkts-to-db mtconf)) ;; import pkts
	   ((rungen)   (generate-run-pkts mtconf toppath))
	   ((dispatch) (dispatch-commands mtconf toppath)))))












































      ;; misc
      ((show)
       (if (> (length remargs) 0)
	   (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir")))
		  (mtconf    (car mtconfdat))
		  (sect-dat (configf:get-section mtconf (car remargs))))
	     (if sect-dat







|


|
>
>
>








|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
	 ;;    (if (not (member key *legal-params*))
	 ;; 	(hash-table-delete! adjargs key))) ;; we need to delete any params intended for mtutil
	 ;;  (hash-table-keys adjargs))
	 (let-values (((uuid pkt)
		       (command-line->pkt *action* adjargs #f area-path: area-path new-ss: new-ss)))
           (print "run log @ " (conc (current-directory) "/" uuid "-" *action* ".log"))
	   (write-pkt pktsdir uuid pkt))))
      ((dispatch import rungen process go)
       (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir")))
	      (mtconf    (car mtconfdat))
	      (toppath   (configf:lookup mtconf "scratchdat" "toppath"))
	      (period    (configf:lookup-number mtconf "mtutil" "autorun-period" default: 300))
	      (rest-time (configf:lookup-number mtconf "mtutil" "autorun-rest"   default: 30)))
	 (print "Using period="period" and rest time="rest-time)
	 (case (string->symbol *action*)
	   ((process)  (begin
			 (common:load-pkts-to-db mtconf)
			 (generate-run-pkts mtconf toppath)
			 (common:load-pkts-to-db mtconf)
			 (dispatch-commands mtconf toppath)))
	   ((import)   (common:load-pkts-to-db mtconf)) ;; import pkts
	   ((rungen)   (generate-run-pkts mtconf toppath))
	   ((dispatch) (dispatch-commands mtconf toppath))
	   ;; [mtutil]
	   ;; # approximate interval between run processing in mtutil (seconds)
	   ;; autorun-period 300
	   ;; # minimal rest period between processing 
	   ;; autorun-rest   30
	   ((go)
	    ;; determine if I'm the boss
	    (if (file-exists? "mtutil-go.pid")
		(begin
		  (print "ERROR: mtutil go is already running under host and pid " (with-input-from-file "mtutil-go.pid" read-line)
			 ". Please kill that process and remove the file \"mutil-go.pid\" and try again.")
		  (exit)))
	    (with-output-to-file "mtutil-go.pid" (lambda ()(print (get-host-name) " " (current-process-id))))
	    (print "Starting long running import, rungen, and process loop")
	    (if (file-exists? "do-not-run-mtutil-go")
		(begin
		  (print "NOTE: Removing flag file "(current-directory)"/do-not-run-mtutil-go")
		  (delete-file* "do-not-run-mtutil-go")))
	    (let loop ((last-run (- (current-seconds) (+ period 10))) ;; fake out first time in 
		       (this-run (current-seconds)))
	      (if (file-exists? "do-not-run-mtutil-go")
		  (begin
		    (print "File do-not-run-mtutil-go exists, exiting.")
		    (delete-file* "mtutil-go.pid")
		    (exit)))
	      (let ((delta (- this-run last-run)))
		(if (>= delta period)
		    (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir")))
			   (mtconf    (car mtconfdat)))
		      (print "Running import at " (current-seconds))
		      (common:load-pkts-to-db mtconf)
		      (print "Running generate run pkts at " (current-seconds))
		      (generate-run-pkts mtconf toppath)
		      (print "Running run dispatch at " (current-seconds))
		      (common:load-pkts-to-db mtconf)
		      (dispatch-commands mtconf toppath)
		      (print "Done running import, generate, and dispatch done in " (- (current-seconds) this-run))
		      (print "NOTE: touch " (current-directory) "/do-not-run-mtutil-go to kill this runner.")
		      (loop this-run (current-seconds)))
		    (let ((now (current-seconds)))
		      (print "Sleeping " rest-time " seconds, next run in aproximately " (- period (- now last-run)) " seconds")
		      (thread-sleep! rest-time)
		      (loop last-run (current-seconds))))))
	    (delete-file* "mtutil-go.pid")))))
      ;; misc
      ((show)
       (if (> (length remargs) 0)
	   (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir")))
		  (mtconf    (car mtconfdat))
		  (sect-dat (configf:get-section mtconf (car remargs))))
	     (if sect-dat
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831

1832
1833

1834

1835


1836
1837
1838
1839
1840

1841

1842


1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
                                   )

                                  )

                                )))
                           (loop (nn-recv rep))))
		       (print "ERROR: Port " portnum " already in use. Try another port")))))))
      



      ((tlisten)
       (if (null? remargs)
           (print "ERROR: useage for tlisten is \"mtutil tlisten portnum\"")
           (let ((portnum (string->number (car remargs))))
              
             (if (not portnum)
                 (print "ERROR: the portnumber parameter must be a number, you gave: " (car remargs))
                 (begin
                   (if (not (is-port-in-use portnum))  
                       (let* ((rep       (start-nn-server portnum))
                              (mtconfdat (simple-setup (args:get-arg "-start-dir")))
                              (mtconf    (car mtconfdat))
                              (contact   (configf:lookup mtconf "listener" "owner"))
                              (script    (configf:lookup mtconf "listener" "script")))
                         (print "Listening on port " portnum " for messages.")
                         (set-signal-handler! signal/int  (lambda (signum) 

															(set! *time-to-exit* #t)
  														(debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting !!")

  														(let ((email-body (mtut:stml->string (s:body

																						(s:p (conc "Received signal " signum ". Lister has been terminated on host " (get-environment-variable "HOST") ". "))))))


             					        (sendmail contact "Listner has been terminated." email-body  use_html: #t))
                              (exit)))
															(set-signal-handler! signal/term  (lambda (signum) 
															(set! *time-to-exit* #t)
  														(debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting !!")

  														(let ((email-body (mtut:stml->string (s:body

																						(s:p (conc "Received signal " signum ". Lister has been terminated on host " (get-environment-variable "HOST") ". "))))))


             					        (sendmail contact "Listner has been terminated." email-body  use_html: #t))
                              (exit)))

                         ;(set-signal-handler! signal/term special-signal-handler)
                         
                         (let loop ((instr (nn-recv rep)))
                             (nn-send rep "ok")
                             (let ((ctime (date->string (current-date)))) 
                             (if  (equal? instr "time-to-die")
                              (begin 
                              (debug:print 0 *default-log-port* ctime " received '" instr "'. Time to sucide." )
                               (let ((pid  (current-process-id)))







<
<
<

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
|
|
>
|
>
|
>
>
|
|
|
|
|
>
|
>
|
>
>
|
|

|
|







1853
1854
1855
1856
1857
1858
1859



1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
                                   )

                                  )

                                )))
                           (loop (nn-recv rep))))
		       (print "ERROR: Port " portnum " already in use. Try another port")))))))




    ((tlisten)
     (if (null? remargs)
         (print "ERROR: useage for tlisten is \"mtutil tlisten portnum\"")
         (let ((portnum (string->number (car remargs))))
           
           (if (not portnum)
               (print "ERROR: the portnumber parameter must be a number, you gave: " (car remargs))
               (begin
                 (if (not (is-port-in-use portnum))  
                     (let* ((rep       (start-nn-server portnum))
                            (mtconfdat (simple-setup (args:get-arg "-start-dir")))
                            (mtconf    (car mtconfdat))
                            (contact   (configf:lookup mtconf "listener" "owner"))
                            (script    (configf:lookup mtconf "listener" "script")))
                       (print "Listening on port " portnum " for messages.")
                       (set-signal-handler! signal/int
					    (lambda (signum) 
					      (set! *time-to-exit* #t)
  					      (debug:print-error 0 *default-log-port* "Received signal " signum
								 " sending email befor exiting !!")
  					      (let ((email-body (mtut:stml->string
								 (s:body
								  (s:p (conc "Received signal " signum
									     ". Lister has been terminated on host "
									     (get-environment-variable "HOST") ". "))))))
             					(sendmail contact "Listner has been terminated." email-body  use_html: #t))
					      (exit)))
		       (set-signal-handler! signal/term  (lambda (signum) 
							   (set! *time-to-exit* #t)
  							   (debug:print-error 0 *default-log-port* "Received signal "
									      signum " sending email befor exiting !!")
  							   (let ((email-body (mtut:stml->string
									      (s:body
									       (s:p (conc "Received signal " signum
											  ". Lister has been terminated on host "
											  (get-environment-variable "HOST") ". "))))))
             					             (sendmail contact "Listner has been terminated." email-body  use_html: #t))
							   (exit)))

		       ;; (set-signal-handler! signal/term special-signal-handler)
                       
                         (let loop ((instr (nn-recv rep)))
                             (nn-send rep "ok")
                             (let ((ctime (date->string (current-date)))) 
                             (if  (equal? instr "time-to-die")
                              (begin 
                              (debug:print 0 *default-log-port* ctime " received '" instr "'. Time to sucide." )
                               (let ((pid  (current-process-id)))

Modified runs.scm from [2583922f1c] to [dcda6cf6d1].

499
500
501
502
503
504
505

506
507
508
509
510
511
512
;; runs:run-tests is called from megatest.scm and itself
;;======================================================================
;;
;;  test-names: Comma separated patterns same as test-patts but used in selection 
;;              of tests to run. The item portions are not respected.
;;              FIXME: error out if /patt specified
;;            

(define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names
  (let* ((keys               (keys:config-get-fields *configdat*))
	 (keyvals            (keys:target->keyval keys target))
	 (run-id             (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour")))  ;;  test-name)))
	 ;; (deferred          '()) ;; delay running these since they have a waiton clause
	 (runconfigf         (conc  *toppath* "/runconfigs.config"))
         (dbfile             (conc  *toppath* "/megatest.db"))







>







499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
;; runs:run-tests is called from megatest.scm and itself
;;======================================================================
;;
;;  test-names: Comma separated patterns same as test-patts but used in selection 
;;              of tests to run. The item portions are not respected.
;;              FIXME: error out if /patt specified
;;            
;; run-count is passed from megatest.scm as configf:lookup *configdat* "setup" "reruns", or defaults to 1.
(define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names
  (let* ((keys               (keys:config-get-fields *configdat*))
	 (keyvals            (keys:target->keyval keys target))
	 (run-id             (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour")))  ;;  test-name)))
	 ;; (deferred          '()) ;; delay running these since they have a waiton clause
	 (runconfigf         (conc  *toppath* "/runconfigs.config"))
         (dbfile             (conc  *toppath* "/megatest.db"))
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
    ;; (set! required-tests     (lset-intersection equal? test-names all-test-names))
    
    ;; look up all tests matching the comma separated list of globs in
    ;; test-patts (using % as wildcard)

    ;; (set! test-names (delete-duplicates (tests:get-valid-tests *toppath* test-patts)))
    (debug:print-info 0 *default-log-port* "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " "))
    (debug:print-info 0 *default-log-port* "all tests:         " (string-intersperse (sort all-test-names string<) " "))
    (debug:print-info 0 *default-log-port* "test names:        " (string-intersperse (sort test-names string<) " "))
    (debug:print-info 0 *default-log-port* "required tests:    " (string-intersperse (sort required-tests string<) " "))

    ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if
    ;; -keepgoing is specified
    (if (eq? *passnum* 0)
	(begin







|







623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
    ;; (set! required-tests     (lset-intersection equal? test-names all-test-names))
    
    ;; look up all tests matching the comma separated list of globs in
    ;; test-patts (using % as wildcard)

    ;; (set! test-names (delete-duplicates (tests:get-valid-tests *toppath* test-patts)))
    (debug:print-info 0 *default-log-port* "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " "))
    (debug:print-info 2 *default-log-port* "all tests:         " (string-intersperse (sort all-test-names string<) " "))
    (debug:print-info 0 *default-log-port* "test names:        " (string-intersperse (sort test-names string<) " "))
    (debug:print-info 0 *default-log-port* "required tests:    " (string-intersperse (sort required-tests string<) " "))

    ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if
    ;; -keepgoing is specified
    (if (eq? *passnum* 0)
	(begin
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
                          (waiton-record   (hash-table-ref/default test-records waiton #f))
			  (waiton-tconfig  (if waiton-record (vector-ref waiton-record 1) #f))
			  (waiton-itemized (and waiton-tconfig
						(or (hash-table-ref/default waiton-tconfig "items" #f)
						    (hash-table-ref/default waiton-tconfig "itemstable" #f))))
			  (itemmaps        (tests:get-itemmaps config))  ;; (configf:lookup config "requirements" "itemmap"))
			  (new-test-patts  (tests:extend-test-patts test-patts hed waiton itemmaps hed-itemized-waiton))) 
		     (debug:print-info 0 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items")
		     ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%"
		     ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt
		     ;; is this satisfied by merely appending "/" to the waiton name added to the list?
		     ;;
		     ;; This approach causes all of the items in an upstream test to be run 
		     ;; if we have this waiton already processed once we can analzye it for extending
		     ;; tests to be run, since we can't properly process waitons unless they have been







|







737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
                          (waiton-record   (hash-table-ref/default test-records waiton #f))
			  (waiton-tconfig  (if waiton-record (vector-ref waiton-record 1) #f))
			  (waiton-itemized (and waiton-tconfig
						(or (hash-table-ref/default waiton-tconfig "items" #f)
						    (hash-table-ref/default waiton-tconfig "itemstable" #f))))
			  (itemmaps        (tests:get-itemmaps config))  ;; (configf:lookup config "requirements" "itemmap"))
			  (new-test-patts  (tests:extend-test-patts test-patts hed waiton itemmaps hed-itemized-waiton))) 
		     (debug:print-info 2 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items")
		     ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%"
		     ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt
		     ;; is this satisfied by merely appending "/" to the waiton name added to the list?
		     ;;
		     ;; This approach causes all of the items in an upstream test to be run 
		     ;; if we have this waiton already processed once we can analzye it for extending
		     ;; tests to be run, since we can't properly process waitons unless they have been
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
                                   (set! test-names (cons waiton test-names)) ;; need to process this one, only add once the waiton tconfig read
                                   (set! required-tests (cons (conc waiton "/") required-tests))
                                   (set! test-patts new-test-patts))
                                 (begin
                                   (debug:print-info 0 *default-log-port* "Waitor(s) not yet on testpatt for " waiton ", setting up to re-process it")
                                   (set! tal (append (cons waiton tal)(list hed)))))
                             (begin
                               (debug:print-info 0 *default-log-port* "Adding non-itemized test " waiton " to required-tests")
                               (set! required-tests (cons waiton required-tests))
                               (set! test-patts new-test-patts)))
			 (begin
			   (debug:print-info 0 *default-log-port* "No testconfig info yet for " waiton ", setting up to re-process it")
			   (set! tal (append (cons waiton tal)(list hed))))) ;; (cons (conc waiton "/") required-tests))
		     ;; NOPE: didn't work. required needs to be plain test names. Try tacking on to test-patts
		     ;;  - doesn't work







|







759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
                                   (set! test-names (cons waiton test-names)) ;; need to process this one, only add once the waiton tconfig read
                                   (set! required-tests (cons (conc waiton "/") required-tests))
                                   (set! test-patts new-test-patts))
                                 (begin
                                   (debug:print-info 0 *default-log-port* "Waitor(s) not yet on testpatt for " waiton ", setting up to re-process it")
                                   (set! tal (append (cons waiton tal)(list hed)))))
                             (begin
                               (debug:print-info 2 *default-log-port* "Adding non-itemized test " waiton " to required-tests")
                               (set! required-tests (cons waiton required-tests))
                               (set! test-patts new-test-patts)))
			 (begin
			   (debug:print-info 0 *default-log-port* "No testconfig info yet for " waiton ", setting up to re-process it")
			   (set! tal (append (cons waiton tal)(list hed))))) ;; (cons (conc waiton "/") required-tests))
		     ;; NOPE: didn't work. required needs to be plain test names. Try tacking on to test-patts
		     ;;  - doesn't work
895
896
897
898
899
900
901

902
903
904
905
906
907
908
;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature:
;;    (let loop ((hed         (car sorted-test-names))
;;	         (tal         (cdr sorted-test-names))
;;	         (reg         '()) ;; registered, put these at the head of tal 
;;	         (reruns      '()))
(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)
  (let* ((loop-list       (list hed tal reg reruns))

	 (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
			    (if (list? res)
				res
				(begin
				  (debug:print 0 *default-log-port*
					       "ERROR: rmt:get-prereqs-not-met returned non-list!\n"
					       "  res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps)







>







896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature:
;;    (let loop ((hed         (car sorted-test-names))
;;	         (tal         (cdr sorted-test-names))
;;	         (reg         '()) ;; registered, put these at the head of tal 
;;	         (reruns      '()))
(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)
  (let* ((loop-list       (list hed tal reg reruns))
         (junk (debug:print-info 4 *default-log-port* "expand-items calling rmt:get-prereqs-not-met"))
	 (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
			    (if (list? res)
				res
				(begin
				  (debug:print 0 *default-log-port*
					       "ERROR: rmt:get-prereqs-not-met returned non-list!\n"
					       "  res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps)
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
	   (list (car newtal)(append (cdr newtal) reg) '() reruns)
	  #f)) 
     ((null? runnables)
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-7")
      #f) ;; if we get here and non-completed is null then it is all over.
     (else
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-8")
      (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now")
      (list (car newtal)(cdr newtal) reg reruns)))))

(define (runs:mixed-list-testname-and-testrec->list-of-strings inlst)
  (if (null? inlst)
      '()
      (map (lambda (t)
	     (cond







|







1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
	   (list (car newtal)(append (cdr newtal) reg) '() reruns)
	  #f)) 
     ((null? runnables)
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-7")
      #f) ;; if we get here and non-completed is null then it is all over.
     (else
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-8")
      (debug:print 2 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now")
      (list (car newtal)(cdr newtal) reg reruns)))))

(define (runs:mixed-list-testname-and-testrec->list-of-strings inlst)
  (if (null? inlst)
      '()
      (map (lambda (t)
	     (cond
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
		  (debug:print-info 0 *default-log-port* "Skipping test " tfullname " as it has been marked do not run due to being completed or not runnable"))
	      (if (or (not (null? tal))(not (null? reg)))
		  (loop (runs:queue-next-hed tal reg reglen regfull)
			(runs:queue-next-tal tal reg reglen regfull)
			(runs:queue-next-reg tal reg reglen regfull)
			reruns))))
        ;; (loop (car tal)(cdr tal) reg reruns))))

	(runs:incremental-print-results run-id)
	(debug:print 4 *default-log-port* "TOP OF LOOP => "
		     "test-name: " test-name
		     "\n  hed:         " hed
		     "\n  tal:         " (runs:pretty-long-list tal)
		     "\n  reg:         " reg
                     "\n  test-record  " test-record







<







1667
1668
1669
1670
1671
1672
1673

1674
1675
1676
1677
1678
1679
1680
		  (debug:print-info 0 *default-log-port* "Skipping test " tfullname " as it has been marked do not run due to being completed or not runnable"))
	      (if (or (not (null? tal))(not (null? reg)))
		  (loop (runs:queue-next-hed tal reg reglen regfull)
			(runs:queue-next-tal tal reg reglen regfull)
			(runs:queue-next-reg tal reg reglen regfull)
			reruns))))
        ;; (loop (car tal)(cdr tal) reg reruns))))

	(runs:incremental-print-results run-id)
	(debug:print 4 *default-log-port* "TOP OF LOOP => "
		     "test-name: " test-name
		     "\n  hed:         " hed
		     "\n  tal:         " (runs:pretty-long-list tal)
		     "\n  reg:         " reg
                     "\n  test-record  " test-record
1849
1850
1851
1852
1853
1854
1855
1856

1857
1858
1859
1860
1861

1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873

1874
1875
1876
1877
1878
1879
1880
		  (if loop-list
		      (apply loop loop-list)
                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
                      )
                  )
		;; if can't run more just loop with next possible test
		(loop (car newtal)(cdr newtal) reg reruns))))
         

	 ;; this case should not happen, added to help catch any bugs
	 ((and (list? items) itemdat)
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-5")
	  (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this")
	  (exit 1))

	 ((not (null? reruns))
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-6")
	  (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
		 (junked (lset-difference equal? tal newlst)))
	    (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal)
	    (if (< num-retries max-retries)
		(set! newlst (append reruns newlst)))
	    (set! num-retries (+ num-retries 1))
	    ;; (thread-sleep! (+ 1 *global-delta*))
	    (if (not (null? newlst))
		;; since reruns have been tacked on to newlst create new reruns from junked
		(loop (car newlst)(cdr newlst) reg (delete-duplicates junked)))))

	 ((not (null? tal))
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-7")
	  (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here."))
	 ((not (null? reg)) ;; could we get here with leftovers?
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-8")
	  (debug:print-info 0 *default-log-port* "Have leftovers!")
	  (loop (car reg)(cdr reg) '() reruns))







|
>





>












>







1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
		  (if loop-list
		      (apply loop loop-list)
                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
                      )
                  )
		;; if can't run more just loop with next possible test
		(loop (car newtal)(cdr newtal) reg reruns))))
        

	 ;; this case should not happen, added to help catch any bugs
	 ((and (list? items) itemdat)
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-5")
	  (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this")
	  (exit 1))

	 ((not (null? reruns))
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-6")
	  (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
		 (junked (lset-difference equal? tal newlst)))
	    (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal)
	    (if (< num-retries max-retries)
		(set! newlst (append reruns newlst)))
	    (set! num-retries (+ num-retries 1))
	    ;; (thread-sleep! (+ 1 *global-delta*))
	    (if (not (null? newlst))
		;; since reruns have been tacked on to newlst create new reruns from junked
		(loop (car newlst)(cdr newlst) reg (delete-duplicates junked)))))

	 ((not (null? tal))
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-7")
	  (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here."))
	 ((not (null? reg)) ;; could we get here with leftovers?
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-8")
	  (debug:print-info 0 *default-log-port* "Have leftovers!")
	  (loop (car reg)(cdr reg) '() reruns))
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
	     (set! runflag #t))

	    (else (set! runflag #f)))
	   (debug:print 4 *default-log-port* "RUNNING => runflag: " runflag " STATE: " (test:get-state testdat) " STATUS: " (test:get-status testdat))
	   (if (not runflag)
	       (if (not parent-test)
		   (if (runs:lownoise (conc "not starting test" full-test-name) 60)
		       (debug:print 1 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) 
				    "\" and status \"" (test:get-status testdat) "\", use -rerun \"" (test:get-status testdat)
				    "\" or -force to override")))
	       ;; NOTE: No longer be checking prerequisites here! Will never get here unless prereqs are
	       ;;       already met.
	       ;; This would be a great place to do the process-fork
	       ;; 
	       (let ((skip-test   #f)







|







2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
	     (set! runflag #t))

	    (else (set! runflag #f)))
	   (debug:print 4 *default-log-port* "RUNNING => runflag: " runflag " STATE: " (test:get-state testdat) " STATUS: " (test:get-status testdat))
	   (if (not runflag)
	       (if (not parent-test)
		   (if (runs:lownoise (conc "not starting test" full-test-name) 60)
		       (debug:print 3 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) 
				    "\" and status \"" (test:get-status testdat) "\", use -rerun \"" (test:get-status testdat)
				    "\" or -force to override")))
	       ;; NOTE: No longer be checking prerequisites here! Will never get here unless prereqs are
	       ;;       already met.
	       ;; This would be a great place to do the process-fork
	       ;; 
	       (let ((skip-test   #f)
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
		    ;; seek and kill in flight -runtests with % as testpatt here
		    ;; (if (equal? testpatt "%")
		    (tasks:kill-runner target run-name testpatt)
		    ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    (debug:print 1 *default-log-port* "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
		   ((run-wait)
		    (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete"))
		   ((archive)
		    (debug:print 1 *default-log-port* "Archiving/restoring (" (args:get-arg "-archive") ") data for run: " runkey " " (db:get-value-by-header run header "runname"))







|







2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
		    ;; seek and kill in flight -runtests with % as testpatt here
		    ;; (if (equal? testpatt "%")
		    (tasks:kill-runner target run-name testpatt)
		    ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    (debug:print 2 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
		   ((run-wait)
		    (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete"))
		   ((archive)
		    (debug:print 1 *default-log-port* "Archiving/restoring (" (args:get-arg "-archive") ") data for run: " runkey " " (db:get-value-by-header run header "runname"))
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
				       )
				      ) ; end case rem-status
                                    ) ; end let
                                  ); end cond has-subrun

                                 (else
                                  ;; BB - TODO - consider backgrounding to threads to delete tests (work below) 
                                  (debug:print-info 0 *default-log-port* "test: " test-name " itest-state: " test-state)
                                  (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                      (begin
                                        (if (not (hash-table-ref/default test-retry-time test-fulln #f))
                                            (begin
                                              ;; want to set to REMOVING BUT CANNOT do it here?
                                              (hash-table-set! test-retry-time test-fulln (current-seconds))))
                                        (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)







|







2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
				       )
				      ) ; end case rem-status
                                    ) ; end let
                                  ); end cond has-subrun

                                 (else
                                  ;; BB - TODO - consider backgrounding to threads to delete tests (work below) 
                                  (debug:print-info 2 *default-log-port* "test: " test-name " itest-state: " test-state)
                                  (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                      (begin
                                        (if (not (hash-table-ref/default test-retry-time test-fulln #f))
                                            (begin
                                              ;; want to set to REMOVING BUT CANNOT do it here?
                                              (hash-table-set! test-retry-time test-fulln (current-seconds))))
                                        (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
	;; 			      (thread-sleep! 1)
	 ;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)))))))
	 )
    (case clean-mode
      ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f))
      ((remove-all)      (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f))
      ((archive-remove)  (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f)))
    (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
    (if (and real-dir 
	     (> (string-length real-dir) 5)
	     (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
	(let* ((realpath (resolve-pathname run-dir)))
	  (debug:print-info 1 *default-log-port* "Recursively removing " realpath)
	  (if (common:file-exists? realpath)
	      (runs:safe-delete-test-dir realpath)







|







2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
	;; 			      (thread-sleep! 1)
	 ;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)))))))
	 )
    (case clean-mode
      ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f))
      ((remove-all)      (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f))
      ((archive-remove)  (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f)))
    (debug:print-info 2 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
    (if (and real-dir 
	     (> (string-length real-dir) 5)
	     (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
	(let* ((realpath (resolve-pathname run-dir)))
	  (debug:print-info 1 *default-log-port* "Recursively removing " realpath)
	  (if (common:file-exists? realpath)
	      (runs:safe-delete-test-dir realpath)
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
	  (let* ((linktree (common:get-linktree)) ;; (if toppath (configf:lookup *configdat* "setup" "linktree")))
		 (runtop   (conc linktree "/" target "/" runname))
		 (files    (if (common:file-exists? runtop)
			       (append (glob (conc runtop "/.megatest*"))
				       (glob (conc runtop "/.runconfig*")))
			       '())))
	    (if (null? files)
		(debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.")
		(begin
		  (debug:print-info 0 *default-log-port* "Removing cached files:\n    " (string-intersperse files "\n    "))
		  (for-each 
		   (lambda (f)
		     (handle-exceptions
			 exn
			 (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn)
		       (delete-file f)))
		   files))))
	  (debug:print-error 0 *default-log-port* "-clean-cache requires -runname."))
      (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))







|

|









3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
	  (let* ((linktree (common:get-linktree)) ;; (if toppath (configf:lookup *configdat* "setup" "linktree")))
		 (runtop   (conc linktree "/" target "/" runname))
		 (files    (if (common:file-exists? runtop)
			       (append (glob (conc runtop "/.megatest*"))
				       (glob (conc runtop "/.runconfig*")))
			       '())))
	    (if (null? files)
		(debug:print-info 2 *default-log-port* "No cached megatest or runconfigs files found. None removed.")
		(begin
		  (debug:print-info 2 *default-log-port* "Removing cached files:\n    " (string-intersperse files "\n    "))
		  (for-each 
		   (lambda (f)
		     (handle-exceptions
			 exn
			 (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn)
		       (delete-file f)))
		   files))))
	  (debug:print-error 0 *default-log-port* "-clean-cache requires -runname."))
      (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))

Modified server.scm from [6d2db21290] to [99d72bd3eb].

109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((fs)   result)
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))







|
|







109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((fs)   result)
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169


170
171
172
173
174
175
176
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 



(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))
        (dbprep-found 0)) 
    (handle-exceptions
	exn







|
|
<






|
|
>
>







152
153
154
155
156
157
158
159
160

161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))

    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 

(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))
        (dbprep-found 0)) 
    (handle-exceptions
	exn
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233



234
235
236
237
238
239
240
241
242
243
244
245
246
			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 
                   (if dbprep-found
                      (begin
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
                         (thread-sleep! 25)
                      )
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))
                   )
		    (list #f #f #f #f)))))))))

;; get a list of servers with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then
    ;; continue
    (if (if (directory-exists? (conc areapath "/logs"))
	    '()
	    (if (file-write-access? areapath)
		(begin
		  (condition-case
		   (create-directory (conc areapath "/logs") #t)
		   (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
		   (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
		  (directory-exists? (conc areapath "/logs")))
		'()))

        ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited.



	(let* ((server-logs-cmd  (conc "grep -iL exiting " areapath "/logs/server-*-*.log"))
               (server-logs   (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all))))
	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 1  *default-log-port* "There are no servers running")
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn







|
|

|



|



















|
>
>
>
|
<



|







200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238

239
240
241
242
243
244
245
246
247
248
249
			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 
                   (if dbprep-found
                      (begin
                         (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
                         (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting?
                      )
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))
                   )
		    (list #f #f #f #f)))))))))

;; get a list of servers from the log files, with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then
    ;; continue
    (if (if (directory-exists? (conc areapath "/logs"))
	    '()
	    (if (file-write-access? areapath)
		(begin
		  (condition-case
		   (create-directory (conc areapath "/logs") #t)
		   (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
		   (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
		  (directory-exists? (conc areapath "/logs")))
		'()))

        ;; Get the list of server logs. First remove logs for servers that have exited.
	(let* (
               ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers.
               ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'")))
               (server-logs   (glob (conc areapath "/logs/server-*-*.log")))

	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 2  *default-log-port* "There are no servers running at " (common:human-time))
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn
363
364
365
366
367
368
369
370



371
372
373
374
375
376
377
378
379
380





381
382
383
384
385
386
387
388
389
390
391


392

393
394

395
396
397
398


399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425



426
427
428
429
430
431
432

(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))

;; wait for server=start-last to be three seconds old



;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))





	  (if (and all-go
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag
		       (lambda ()
			 (print server-key)))
		     (thread-sleep! 0.25)
		     (let ((res (with-input-from-file start-flag
				  (lambda ()
				    (read-line)))))
		       (equal? server-key res))))


	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe

	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "

				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)
		(server:wait-for-server-start-last-flag areapath)))))))



;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     (run-delay    (+ (case call-num
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(let* ((start-flag (conc areapath "/logs/server-start-last")))
		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
                  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
		  (system (conc "touch " start-flag)) ;; lazy but safe
		  (server:run areapath)
		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))




;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))







|
>
>
>




|




|
>
>
>
>
>
|

|
|
<
<

|
<
<
|
>
>
|
>


>
|
|


>
>
|
|



|


|
<
<
<
<
<
<
<
<

<
|
|
|
|
|
|
|
|
>
>
>







366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395


396
397


398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420








421

422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439

(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))


;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (old-enough   (> delta idletime))
               (new-server-key "")
              )

          ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
	  ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
           (if (and old-enough
		   (begin
                     (debug:print-info 2 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag (lambda () (print server-key)))


		     (thread-sleep! 0.25)
		     (set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))


		     (equal? server-key new-server-key))
                )
	      #t

           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time> + 1, then call this function recursively. 
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))

		(thread-sleep! ( + 1 idletime))
		(server:wait-for-server-start-last-flag areapath)))))))


        
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* (








	     (lock-file    (conc areapath "/logs/server-start.lock")))

	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
	  (debug:print-info  2 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))

      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
   )
)

;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
    (or ns numservers)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers))
	 (servers       (server:get-best (server:get-list areapath))))
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (random ns)))) ;; somewhere between 0 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                hed
                (if (null? tal)







|





|







454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
    (or ns numservers)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
	 (servers       (server:get-best (server:get-list areapath))))
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                hed
                (if (null? tal)

Modified subrun.scm from [bd1952a98c] to [85650ceb7f].

126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
      (let* ((action-switches-str
              (conc "-kill-runs" ))
             (kill-result
              (subrun:exec-sub-megatest test-run-dir action-switches-str "kill")))
        kill-result)
      #t))

(define (subrun:launch-cmd test-run-dir #!optional (sub-cmd "-run")) ;; BUG: "-run" should be changed to "-rerun-clean" but current doesn't work
  (if (subrun:subrun-removed? test-run-dir)
      (subrun:unset-subrun-removed test-run-dir))      

  (let* ((log-prefix "run")
         (switches (subrun:selector+log-switches test-run-dir log-prefix))
         (run-wait #t)
         (cmd      (conc "megatest " sub-cmd " " switches" "
                         (if run-wait "-run-wait " ""))))
    cmd))


(define (subrun:sanitize-path inpath)
  (let* ((insane-pattern (irregex "[^[a-zA-Z0-9_\\-]")))







|





|







126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
      (let* ((action-switches-str
              (conc "-kill-runs" ))
             (kill-result
              (subrun:exec-sub-megatest test-run-dir action-switches-str "kill")))
        kill-result)
      #t))

(define (subrun:launch-cmd test-run-dir run-mode #!optional (sub-cmd "-run")) ;; BUG: "-run" should be changed to "-rerun-clean" but current doesn't work
  (if (subrun:subrun-removed? test-run-dir)
      (subrun:unset-subrun-removed test-run-dir))      

  (let* ((log-prefix "run")
         (switches (subrun:selector+log-switches test-run-dir log-prefix))
         (run-wait (equal? run-mode "yes"))
         (cmd      (conc "megatest " sub-cmd " " switches" "
                         (if run-wait "-run-wait " ""))))
    cmd))


(define (subrun:sanitize-path inpath)
  (let* ((insane-pattern (irregex "[^[a-zA-Z0-9_\\-]")))

Modified utils/mk_wrapper from [713ec8f660] to [4fc79e58e7].

42
43
44
45
46
47
48









49
50
51
52
53
54
55
if [ "\$LD_LIBRARY_PATH" != "" ];then
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:\$LD_LIBRARY_PATH:$libdir
else
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$libdir
fi

export MT_SQLITE3_EXE=$sqlite3_exe









__EOF
) > $cfgfile
  echo 
else
  echo "INFO: LD_LIBRARY_PATH not set" >&2
fi








>
>
>
>
>
>
>
>
>







42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
if [ "\$LD_LIBRARY_PATH" != "" ];then
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:\$LD_LIBRARY_PATH:$libdir
else
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$libdir
fi

export MT_SQLITE3_EXE=$sqlite3_exe

http_vars="http_proxy https_proxy HTTP_PROXY HTTPS_PROXY"
for i in \$http_vars
do
j=\${!i}
if [ "\$j" != "" ]; then
   unset \$i
fi
done
__EOF
) > $cfgfile
  echo 
else
  echo "INFO: LD_LIBRARY_PATH not set" >&2
fi

91
92
93
94
95
96
97
98
99
100
101
102
103
if [[ \$(ulimit -a | grep 'open files' | awk '{print \$4}') -gt 10000 ]];then ulimit -n 10000;fi
EOF

# echo "#!/bin/bash" > $target
# echo "exec $prefix/bin/.\$(lsb_release -sr)/$cmd \"\$@\"" >> $target

echo "lsbr=\$(lsb_release -sr)" >> $target
if [ "$LD_LIBRARY_PATH" != "" ];then
  echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target
fi

# echo "if [[ -e \$lsbr ]];then source \$lsbr;fi" >> $target
echo "exec $prefix/bin/.\$lsbr/$cmd \"\$@\"" >> $target







<
|
<



100
101
102
103
104
105
106

107

108
109
110
if [[ \$(ulimit -a | grep 'open files' | awk '{print \$4}') -gt 10000 ]];then ulimit -n 10000;fi
EOF

# echo "#!/bin/bash" > $target
# echo "exec $prefix/bin/.\$(lsb_release -sr)/$cmd \"\$@\"" >> $target

echo "lsbr=\$(lsb_release -sr)" >> $target

echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target


# echo "if [[ -e \$lsbr ]];then source \$lsbr;fi" >> $target
echo "exec $prefix/bin/.\$lsbr/$cmd \"\$@\"" >> $target