Megatest

Diff
Login

Differences From Artifact [4385dac738]:

To Artifact [7b35777037]:


34
35
36
37
38
39
40
41
42
43





44
45
46

47
48
49
50
51
52

53
54
55
56
57
58
59
60
34
35
36
37
38
39
40



41
42
43
44
45
46
47

48
49
50
51
52
53

54

55
56
57
58
59
60
61







-
-
-
+
+
+
+
+


-
+





-
+
-







(declare (uses tasksmod))
(declare (uses testsmod))
(declare (uses itemsmod))

(module runsmod
	*
	
(import scheme chicken data-structures extras ports files)

(use (prefix base64 base64:)
(import scheme chicken.base chicken.random chicken.port chicken.file chicken.string)
(import chicken.time chicken.condition chicken.process chicken.process-context.posix chicken.process-context)
(import system-information chicken.process.signal chicken.sort chicken.file.posix chicken.io chicken.time.posix chicken.pretty-print)
(import chicken.pathname) 
(import (prefix base64 base64:)
     (prefix sqlite3 sqlite3:)
     call-with-environment-variables
     csv
     ;;csv
     directory-utils
     format
     matchable
     message-digest
     md5
     ports
     chicken.port
     posix
     regex
     srfi-1
     srfi-1
     srfi-13
     srfi-18
     srfi-18
     srfi-69
94
95
96
97
98
99
100
101

102
103
104
105
106
107
108
95
96
97
98
99
100
101

102
103
104
105
106
107
108
109







-
+







  hed tal reg reruns  test-record
  test-name item-path jobgroup
  waitons testmode  newtal itemmaps prereqs-not-met)

(define (set-item-env-vars itemdat)
  (for-each (lambda (item)
	      (debug:print 2 *default-log-port* "setenv " (car item) " " (cadr item))
	      (setenv (car item) (cadr item)))
	      (set-environment-variable! (car item) (cadr item)))
	    itemdat))

;; Every time can-run-more-tests is called increment the delay
;;
;; NOTE: We run this server-side!! Do not use this global except in the runs:can-run-more-tests routine
;;
(define *last-num-running-tests* 0)
325
326
327
328
329
330
331
332

333
334
335
336
337
338
339
326
327
328
329
330
331
332

333
334
335
336
337
338
339
340







-
+







(define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names
  (let* ((keys               (keys:config-get-fields *configdat*))
	 (keyvals            (keys:target->keyval keys target))
	 (run-id             (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour")))  ;;  test-name)))
	 ;; (deferred          '()) ;; delay running these since they have a waiton clause
	 (runconfigf         (conc  *toppath* "/runconfigs.config"))
         (dbfile             (conc  *toppath* "/megatest.db"))
         (readonly-mode      (not (file-write-access? dbfile)))
         (readonly-mode      (not (file-writable? dbfile)))
	 (test-records       (make-hash-table))
	 ;; need to process runconfigs before generating these lists
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; Generated by a call to (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;; Put fully qualified test/testpath names in this list to be done
         (waitors-upon       (make-hash-table)) ;; given a test, return list of tests waiting upon this test.
492
493
494
495
496
497
498
499

500
501
502
503
504
505
506
493
494
495
496
497
498
499

500
501
502
503
504
505
506
507







-
+







    ;;======================================================================
    
    (if (not (null? test-names)) ;; BEGIN test-names loop
	(let loop ((hed (car test-names))   ;; NOTE: This is the main loop that iterates over the test-names
		   (tal (cdr test-names)))         ;; 'return-procs tells the config reader to prep running system but return a proc
          (debug:print-info 4 *default-log-port* "\n\ntestpatt elaboration loop => hed="hed " tal="tal" test-patts="test-patts" test-names="test-names)
	  (change-directory *toppath*) ;; PLEASE OPTIMIZE ME!!! I think this should be a no-op but there are several places where change-directories could be happening.
	  (setenv "MT_TEST_NAME" hed) ;; 
	  (set-environment-variable! "MT_TEST_NAME" hed) ;; 
	  (let*-values (((waitons waitors config) (tests:get-waitons hed all-tests-registry))

			;; NOTE: Have the config - can extract [waitons] section
			
                        ((hed-mode)
                         (let ((m (configf:lookup config "requirements" "mode")))
                           (if m (map string->symbol (string-split m)) '(normal))))
783
784
785
786
787
788
789
790
791


792
793
794
795
796
797
798
784
785
786
787
788
789
790


791
792
793
794
795
796
797
798
799







-
-
+
+







     ;;       - prereqs are completed and passed (we could consider removing "and passed" -- it would change behavior from current)          
    ((or (null? prereqs-not-met)
	  (and (member 'toplevel testmode)
	       (null? non-completed)))
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-2")
      (debug:print-info 4 *default-log-port* "runs:expand-items: (or (null? prereqs-not-met) (and (member 'toplevel testmode)(null? non-completed)))")
      (let ((test-name (tests:testqueue-get-testname test-record)))
	(setenv "MT_TEST_NAME" test-name) ;; 
	(setenv "MT_RUNNAME"   runname)
	(set-environment-variable! "MT_TEST_NAME" test-name) ;; 
	(set-environment-variable! "MT_RUNNAME"   runname)
	(runs:set-megatest-env-vars run-id inrunname: runname) ;; these may be needed by the launching process
	(let ((items-list (items:get-items-from-config tconfig)))
	  (if (list? items-list)
	      (begin
		(if (null? items-list)
		    (let ((test-id   (rmt:get-test-id run-id test-name ""))
			  (num-items (rmt:test-toplevel-num-items run-id test-name)))
1914
1915
1916
1917
1918
1919
1920
1921

1922
1923
1924
1925
1926
1927
1928
1915
1916
1917
1918
1919
1920
1921

1922
1923
1924
1925
1926
1927
1928
1929







-
+







     (lambda (vardat)
       (let ((var (car vardat))
	     (val (cdr vardat)))
	 (if (not (equal? (get-environment-variable var) val))
	     (handle-exceptions
	      exn
	      (debug:print-error 0 *default-log-port* "Failed to set " var " to " val)
	      (setenv var val)))))
	      (set-environment-variable! var val)))))
     all-vars)
        ))

;;======================================================================
;; END OF NEW STUFF
;;======================================================================

2065
2066
2067
2068
2069
2070
2071
2072

2073
2074
2075
2076
2077
2078
2079
2066
2067
2068
2069
2070
2071
2072

2073
2074
2075
2076
2077
2078
2079
2080







-
+







	 (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f)))
	 (rp-mutex     (make-mutex))
	 (bup-mutex    (make-mutex))
         (keep-records (args:get-arg "-keep-records"))) ;; used in conjunction with -remove-runs to keep the records, TODO: consolidate this with "mode".

    (let* ((write-access-actions '(remove-runs set-state-status archive run-wait kill-runs))
           (dbfile             (conc  *toppath* "/megatest.db"))
           (readonly-mode      (not (file-write-access? dbfile))))
           (readonly-mode      (not (file-writable? dbfile))))
      (when (and readonly-mode
                 (member action write-access-actions))
        (debug:print-error 0 *default-log-port* "megatest.db is readonly.  Cannot proceed with action ["action"] in which write-access isrequired .")
        (exit 1)))
    
    (debug:print-info 4 *default-log-port* "runs:operate-on => Header: " header " action: " action " new-state-status: " new-state-status)
    (if (> 2 (length state-status))
2871
2872
2873
2874
2875
2876
2877
2878
2879


2880
2881
2882
2883



2884
2885
2886
2887
2888
2889
2890
2872
2873
2874
2875
2876
2877
2878


2879
2880
2881



2882
2883
2884
2885
2886
2887
2888
2889
2890
2891







-
-
+
+

-
-
-
+
+
+







			    #t)
			  (process-signal pid signal/int)
			  (thread-sleep! 5)
			  (if (process:alive? pid)
			      (process-signal pid signal/kill)))))
		   ;;  (call-with-environment-variables
		   (let ((old-targethost (getenv "TARGETHOST")))
		     (setenv "TARGETHOST" hostname)
		     (setenv "TARGETHOST_LOGF" "server-kills.log")
		     (set-environment-variable! "TARGETHOST" hostname)
		     (set-environment-variable! "TARGETHOST_LOGF" "server-kills.log")
		     (system (conc "nbfake kill " pid))
		     (if old-targethost (setenv "TARGETHOST" old-targethost))
		     (unsetenv "TARGETHOST")
		     (unsetenv "TARGETHOST_LOGF"))))
		     (if old-targethost (set-environment-variable! "TARGETHOST" old-targethost))
		     (unset-environment-variable! "TARGETHOST")
		     (unset-environment-variable! "TARGETHOST_LOGF"))))
	     (debug:print-error 0 *default-log-port* "no record or improper record for " target "/" run-name " in tasks_queue in main.db"))))
     records)))

(define (task:get-run-times)
   (let* ( 
           (run-patt (if (args:get-arg "-run-patt")
                        (args:get-arg "-run-patt")
3541
3542
3543
3544
3545
3546
3547
3548

3549
3550
3551
3552
3553
3554
3555
3542
3543
3544
3545
3546
3547
3548

3549
3550
3551
3552
3553
3554
3555
3556







-
+







	 (maxload (if force-maxload
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
         (adjwait (min (+ 300 (pseudo-random-integer 10)) (abs (* (+ (pseudo-random-integer 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
3792
3793
3794
3795
3796
3797
3798
3799

3800
3801
3802
3803
3804
3805
3806
3793
3794
3795
3796
3797
3798
3799

3800
3801
3802
3803
3804
3805
3806
3807







-
+







;; Do NOT check if not on homehost!
;;
(define (common:exit-on-version-changed)
  (if (common:on-homehost?)
      (if (common:api-changed?)
	  (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config"))
                (dbfile  (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db"))
                (read-only (not (file-write-access? dbfile)))
                (read-only (not (file-writable? dbfile)))
                (dbstruct (db:setup #t)))
	    (debug:print 0 *default-log-port*
			 "WARNING: Version mismatch!\n"
			 "   expected: " (common:version-signature) "\n"
			 "   got:      " (common:get-last-run-version))
            (cond
             ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t)
3930
3931
3932
3933
3934
3935
3936
3937

3938
3939
3940
3941
3942
3943
3944
3931
3932
3933
3934
3935
3936
3937

3938
3939
3940
3941
3942
3943
3944
3945







-
+







  (let* ((ns            (server:get-num-servers))
	 (servers       (server:get-best (server:get-list areapath))))
    ;; (print "servers: " servers " ns: " ns)
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (random ns)))) ;; somewhere between 0 and numservers
		 (< (length servers) (pseudo-random-integer ns)))) ;; somewhere between 0 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                res
                (if (null? tal)
3986
3987
3988
3989
3990
3991
3992
3993

3994
3995
3996


3997
3998
3999
4000


4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016

4017
4018
4019
4020
4021
4022
4023
3987
3988
3989
3990
3991
3992
3993

3994
3995


3996
3997
3998
3999


4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016

4017
4018
4019
4020
4021
4022
4023
4024







-
+

-
-
+
+


-
-
+
+















-
+







    (if (and target-host 
	     ;; look at target host, is it host.domain.tld or ip address and does it 
	     ;; match current ip or hostname
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
	  (set-environment-variable! "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    (set-environment-variable! "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (pseudo-random-integer 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (unset-environment-variable! "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unset-environment-variable! "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
#;(define (server:kind-run areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     (run-delay    (+ (case call-num
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
			      (pseudo-random-integer 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(begin
		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
		  (server:run areapath)
		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
4087
4088
4089
4090
4091
4092
4093
4094

4095
4096
4097
4098
4099
4100
4101
4088
4089
4090
4091
4092
4093
4094

4095
4096
4097
4098
4099
4100
4101
4102







-
+







		(if (> my-start-time (handle-exceptions
					 exn
					 0
				       (file-modification-time lockf)))
		    ;; we started since current re-gen in flight, delay a little and try again
		    (begin
		      (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it")
		      (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds
		      (thread-sleep! (+ 5 (pseudo-random-integer 5))) ;; delay between 5 and 10 seconds
		      (loop (common:simple-file-lock lockf))))))))))

(define (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)
  (let ((counts              (make-hash-table))
	(statecounts         (make-hash-table))
	(outtxt              "")
	(tot                 0)
4768
4769
4770
4771
4772
4773
4774
4775

4776
4777
4778
4779
4780
4781
4782
4769
4770
4771
4772
4773
4774
4775

4776
4777
4778
4779
4780
4781
4782
4783







-
+







						      '()
						      (lambda (x p)
							(let* ((targ-path (string-intersperse p "/"))
                                                               (full-path (conc linktree "/" targ-path))
                                                               (run-name  (car (reverse p))))
                                                          (if (and (common:file-exists? full-path)
                                                                   (directory?   full-path)
                                                                   (file-write-access? full-path))
                                                                   (file-writable? full-path))
                                                              (s:a run-name 'href (conc targ-path "/run-summary.html"))
                                                              (begin
                                                                (debug:print 0 *default-log-port* "INFO: Can't create " targ-path "/run-summary.html")
                                                                (conc run-name " (Not able to create summary at " targ-path ")")))))))))))
          (close-output-port oup)
	  (common:simple-file-release-lock lockfile)
               
4807
4808
4809
4810
4811
4812
4813
4814

4815
4816
4817
4818
4819
4820
4821
4808
4809
4810
4811
4812
4813
4814

4815
4816
4817
4818
4819
4820
4821
4822







-
+







                                           path-parts))
                                       test-dats))
                    (tests-htree (common:list->htree tests-tree-dat))
                    (html-dir    (conc linktree "/" (string-intersperse run-dir "/")))
                    (html-path   (conc html-dir "/run-summary.html"))
                    (oup         (if (and (common:file-exists? html-dir)
                                          (directory?   html-dir)
                                          (file-write-access? html-dir))
                                          (file-writable? html-dir))
                                     (open-output-file  html-path)
                                     #f)))
               ;; (print "run-dir: " run-dir ", tests-tree-dat: " tests-tree-dat)
               (if oup
                   (begin
                     (s:output-new
                      oup
4839
4840
4841
4842
4843
4844
4845
4846

4847
4848
4849
4850
4851
4852
4853
4840
4841
4842
4843
4844
4845
4846

4847
4848
4849
4850
4851
4852
4853
4854







-
+







                                                                          (alt-file  (conc full-targ "/megatest-rollup-" test-name ".html"))
                                                                          (html-file (if (common:file-exists? alt-file)
                                                                                         alt-file
                                                                                         std-file))
                                                                          (run-name  (car (reverse p))))
                                                                     (if (and (not (common:file-exists? full-targ))
                                                                              (directory? full-targ)
                                                                              (file-write-access? full-targ))
                                                                              (file-writable? full-targ))
                                                                         (tests:summarize-test 
                                                                          run-id 
                                                                          (rmt:get-test-id run-id test-name item-path)))
                                                                     (if (common:file-exists? full-targ)
                                                                         (s:a run-name 'href html-file)
                                                                         (begin
                                                                           (debug:print 0 *default-log-port* "ERROR: can't access " full-targ)
4918
4919
4920
4921
4922
4923
4924
4925

4926
4927
4928
4929
4930
4931
4932
4919
4920
4921
4922
4923
4924
4925

4926
4927
4928
4929
4930
4931
4932
4933







-
+







;; summarize test in to a file test-summary.html in the test directory
;;
(define (tests:summarize-test run-id test-id)
  (let* ((test-dat  (rmt:get-test-info-by-id run-id test-id))
	 (out-dir   (db:test-get-rundir test-dat))
	 (out-file  (conc out-dir "/test-summary.html")))
    ;; first verify we are able to write the output file
    (if (not (file-write-access? out-dir))
    (if (not (file-writable? out-dir))
	(debug:print 0 *default-log-port* "ERROR: cannot write test-summary.html to " out-dir)
	(let* (;; (steps-dat (rmt:get-steps-for-test run-id test-id))
	       (test-name (db:test-get-testname test-dat))
	       (item-path (db:test-get-item-path test-dat))
	       (full-name (db:test-make-full-name test-name item-path))
	       (oup       (open-output-file out-file))
	       (status    (db:test-get-status   test-dat))
5122
5123
5124
5125
5126
5127
5128
5129

5130
5131
5132
5133
5134
5135
5136
5123
5124
5125
5126
5127
5128
5129

5130
5131
5132
5133
5134
5135
5136
5137







-
+







    (if enccmd
	(common:read-encoded-string enccmd)
	'())))

;; return (conc status ": " comment) from the final section so that
;;   the comment can be set in the step record in launch.scm
;;
(define (launch:load-logpro-dat run-id test-id stepname)
#;(define (launch:load-logpro-dat run-id test-id stepname)
  (let ((cname (conc stepname ".dat")))
    (if (common:file-exists? cname)
	(let* ((dat  (configf:read-config cname #f #f))
	       (csvr (db:logpro-dat->csv dat stepname))
	       (csvt (let-values (((fmt-cell fmt-record fmt-csv) (make-format ",")))
		       (fmt-csv (map list->csv-record csvr))))
	       (status (configf:lookup dat "final" "exit-status"))
5254
5255
5256
5257
5258
5259
5260
5261
5262


5263
5264
5265
5266
5267
5268
5269
5255
5256
5257
5258
5259
5260
5261


5262
5263
5264
5265
5266
5267
5268
5269
5270







-
-
+
+







    
    (let ((exinfo (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2))
	  (logfna (if logpro-used (conc stepname ".html") ""))
	  (comment #f))
      (if logpro-used
	  (let ((datfile (conc stepname ".dat")))
	    ;; load the .dat file into the test_data table if it exists
	    (if (common:file-exists? datfile)
		(set! comment (launch:load-logpro-dat run-id test-id stepname)))
	    ;;(if (common:file-exists? datfile)
	;;	(set! comment (launch:load-logpro-dat run-id test-id stepname)))
	    (rmt:test-set-log! run-id test-id (conc stepname ".html"))))
      (rmt:teststep-set-status! run-id test-id stepname "end" exinfo comment logfna))
    ;; set the test final status
    (let* ((process-exit-status (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2))
	   (this-step-status (cond
			      ((and (eq? process-exit-status 2) logpro-used) 'warn)   ;; logpro 2 = warnings
			      ((and (eq? process-exit-status 3) logpro-used) 'check)  ;; logpro 3 = check
5413
5414
5415
5416
5417
5418
5419
5420
5421


5422
5423
5424
5425
5426
5427
5428
5414
5415
5416
5417
5418
5419
5420


5421
5422
5423
5424
5425
5426
5427
5428
5429







-
-
+
+







			     (prevstep #f))
                    (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"")
		    ;; check exit-info (vector-ref exit-info 1)
		    (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1)
			(let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig))
			      (stepname    (car ezstep)))
			  ;; if logpro-used read in the stepname.dat file
			  (if (and logpro-used (common:file-exists? (conc stepname ".dat")))
			      (launch:load-logpro-dat run-id test-id stepname))
			  ;;(if (and logpro-used (common:file-exists? (conc stepname ".dat")))
			  ;;    (launch:load-logpro-dat run-id test-id stepname))
			  (if (steprun-good? logpro-used (launch:einf-exit-code exit-info))
			      (if (not (null? tal))
				  (loop (car tal) (cdr tal) stepname))
			      (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping")))
			(debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))))))))

(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
5551
5552
5553
5554
5555
5556
5557
5558
5559


5560
5561
5562
5563

5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583

5584
5585
5586
5587
5588
5589
5590
5552
5553
5554
5555
5556
5557
5558


5559
5560
5561
5562
5563

5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583

5584
5585
5586
5587
5588
5589
5590
5591







-
-
+
+



-
+



















-
+







  (let* ((target    (or intarget 
			(common:args-get-target)
			(get-environment-variable "MT_TARGET")))
	 (keys      (if inkeys    inkeys    (rmt:get-keys)))
	 (keyvals   (if inkeyvals inkeyvals (keys:target->keyval keys target)))
	 (vals      (hash-table-ref/default *env-vars-by-run-id* run-id #f))
	 (link-tree (common:get-linktree))) ;; (configf:lookup *configdat* "setup" "linktree")))
    (if testname (setenv "MT_TEST_NAME" testname))
    (if itempath (setenv "MT_ITEMPATH"  itempath))
    (if testname (set-environment-variable! "MT_TEST_NAME" testname))
    (if itempath (set-environment-variable! "MT_ITEMPATH"  itempath))

    ;; get the info from the db and put it in the cache
    (if link-tree
	(setenv "MT_LINKTREE" link-tree)
	(set-environment-variable! "MT_LINKTREE" link-tree)
	(debug:print-error 0 *default-log-port* "linktree not set, should be set in megatest.config in [setup] section."))
    (if (not vals)
	(let ((ht (make-hash-table)))
	  (hash-table-set! *env-vars-by-run-id* run-id ht)
	  (set! vals ht)
	  (for-each
	   (lambda (key)
	     (hash-table-set! vals (car key) (cadr key)))
	   keyvals)))
    ;; from the cached data set the vars
    
    (hash-table-for-each
     vals
     (lambda (key val)
       (debug:print 2 *default-log-port* "setenv " key " " val)
       (safe-setenv key val)))
    ;;(bb-check-path msg: "runs:set-megatest-env-vars block 1")
    ;;(BB> "*env-vars-by-run-id*/runid("run-id" vals="(hash-table->alist vals))

    (if (not (get-environment-variable "MT_TARGET"))(setenv "MT_TARGET" target))
    (if (not (get-environment-variable "MT_TARGET"))(set-environment-variable! "MT_TARGET" target))
    ;; we had a case where there was an exception generated by the hash-table-ref
    ;; due to *configdat* being #f Adding a handle and exit
    (let fatal-loop ((count 0)) 
      (handle-exceptions
	  exn
	  (let ((call-chain (get-call-chain))
		(msg        ((condition-property-accessor 'exn 'message) exn)))
5611
5612
5613
5614
5615
5616
5617
5618

5619
5620

5621
5622
5623


5624
5625
5626

5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638

5639
5640
5641
5642
5643
5644
5645
5612
5613
5614
5615
5616
5617
5618

5619
5620

5621
5622


5623
5624
5625
5626

5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638

5639
5640
5641
5642
5643
5644
5645
5646







-
+

-
+

-
-
+
+


-
+











-
+







              (thread-sleep! 2) ;; assuming nfs lag.
              (launch:setup force-reread: #t))
          (alist->env-vars (hash-table-ref/default *configdat* "env-override" '())))) ;;;; environment is tainted HERE in this let block.
    ;;(bb-check-path msg: "runs:set-megatest-env-vars block 2")
    ;; Lets use this as an opportunity to put MT_RUNNAME in the environment
    (let ((runname  (if inrunname inrunname (rmt:get-run-name-from-id run-id))))
      (if runname
	  (setenv "MT_RUNNAME" runname)
	  (set-environment-variable! "MT_RUNNAME" runname)
	  (debug:print-error 0 *default-log-port* "no value for runname for id " run-id)))
    (setenv "MT_RUN_AREA_HOME" *toppath*)
    (set-environment-variable! "MT_RUN_AREA_HOME" *toppath*)
    ;; if a testname and itempath are available set the remaining appropriate variables
    (if testname (setenv "MT_TEST_NAME" testname))
    (if itempath (setenv "MT_ITEMPATH"  itempath))
    (if testname (set-environment-variable! "MT_TEST_NAME" testname))
    (if itempath (set-environment-variable! "MT_ITEMPATH"  itempath))
    ;;(bb-check-path msg: "runs:set-megatest-env-vars block 3")
    (if (and testname link-tree)
	(setenv "MT_TEST_RUN_DIR" (conc (getenv "MT_LINKTREE")  "/"
	(set-environment-variable! "MT_TEST_RUN_DIR" (conc (getenv "MT_LINKTREE")  "/"
					(getenv "MT_TARGET")    "/"
					(getenv "MT_RUNNAME")   "/"
					(getenv "MT_TEST_NAME")
					(if (and itempath
						 (not (equal? itempath "")))
					    (conc "/" itempath)
					    ""))))))

(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
    (set-environment-variable! "MT_CMDINFO" encoded-cmd)
    ;;(bb-check-path msg: "launch:execute incoming")
    (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed)
	;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1))
	(let* ((testpath  (assoc/default 'testpath  cmdinfo))  ;; testpath is the test spec area
	       (top-path  (assoc/default 'toppath   cmdinfo))
	       (work-area (assoc/default 'work-area cmdinfo))  ;; work-area is the test run area
	       (test-name (assoc/default 'test-name cmdinfo))
5669
5670
5671
5672
5673
5674
5675
5676

5677
5678
5679
5680
5681
5682
5683
5670
5671
5672
5673
5674
5675
5676

5677
5678
5679
5680
5681
5682
5683
5684







-
+







	       (keyvals   #f)
	       (fullrunscript (if (not runscript)
                                  #f
                                  (if (substring-index "/" runscript)
                                      runscript ;; use unadultered if contains slashes
                                      (let ((fulln (conc work-area "/" runscript)))
	                                  (if (and (common:file-exists? fulln)
                                                   (file-execute-access? fulln))
                                                   (file-executable? fulln))
                                              fulln
                                              runscript))))) ;; assume it is on the path
               (check-work-area           (lambda ()
                                            ;; NFS might not have propagated the directory meta data to the run host - give it time if needed
                                            (let loop ((count 0))
                                              (if (or (common:directory-exists? work-area)
                                                      (> count 10))
5710
5711
5712
5713
5714
5715
5716
5717

5718
5719
5720
5721
5722


5723
5724
5725

5726
5727
5728
5729

5730
5731
5732
5733
5734


5735
5736
5737

5738
5739
5740
5741
5742
5743
5744
5711
5712
5713
5714
5715
5716
5717

5718
5719
5720
5721


5722
5723
5724
5725

5726
5727
5728
5729

5730
5731
5732
5733


5734
5735
5736
5737

5738
5739
5740
5741
5742
5743
5744
5745







-
+



-
-
+
+


-
+



-
+



-
-
+
+


-
+







                                                    (debug:print 0 *default-log-port* "INFO: test data from " testpath " not copied properly or filesystem problems causing data to not be found. Re-running the copy command.")
                                                    (debug:print 0 *default-log-port* "INFO: missing files from " work-area ": " (string-intersperse bad-files ", "))
                                                    (launch:test-copy testpath work-area))))
                                            ;; one more time, change to the work-area directory
                                            (change-directory work-area)))
	       ) ;; let*

	  (if contour (setenv "MT_CONTOUR" contour))
	  (if contour (set-environment-variable! "MT_CONTOUR" contour))
	  
	  ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ...
	  ;;
	  (setenv "MT_TESTSUITENAME" areaname)
	  (setenv "MT_RUN_AREA_HOME" top-path)
	  (set-environment-variable! "MT_TESTSUITENAME" areaname)
	  (set-environment-variable! "MT_RUN_AREA_HOME" top-path)
	  (set! *toppath* top-path)
          (change-directory *toppath*) ;; temporarily switch to the run area home
	  (setenv "MT_TEST_RUN_DIR"  work-area)
	  (set-environment-variable! "MT_TEST_RUN_DIR"  work-area)

	  (launch:setup) ;; should be properly in the run area home now

	  (if contour (setenv "MT_CONTOUR" contour))
	  (if contour (set-environment-variable! "MT_CONTOUR" contour))
	  
	  ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ...
	  ;;
	  (setenv "MT_TESTSUITENAME" areaname)
	  (setenv "MT_RUN_AREA_HOME" top-path)
	  (set-environment-variable! "MT_TESTSUITENAME" areaname)
	  (set-environment-variable! "MT_RUN_AREA_HOME" top-path)
	  (set! *toppath* top-path)
          (change-directory *toppath*) ;; temporarily switch to the run area home
	  (setenv "MT_TEST_RUN_DIR"  work-area)
	  (set-environment-variable! "MT_TEST_RUN_DIR"  work-area)

	  (launch:setup) ;; should be properly in the run area home now
          
	  (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path
	  (let ((sighand (lambda (signum)
			   ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
			   (if (eq? signum signal/stop)
5858
5859
5860
5861
5862
5863
5864
5865

5866
5867
5868
5869
5870
5871
5872
5873

5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889

5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908

5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923

5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935

5936
5937
5938
5939
5940
5941
5942
5859
5860
5861
5862
5863
5864
5865

5866
5867
5868
5869
5870
5871
5872
5873

5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889

5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908

5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923

5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935

5936
5937
5938
5939
5940
5941
5942
5943







-
+







-
+















-
+


















-
+














-
+











-
+







		(debug:print 4 *default-log-port* "varpairs: " varpairs)
		(map (lambda (varpair)
		       (let ((varval (string-split varpair "=")))
			 (if (eq? (length varval) 2)
			     (let ((var (car varval))
				   (val (cadr varval)))
			       (debug:print 1 *default-log-port* "Adding pre-var/val " var " = " val " to the environment")
			       (setenv var val)))))
			       (set-environment-variable! var val)))))
		     varpairs)))
          ;;(bb-check-path msg: "launch:execute post block 2")
	  (for-each
	   (lambda (varval)
	     (let ((var (car varval))
		   (val (cadr varval)))
	       (if val
		   (setenv var val)
		   (set-environment-variable! var val)
		   (begin
		     (debug:print-error 0 *default-log-port* "required variable " var " does not have a valid value. Exiting")
		     (exit)))))
	     (list 
	      (list  "MT_TEST_RUN_DIR" work-area)
	      (list  "MT_TEST_NAME" test-name)
	      (list  "MT_ITEM_INFO" (conc itemdat))
	      (list  "MT_ITEMPATH"  item-path)
	      (list  "MT_RUNNAME"   runname)
	      (list  "MT_MEGATEST"  megatest)
	      (list  "MT_TARGET"    target)
	      (list  "MT_LINKTREE"  (common:get-linktree)) ;; (configf:lookup *configdat* "setup" "linktree"))
	      (list  "MT_TESTSUITENAME" (common:get-testsuite-name))))
          ;;(bb-check-path msg: "launch:execute post block 3")

	  (if mt-bindir-path (setenv "PATH" (conc (getenv "PATH") ":" mt-bindir-path)))
	  (if mt-bindir-path (set-environment-variable! "PATH" (conc (getenv "PATH") ":" mt-bindir-path)))
          ;;(bb-check-path msg: "launch:execute post block 4")
	  ;; (change-directory top-path)
	  ;; Can setup as client for server mode now
	  ;; (client:setup)

	  
	  ;; environment overrides are done *before* the remaining critical envars.
	  (alist->env-vars env-ovrd)
          ;;(bb-check-path msg: "launch:execute post block 41")
	  (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals)
          ;;(bb-check-path msg: "launch:execute post block 42")
	  (set-item-env-vars itemdat)
          ;;(bb-check-path msg: "launch:execute post block 43")
          (let ((blacklist (configf:lookup *configdat* "setup" "blacklistvars")))
            (if blacklist
		(let ((vars (string-split blacklist)))
		  (save-environment-as-files "megatest" ignorevars: vars)
		  (for-each (lambda (var)
			      (unsetenv var))
			      (unset-environment-variable! var))
			    vars))
                (save-environment-as-files "megatest")))
          ;;(bb-check-path msg: "launch:execute post block 44")
	  ;; open-run-close not needed for test-set-meta-info
	  ;; (tests:set-full-meta-info #f test-id run-id 0 work-area)
	  ;; (tests:set-full-meta-info test-id run-id 0 work-area)
	  (tests:set-full-meta-info #f test-id run-id 0 work-area 10)

	  ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here

	  (if (args:get-arg "-xterm")
	      (set! fullrunscript "xterm")
	      (if (and fullrunscript 
		       (common:file-exists? fullrunscript)
		       (not (file-execute-access? fullrunscript)))
		       (not (file-executable? fullrunscript)))
		  (system (conc "chmod ug+x " fullrunscript))))

	  ;; We are about to actually kick off the test
	  ;; so this is a good place to remove the records for 
	  ;; any previous runs
	  ;; (db:test-remove-steps db run-id testname itemdat)
	  ;; now is also a good time to write the .testconfig file
	  (let* ((tconfig-fname   (conc work-area "/.testconfig"))
		 (tconfig-tmpfile (conc tconfig-fname ".tmp"))
		 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t))) ;; 'return-procs)))
	    (configf:write-alist tconfig tconfig-tmpfile)
	    (file-move tconfig-tmpfile tconfig-fname #t))
	    (move-file tconfig-tmpfile tconfig-fname #t))
	  ;; 
	  (let* ((m            (make-mutex))
		 (kill-job?    #f)
		 (exit-info    (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status
		 (job-thread   #f)
		 ;; (keep-going   #t)
		 (misc-flags   (let ((ht (make-hash-table)))
6193
6194
6195
6196
6197
6198
6199
6200

6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215

6216
6217
6218
6219
6220
6221
6222
6194
6195
6196
6197
6198
6199
6200

6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215

6216
6217
6218
6219
6220
6221
6222
6223







-
+














-
+







		  (set! *configinfo* first-pass)
		  (set! *toppath*    (or toppath (cadr first-pass))) ;; use the gathered data unless already have it
		  (set! toppath      *toppath*)
		  (if (not *toppath*)
		      (begin
			(debug:print-error 0 *default-log-port* "you are not in a megatest area!")
			(exit 1)))
		  (setenv "MT_RUN_AREA_HOME" *toppath*)
		  (set-environment-variable! "MT_RUN_AREA_HOME" *toppath*)
		  ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it
		  (let* ((keys         (rmt:get-keys))
			 (key-vals     (keys:target->keyval keys target))
			 (linktree     (common:get-linktree)) ;; (or (getenv "MT_LINKTREE")(if *configdat* (configf:lookup *configdat* "setup" "linktree") #f)))
					;     (if *configdat*
					; 	   (configf:lookup *configdat* "setup" "linktree")
					; 	   (conc *toppath* "/lt"))))
			 (second-pass  (configf:find-and-read-config
					mtconfig
					environ-patt: "env-override"
					given-toppath: toppath
					pathenvvar: "MT_RUN_AREA_HOME"))
			 (runconfigdat (begin     ;; this read of the runconfigs will see any adjustments made by re-reading megatest.config
					 (for-each (lambda (kt)
						     (setenv (car kt) (cadr kt)))
						     (set-environment-variable! (car kt) (cadr kt)))
						   key-vals)
					 (configf:read-config (conc toppath "/runconfigs.config") *runconfigdat* #t ;; consider using runconfig:read some day ...
						      sections: sections)))
                         (cachefiles   (launch:get-cache-file-paths areapath toppath target mtconfig))
                         (mtcachef     (car cachefiles))
                         (rccachef     (cdr cachefiles)))
                    ;;  trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "/p/fdk/gwa/lefkowit/mtTesting/qa/primbeqa/links/p1222/11/PDK_r1.1.1/prim/clean/pcell_testgen/.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342
6287
6288
6289
6290
6291
6292
6293
6294
6295


6296
6297
6298
6299
6300
6301
6302
6288
6289
6290
6291
6292
6293
6294


6295
6296
6297
6298
6299
6300
6301
6302
6303







-
-
+
+







			(create-symbolic-link linktree tlink)))))
	      (begin
		(debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config")
		)))
	(if (and *toppath*
		 (directory-exists? *toppath*))
	    (begin
	      (setenv "MT_RUN_AREA_HOME" *toppath*)
	      (setenv "MT_TESTSUITENAME" (common:get-testsuite-name)))
	      (set-environment-variable! "MT_RUN_AREA_HOME" *toppath*)
	      (set-environment-variable! "MT_TESTSUITENAME" (common:get-testsuite-name)))
	    (begin
	      (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.")
	      (set! *toppath* #f) ;; force it to be false so we return #f
	      #f))
	
        ;; one more attempt to cache the configs for future reading
        (let* ((cachefiles   (launch:get-cache-file-paths areapath toppath target mtconfig))
6320
6321
6322
6323
6324
6325
6326
6327

6328
6329
6330
6331
6332
6333
6334
6321
6322
6323
6324
6325
6326
6327

6328
6329
6330
6331
6332
6333
6334
6335







-
+







              )
          (if (and rccachef mtcachef *runconfigdat* *configdat*)
              (set! *configstatus* 'fulldata)))

	;; if have -append-config then read and append here
	(let ((cfname (args:get-arg "-append-config")))
	  (if (and cfname
		   (file-read-access? cfname))
		   (file-readable? cfname))
	      (configf:read-config cfname *configdat* #t))) ;; values are added to the hash, no need to do anything special.
	*toppath*)))

(define (get-best-disk confdat testconfig)
  (let* ((disks   (or (and testconfig (hash-table-ref/default testconfig "disks" #f))
		      (hash-table-ref/default confdat "disks" #f)))
	 (minspace (let ((m (configf:lookup confdat "setup" "minspace")))
7081
7082
7083
7084
7085
7086
7087
7088

7089
7090
7091
7092
7093
7094
7095
7096

7097
7098
7099

7100
7101
7102
7103
7104
7105

7106
7107
7108
7109
7110
7111
7112
7082
7083
7084
7085
7086
7087
7088

7089
7090
7091
7092
7093
7094
7095
7096

7097
7098
7099

7100
7101
7102
7103
7104
7105

7106
7107
7108
7109
7110
7111
7112
7113







-
+







-
+


-
+





-
+








  (let* ((rundir (if (and (getenv "MT_LINKTREE")(getenv "MT_TARGET")(getenv "MT_RUNNAME"))
		     (conc (getenv "MT_LINKTREE") "/" (getenv "MT_TARGET") "/" (getenv "MT_RUNNAME"))
		     #f))
	 (cfgf   (if rundir (conc rundir "/.runconfig." megatest-version "-" megatest-fossil-hash) #f)))
    (if (and cfgf
	     (common:file-exists? cfgf)
	     (file-write-access? cfgf)
	     (file-writable? cfgf)
	     (common:use-cache?))
	(configf:read-alist cfgf)
	(let* ((keys   (rmt:get-keys))
	       (target (common:args-get-target))
	       (key-vals (if target (keys:target->keyval keys target) #f))
	       (sections (if target (list "default" target) #f))
	       (data     (begin
			   (setenv "MT_RUN_AREA_HOME" *toppath*)
			   (set-environment-variable! "MT_RUN_AREA_HOME" *toppath*)
			   (if key-vals
			       (for-each (lambda (kt)
					   (setenv (car kt) (cadr kt)))
					   (set-environment-variable! (car kt) (cadr kt)))
					 key-vals))
			   ;; (read-config (conc *toppath* "/runconfigs.config") #f #t sections: sections))))
                           (runconfig:read (conc *toppath* "/runconfigs.config") target #f))))
	  (if (and rundir ;; have all needed variabless
		   (directory-exists? rundir)
		   (file-write-access? rundir))
		   (file-writable? rundir))
	      (begin
                (if (not (common:in-running-test?))
                    (configf:write-alist data cfgf))
		;; force re-read of megatest.config - this resolves circular references between megatest.config
		(launch:setup force-reread: #t)
		;; (launch:cache-config) ;; there are two independent config cache locations, turning this one off for now. MRW.
		)) ;; we can safely cache megatest.config since we have a valid runconfig