Megatest

Diff
Login

Differences From Artifact [98ad71ee6e]:

To Artifact [e959ea01bb]:


250
251
252
253
254
255
256




257
258
259
260




261
262
263
264
265
266
267
268
269
270
271

272
273
274
275
276
277



278
279

280
281









282
283
284
285
286
287
288
250
251
252
253
254
255
256
257
258
259
260




261
262
263
264
265
266
267
268
269
270
271
272
273
274

275
276
277
278
279
280
281
282
283
284
285
286
287
288

289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304







+
+
+
+
-
-
-
-
+
+
+
+










-
+






+
+
+


+

-
+
+
+
+
+
+
+
+
+







                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time))

	     ;;
	     ;; MOVE THIS TO A FILE FLAG BASED APPROACH (FOR NOW)
	     ;;
	     
             (test-info   (rmt:get-test-state-status-by-id run-id test-id))
             (state       (car test-info));; (db:test-get-state test-info))
             (status      (cdr test-info));; (db:test-get-status test-info))
	     (killreq     (equal? state "KILLREQ"))
             ;; (test-info   (rmt:get-test-state-status-by-id run-id test-id))
             ;; (state       (car test-info));; (db:test-get-state test-info))
             ;; (status      (cdr test-info));; (db:test-get-status test-info))
	     (killreq     (file-exists? (conc work-area"/kill-test"))) ;; (equal? state "KILLREQ"))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         (killreq
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
         #;((equal? status "DEAD") ;; NEED ALTERNATIVE MECHANISM FOR THIS.
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)

	;; revisit logic in zombie handling.
	;;
        (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
	    (launch:handle-zombie-tests run-id))
	
        (when do-sync
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
          ;; (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)

	  (let ((oup (open-output-file (conc work-area"/.run-logging-stats.csv") :append))
		(csv (conc run-id","test-id","new-cpu-load","new-disk-free","(calc-minutes))))
	    (debug:print 0 *default-log-port* "Updating run log, csv="csv)
	    (with-output-to-port oup
	      (lambda ()
		(print csv)))
	    (close-output-port oup)))
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
369
370
371
372
373
374
375
376
377


378
379
380
381
382
383
384
385
386
387
388
389
390
391


392
393
394
395
396
397
398
399
400







-
-
+
+







     scripts)

    ;; extract logpro from testconfig and write them to files in test run dir
    (for-each
     (lambda (logprodat)
       (match logprodat
	      ((name content)
	       (debug:print-info 2 *default-log-port* "Creating logpro file "(current-directory)"/"name".logpro")
	       (with-output-to-file (conc name".logpro")
	       (debug:print-info 2 *default-log-port* "Creating logpro file "(current-directory)"/"name ".logpro")
               (with-output-to-file (conc name".logpro")
		 (lambda ()
		   (print content)
		   ;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))
		   )))
	      (else
	       (debug:print-info 0 "Invalid logpro definiton found in [logpro] section of testconfig. \"" logprodat "\""))))
     logpros)))
424
425
426
427
428
429
430

431








432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449



450
451
452
453
454
455
456
440
441
442
443
444
445
446
447

448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472

473
474
475
476
477
478
479
480
481
482







+
-
+
+
+
+
+
+
+
+

















-
+
+
+







                                      runscript ;; use unadultered if contains slashes
                                      (let ((fulln (conc work-area "/" runscript)))
	                                  (if (and (common:file-exists? fulln)
                                                   (file-execute-access? fulln))
                                              fulln
                                              runscript))))) ;; assume it is on the path
               (check-work-area           (lambda ()
					    ;;
                                            ;; NFS might not have propagated the directory meta data to the run host - give it time if needed
                                            ;; NFS might not have propagated the
                                            ;; directory meta data to the run host - give it
                                            ;; time if needed
					    ;;
					    ;; alternatively - if nonfs is set, find a working
					    ;; directory using [host-disks] and copy from
					    ;; the homehost using rsync
					    ;;
                                            (let loop ((count 0))
                                              (if (or (common:directory-exists? work-area)
                                                      (> count 10))
                                                  (change-directory work-area)
                                                  (begin
                                                    (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " work-area " not found")
                                                    (thread-sleep! 10)
                                                    (loop (+ count 1)))))

                                            (if (not (string=?  (common:real-path work-area)(common:real-path (current-directory))))
                                                (begin
                                                  (debug:print 0 *default-log-port*
                                                               "INFO: we are expecting to be in directory " work-area "\n"
                                                               "     but we are actually in the directory " (current-directory) "\n"
                                                               "     doing another change dir.")
                                                  (change-directory work-area)))
                                            
                                            ;; spot check that the files in testpath are available. Too often NFS delays cause problems here.
                                            ;; spot check that the files in testpath are available.
					    ;; Too often NFS delays cause problems here.
					    
                                            (let ((files      (glob (conc testpath "/*")))
                                                  (bad-files '()))
                                              (for-each
                                               (lambda (fullname)
                                                 (let* ((fname (pathname-strip-directory fullname))
                                                        (targn (conc work-area "/" fname)))
                                                   (if (not (file-exists? targn))
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581


582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618

619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648

649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738


739
740
741
742


743

744
745
746
747
748
749
750
751
752

753
754




755
756

757
758
759
760
761
762
763
764
765
766

767



768
769
770
771
772
773

774
775
776
777
778
779
780
781
782
783
784

785
786
787
788
789
790
791
792
793
794

795
796
797
798


799

800
801
802
803
804
805
806
807



808
809
810


811
812
813
814
815
816
817
818


819
820






821
822
823
824
825
826
827
589
590
591
592
593
594
595


596
597
598
599
600
601
602
603
604
605
606
607
608
609
610


611
612
613
614
615
616
617
618
619
620
621
622
623

624
625
626
627
628
629
630
631
632
633
634
635
636


637
638

639
640
641
642
643
644
645
646
647
648
649
650

651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667

668
669

670
671
672
673
674





675
676

677

678

679
680
681
682
683
684
685
686




687



688
689
690
691
692
693
694






























695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711

712
713
714
715
716
717
718
719

720
721
722
723
724
725
726
727
728

729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749

750
751
752
753
754
755
756
757

758

759
760
761
762
763
764
765
766
767

768





769
770
771
772

773

774
775
776
777
778

779
780
781
782
783
784
785


786
787
788
789


790
791


792
793
794
795


796
797


798
799
800
801
802
803
804
805
806
807
808
809
810







-
-










+
+



-
-













-













-
-


-
+











-

















-
+

-





-
-
-
-
-


-

-

-








-
-
-
-

-
-
-







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

















-
+
+




+
+
-
+








-
+


+
+
+
+


+










+
-
+
+
+





-
+
-









-
+
-
-
-
-
-




-
+
-



+
+
-
+






-
-
+
+
+

-
-
+
+
-
-




-
-
+
+
-
-
+
+
+
+
+
+







	  (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name))
	  (set! keys       (rmt:get-keys))
	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
	  ;; one of these is defunct/redundant ...
	  (if (not (launch:setup force-reread: #t))
	      (begin
		(debug:print 0 *default-log-port* "Failed to setup, exiting") 
		;; (sqlite3:finalize! db)
		;; (sqlite3:finalize! tdb)
		(exit 1)))
          ;; validate that the test run area is available
          (check-work-area)
          
          ;; still need to go back to run area home for next couple steps
	  (change-directory *toppath*) 

	  ;; NOTE: Current order is to process runconfigs *before* setting the MT_ vars. This 
	  ;;       seems non-ideal but could well break stuff
	  ;;    BUG? BUG? BUG?

	  ;; CHANGE THIS: Do NOT read the runconfigs again - ONLY use the cached version
	  
	  (let ((rconfig (full-runconfigs-read)) ;; (read-config (conc  *toppath* "/runconfigs.config") #f #t sections: (list "default" target))))
		(wconfig (read-config "waivers.config" #f #t sections: `( "default" ,target )))) ;; read the waivers config if it exists
	    ;; (setup-env-defaults (conc *toppath* "/runconfigs.config") run-id (make-hash-table) keyvals target)
	    ;; (set-run-config-vars run-id keyvals target) ;; (db:get-target db run-id))
	    ;; Now have runconfigs data loaded, set environment vars
	    (for-each
	     (lambda (section)
	       (for-each
		(lambda (varval)
		  (let ((var (car varval))
			(val (cadr varval)))
		    (if (and (string? var)(string? val))
			(begin
			  (safe-setenv var (configf:eval-string-in-environment val))) ;; val)
			(debug:print-error 0 *default-log-port* "bad variable spec, " var "=" val))))
		(configf:get-section rconfig section)))
	     (list "default" target)))
          ;;(bb-check-path msg: "launch:execute post block 1")

	  ;; NFS might not have propagated the directory meta data to the run host - give it time if needed
	  (let loop ((count 0))
	    (if (or (common:file-exists? work-area)
		    (> count 10))
		(change-directory work-area)
		(begin
		  (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " work-area " not found")
		  (thread-sleep! 10)
		  (loop (+ count 1)))))

          ;; now we can switch to the work-area?
          (change-directory work-area)
          ;;(bb-check-path msg: "launch:execute post block 1.5")
	  ;; (change-directory work-area) 
	  (set! keyvals    (keys:target->keyval keys target))
	  ;; apply pre-overrides before other variables. The pre-override vars must not
	  ;; clobbers things from the official sources such as megatest.config and runconfigs.config
	  ;; clobber things from the official sources such as megatest.config and runconfigs.config
	  (if (string? set-vars)
	      (let ((varpairs (string-split set-vars ",")))
		(debug:print 4 *default-log-port* "varpairs: " varpairs)
		(map (lambda (varpair)
		       (let ((varval (string-split varpair "=")))
			 (if (eq? (length varval) 2)
			     (let ((var (car varval))
				   (val (cadr varval)))
			       (debug:print 1 *default-log-port* "Adding pre-var/val " var " = " val " to the environment")
			       (setenv var val)))))
		     varpairs)))
          ;;(bb-check-path msg: "launch:execute post block 2")
	  (for-each
	   (lambda (varval)
	     (let ((var (car varval))
		   (val (cadr varval)))
	       (if val
		   (setenv var val)
		   (begin
		     (debug:print-error 0 *default-log-port* "required variable " var " does not have a valid value. Exiting")
		     (exit)))))
	     (list 
	      (list  "MT_TEST_RUN_DIR" work-area)
	      (list  "MT_TEST_NAME" test-name)
	      (list  "MT_ITEM_INFO" (conc itemdat))
	      (list  "MT_ITEMPATH"  item-path)
	      (list  "MT_RUNNAME"   runname)
	      (list  "MT_MEGATEST"  megatest)
	      (list  "MT_TARGET"    target)
	      (list  "MT_LINKTREE"  (common:get-linktree)) ;; (configf:lookup *configdat* "setup" "linktree"))
	      (list  "MT_LINKTREE"  (common:get-linktree))
	      (list  "MT_TESTSUITENAME" (common:get-testsuite-name))))
          ;;(bb-check-path msg: "launch:execute post block 3")

	  (let ((tmppath (getenv "PATH")))
	    (if (string-search tmppath " ")
		(debug:print 0 *default-log-port* "WARNING: spaces in PATH are not supported."))
	    (if mt-bindir-path (setenv "PATH" (conc tmppath":"mt-bindir-path))))
          ;;(bb-check-path msg: "launch:execute post block 4")
	  ;; (change-directory top-path)
	  ;; Can setup as client for server mode now
	  ;; (client:setup)
  
	  ;; environment overrides are done *before* the remaining critical envars.
	  (alist->env-vars env-ovrd)
          ;;(bb-check-path msg: "launch:execute post block 41")
	  (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals)
          ;;(bb-check-path msg: "launch:execute post block 42")
	  (set-item-env-vars itemdat)
          ;;(bb-check-path msg: "launch:execute post block 43")
          (let ((blacklist (configf:lookup *configdat* "setup" "blacklistvars")))
            (if blacklist
		(let ((vars (string-split blacklist)))
		  (save-environment-as-files "megatest" ignorevars: vars)
		  (for-each (lambda (var)
			      (unsetenv var))
			    vars))
                (save-environment-as-files "megatest")))
          ;;(bb-check-path msg: "launch:execute post block 44")
	  ;; open-run-close not needed for test-set-meta-info
	  ;; (tests:set-full-meta-info #f test-id run-id 0 work-area)
	  ;; (tests:set-full-meta-info test-id run-id 0 work-area)
	  (tests:set-full-meta-info #f test-id run-id 0 work-area 10)

	  ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here

	  (if (args:get-arg "-xterm")
	      (set! fullrunscript "xterm")
	      (if (and fullrunscript 
		       (common:file-exists? fullrunscript)
		       (not (file-execute-access? fullrunscript)))
		  (system (conc "chmod ug+x " fullrunscript))))
	  (launch:extract-scripts-logpro work-area test-name item-path tconfigreg)

;;;;;	  ;; We are about to actually kick off the test
;;;;;	  ;; so this is a good place to remove the records for 
;;;;;	  ;; any previous runs
;;;;;	  ;; (db:test-remove-steps db run-id testname itemdat)
;;;;;	  ;; now is also a good time to write the .testconfig file
;;;;;	  (let* ((tconfig-fname   (conc work-area "/.testconfig"))
;;;;;		 (tconfig-tmpfile (conc tconfig-fname ".tmp"))
;;;;;		 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
;;;;;		 (scripts         (configf:get-section tconfig "scripts"))
;;;;;		 (precmd          (configf:lookup tconfig )
;;;;;	    ;; create .testconfig file
;;;;;	    (configf:write-alist tconfig tconfig-tmpfile)
;;;;;	    (file-move tconfig-tmpfile tconfig-fname #t)
;;;;;	    (delete-file* ".final-status")
;;;;;
;;;;;	    ;; extract scripts from testconfig and write them to files in test run dir
;;;;;	    (for-each
;;;;;	     (lambda (scriptdat)
;;;;;	       (match scriptdat
;;;;;		      ((name content)
;;;;;		       (with-output-to-file name
;;;;;			 (lambda ()
;;;;;			   (print content)
;;;;;			   (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)))))
;;;;;		      (else
;;;;;		       (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
;;;;;	     scripts))
	  ;;

	  (let* ((m            (make-mutex))
		 (kill-job?    #f)
		 (exit-info    (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status
		 (job-thread   #f)
		 ;; (keep-going   #t)
		 (misc-flags   (let ((ht (make-hash-table)))
				 (hash-table-set! ht 'keep-going #t)
				 ht))
		 (runit        (lambda ()
				 (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m)))
		 (monitorjob   (lambda ()
				 (launch:monitor-job  run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)))
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job"))
                 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t))
                 (propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code"))
                 (propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED"))
                 (test-status "not set")
                 (test-status     "not set")
		 (test-state      "not set")
		 (precmd          (configf:lookup tconfig "setup" "precmd"))
		 (postcmd         (configf:lookup tconfig "setup" "postcmd")))
	    ;; first, if set, run the precmd
	    (if precmd ;; (file-exists? precmd)(file-execute-access? precmd))
		(begin
		  ;; (save-environment-as-files "precmd-envt")
		(system precmd)) ;; up to test author to put nbfake if desired.
		  (system precmd))) ;; up to test author to put nbfake if desired.
	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)
	    (debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...")
            (debug:print-info 2 *default-log-port* "exit-info = " exit-info)
	    (hash-table-set! misc-flags 'keep-going #f)
	    (thread-join! th1)
	    (thread-sleep! 1)       ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
	    (thread-sleep! 1)       ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
	    (mutex-lock! m)
	    (let* ((item-path (item-list->path itemdat))

		   ;; REMOVE this call and change this section to get killrequest from disk file
		   ;; FUTURE: the test will start a tcp server to listen for kill requests
		   
		   ;; only state and status needed - use lazy routine
		   (testinfo  (rmt:get-testinfo-state-status run-id test-id)))
	      
	      ;; Am I completed?
	      (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING"))
                 (let ((new-state  (if kill-job? "KILLED" "COMPLETED"))
		        (new-status (cond
				     ((not (launch:einf-exit-status exit-info)) "FAIL") ;; job failed to run ... (vector-ref exit-info 1)
				     ((eq? (launch:einf-rollup-status exit-info) 0)     ;; (vector-ref exit-info 3)
				      ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO)
				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS"))
				     ((eq? (launch:einf-rollup-status exit-info) 1) "FAIL")  ;; (vector-ref exit-info 3)
				     ((eq? (launch:einf-rollup-status exit-info) 2)	     ;;	(vector-ref exit-info 3)

				      ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN)
				      ;; if the current status is AUTO the defer to the calculated value but
				      ;; qualify (i.e. make this AUTO-WARN)
				      
				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN"))
				     ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK")
				     ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED")
				     ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT")
				     ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP")
				     (else "FAIL")))
				     (else "FAIL"))))
                        ) ;; (db:test-get-status testinfo)))
		    (debug:print-info 0 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info))
   
                    ;; Leave a .final-status file for each sub-test
                    (tests:save-final-status run-id test-id)

		    (tests:test-set-status! run-id 
					    test-id 
					    new-state
					    new-status
					    (args:get-arg "-m") #f)
					    (args:get-arg "-m") #f)))
		    ;; need to update the top test record if PASS or FAIL and this is a subtest
		    ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status!
		 )
              )


	      ;; for automated creation of the rollup html file this is a good place...
	      (if (not (equal? item-path ""))
		  (tests:summarize-items run-id test-id test-name #f))
	      ;; BUG was this meant to be the antecnt of the if above?
	      ;; BUG was this meant to be the antecedent of the if above?
	      ;; BUG was this meant to be the antecnt of the if above?
	      (tests:summarize-test run-id test-id)  ;; don't force - just update if no
              ;; Leave a .final-status file for the top level test
              (tests:save-final-status run-id test-id)

	      ;; WHAT IS THIS FOR? TWO CALLS BACK TO SERVER?
	      (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) ;; end of let*
	      (rmt:update-run-stats run-id 'run)) ;; (rmt:get-raw-run-stats run-id))) ;; end of let*

	    (mutex-unlock! m)
            (launch:end-of-run-check run-id )
	    (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " 
			 work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n")


            (set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id)))
	    (let* ((testrec  (rmt:get-testinfo-state-status run-id test-id)))
              (set! test-status (db:test-get-status testrec))
	      (set! test-state  (db:test-get-state  testrec)))

            ;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1.

            ;; If the propagate-exit-code option has been set in the megatest config
	    ;; and the test status matches the list, set the exit code to 1.
	    (if postcmd
		(system postcmd))

            (if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list))
               (begin
                (debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status) 
                (set! *globalexitstatus* 1)
               )
                (set! *globalexitstatus* 1)))

            )

	    (if postcmd
		(begin
		  (setenv "MT_TEST_STATE" test-state)
		  (setenv "MT_TEST_STATUS" test-status)
		  ;; (save-environment-as-files "postcmd-envt")
		  (system postcmd)))
	    (if (not (launch:einf-exit-status exit-info))
		(exit 4))))
        )))

;; Spec for End of test
;; At end of each test call, after marking self as COMPLETED do run-state-status-rollup
;; At transition to run COMPLETED/X do hooks
1498
1499
1500
1501
1502
1503
1504
1505






1506



1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518

1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532




1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
















1552
1553
1554
1555
1556
1557
1558
1481
1482
1483
1484
1485
1486
1487

1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508

1509












1510

1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523










1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546







-
+
+
+
+
+
+

+
+
+











-
+
-
-
-
-
-
-
-
-
-
-
-
-

-
+
+
+
+









-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







           (else #f))))
    (when do-scan?
      (debug:print 1 *default-log-port* "INFO: search and mark zombie tests")
      (rmt:set-var key (current-seconds))
      (rmt:find-and-mark-incomplete run-id #f))))



;; Record carrying per-launch data; launch-test creates one (bound to ajtdat)
;; and its comments call it the "adjutant record".
;; vars defaults to the empty list, every other field defaults to #f.
(defstruct launch:ajt
  ;; list of variable entries; extended through launch:ajt-add-vars
  (vars '())
  ;; NOTE(review): not populated in the visible code - confirm intended use
  (exekey #f)
  ;; NOTE(review): not populated in the visible code - confirm intended use
  (host-type #f)
  ;; NOTE(review): presumably the test signature string - confirm against callers
  (test-sig  #f)
  ;; NOTE(review): presumably the command line to run - confirm against callers
  (cmdline   #f))

;; Extend the vars field of the launch:ajt record `dat`: the entries in
;; `vars` are placed after the entries already stored, and the combined
;; list is written back into the record.
(define (launch:ajt-add-vars dat vars)
  (let* ((existing (launch:ajt-vars dat))
         (combined (append existing vars)))
    (launch:ajt-vars-set! dat combined)))

;; 1. look through disks list for disk with most space
;; 2. create run dir on disk, path name is meaningful
;; 3. create link from run dir to megatest runs area 
;; 4. remotely run the test on allocated host
;;    - could be ssh to host from hosts table (update regularly with load)
;;    - could be netbatch
;;      (launch-test db (cadr status) test-conf))
(define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
  (assert runname "FATAL: launch-test called with no runname")
  (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
  (let* ( ;; (lock-key        (conc "test-" test-id))
  (let* (;; locking code removed from here commented out and pasted at end of file
	;; (got-lock        (let loop ((lock        (rmt:no-sync-get-lock lock-key))
	;; 			     (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
	;; 		    (if (car lock)
	;; 			#t
	;; 			(if (> (current-seconds) expire-time)
	;; 			    (begin
	;; 			      (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path)
	;; 			      (rmt:no-sync-del! lock-key) ;; destroy the lock
	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; 
	;; 			    (begin
	;; 			      (thread-sleep! 1)
	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
	 (item-path       (item-list->path itemdat))
	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
	 (contour         #f)                         ;; NOT READY FOR THIS (args:get-arg "-contour")))
	 ;; launcher-mode will be 'adjutant or 'normal
	 (launcher-mode   (string->symbol (or (configf:lookup *configdat* "jobtools" "mode") "normal")))
	 (ajtdat          (make-launch:ajt)))
    (let loop ((delta        (- (current-seconds) *last-launch*))
	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0)))
      (if (> launch-delay delta)
	  (begin
	    ;; (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
	;;	(debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))
	    (thread-sleep! (- launch-delay delta))
	    (loop (- (current-seconds) *last-launch*) launch-delay))))
    (change-directory *toppath*)
    (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars)
     (append
      (list
       (list "MT_RUN_AREA_HOME" *toppath*)
       (list "MT_TEST_NAME" test-name)
       (list "MT_RUNNAME"   runname)
       (list "MT_ITEMPATH"  item-path)
       (list "MT_CONTOUR"   contour)
       )
      itemdat))
    (let ((var-list (append
		     (list
		      (list "MT_RUN_AREA_HOME" *toppath*)
		      (list "MT_TEST_NAME" test-name)
		      (list "MT_RUNNAME"   runname)
		      (list "MT_ITEMPATH"  item-path)
		      (list "MT_CONTOUR"   contour)
		      )
		     itemdat)))
       ;; consolidate this code with the code in megatest.scm for
       ;; "-execute", *maybe* - the longer they are set the longer
       ;; each launch takes (must be non-overlapping with the vars)
      (alist->env-vars var-list)
      ;; the var-list into the ajtdat adjutant record whether it is needed or not.
      (launch:ajt-add-vars ajtdat var-list))
    
    (let* ((tregistry       (tests:get-all)) ;; third param (below) is system-allowed
           ;; for tconfig, why do we allow fallback to test-conf?
	   (tconfig         (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t)
				(begin
                                  (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.")
                                  test-conf))) ;; force re-read now that all vars are set
	   (useshell        (let ((ush (configf:lookup *configdat* "jobtools"     "useshell")))
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582



1583
1584
1585
1586
1587
1588



1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601









1602
1603
1604
1605
1606

1607
1608
1609
1610
1611
1612
1613
1554
1555
1556
1557
1558
1559
1560




1561





1562
1563
1564






1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593

1594
1595
1596
1597
1598
1599
1600
1601







-
-
-
-

-
-
-
-
-
+
+
+
-
-
-
-
-
-
+
+
+













+
+
+
+
+
+
+
+
+




-
+







	   (subrun          (> (length (hash-table-ref/default tconfig "subrun"  '())) 0)) ;; send a flag to process a subrun
	   ;; (diskspace       (configf:lookup tconfig   "requirements" "diskspace"))
	   ;; (memory          (configf:lookup tconfig   "requirements" "memory"))
	   ;; (hosts           (configf:lookup *configdat* "jobtools"     "workhosts")) ;; I'm pretty sure this was never completed
	   (remote-megatest (configf:lookup *configdat* "setup" "executable"))
	   (run-time-limit  (or (configf:lookup  tconfig   "requirements" "runtimelim")
				(configf:lookup  *configdat* "setup" "runtimelim")))
	   ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to 
	   ;;                allow running from dashboard. Extract the path
	   ;;                from the called megatest and convert dashboard
	   ;;             	  or dboard to megatest
	   (local-megatest  (common:find-local-megatest))
	   #;(local-megatest  (let* ((lm  (car (argv)))
				   (dir (pathname-directory lm))
				   (exe (pathname-strip-directory lm)))
			      (conc (if dir (conc dir "/") "")
				    (case (string->symbol exe)
	   (launcher        (let ((l (common:get-launcher *configdat* test-name item-path launcher-mode)))
			      (if (string? l)
				  (string-split l)
				      ((dboard)    "../megatest")
				      ((mtest)     "../megatest")
				      ((dashboard) "megatest")
				      (else exe)))))
	   (launcher        (common:get-launcher *configdat* test-name item-path)) ;; (configf:lookup *configdat* "jobtools"     "launcher"))
	   (test-sig        (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path
				  l))) ;; some nonhomogenuity here. '(cmd param1 param2 ...) OR '(host-type launcher)
	    ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path
	   (test-sig        (conc (common:get-testsuite-name) ":" test-name ":" item-path))
	   (work-area       #f)
	   (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all
	   (diskpath   #f)
	   (cmdparms   #f)
	   (fullcmd    #f) ;; (define a (with-output-to-string (lambda ()(write x))))
	   (mt-bindir-path #f)
	   (testinfo   (rmt:get-test-info-by-id run-id test-id))
	   (mt_target  (string-intersperse (map cadr keyvals) "/"))
	   (debug-param (append (if (args:get-arg "-debug")  (list "-debug" (args:get-arg "-debug")) '())
				(if (args:get-arg "-logging")(list "-logging") '())
				(if (configf:lookup *configdat* "misc" "profilesw")
				    (list (configf:lookup *configdat* "misc" "profilesw"))
				    '()))))
      ;; save the test-sig in the ajtdat record
      (launch:ajt-test-sig-set! ajtdat test-sig)
      ;; go ahead and figure out if we have a host-type from the
      ;; launcher call above and save it in the ajtdat record
      (if (and (eq? launcher-mode 'adjutant)
	       (list? launcher)
	       (> (length launcher) 1))
	  (launch:ajt-host-type-set! ajtdat (car launcher)))
 
      ;; (if hosts (set! hosts (string-split hosts)))
      ;; set the megatest to be called on the remote host
      (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest"))
      (set! mt-bindir-path (pathname-directory remote-megatest))
      (if launcher (set! launcher (string-split launcher)))
      ;; (if launcher (set! launcher (string-split launcher)))           ;; yuk!
      ;; set up the run work area for this test
      (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run
	       (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir
	  (begin
	    (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path)
	    (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record
      
1657
1658
1659
1660
1661
1662
1663


1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674




1675

1676
1677
1678
1679
1680
1681
1682

1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696








1697
1698
1699
1700





1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714















1715
1716
























1717

1718
1719
1720
1721
1722
1723
1724
1725
1726
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661



1662
1663
1664
1665

1666
1667
1668

1669
1670
1671

1672
1673
1674
1675
1676
1677









1678
1679
1680
1681
1682
1683
1684
1685




1686
1687
1688
1689
1690
1691
1692












1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733

1734
1735

1736
1737
1738
1739
1740
1741
1742







+
+








-
-
-
+
+
+
+
-
+


-



-
+





-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+


-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+

-







					(list 'target    mt_target)
					(list 'contour   contour)
					(list 'runtlim   (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
					(list 'env-ovrd  (hash-table-ref/default *configdat* "env-override" '())) 
					(list 'set-vars  (if params (hash-table-ref/default params "-setvars" #f)))
					(list 'runname   runname)
					(list 'mt-bindir-path mt-bindir-path))))))))
      ;; save the cmdparms in the ajtdat
      (launch:ajt-exekey-set! ajtdat cmdparms)

        (setenv "MT_CMDINFO" cmdparms)  ;; setting this for use in nblauncher
      
      ;; clean out step records from previous run if they exist
      ;; (rmt:delete-test-step-records run-id test-id)
      ;; if the dir does not exist we may have an itempath where individual variables are a path, launch anyway
      (if (common:file-exists? work-area)
	  (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir
      (cond
       ;; ((and launcher hosts) ;; must be using ssh hostname
       ;;    (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))

      ;; save the command line for adjutant mode (might never be needed but best to assemble it here)
      (launch:ajt-cmdline-set! ajtdat (string-intersperse
				       (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
       ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms))))
      (cond       
       (launcher
	(set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
       ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms))))
       (else
	(if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section"))
	(set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" ""))))))
      ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" "")))))
      
      (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm"))))
      (debug:print 1 *default-log-port* "Launching " work-area)
      ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the environment back when done
      (debug:print 4 *default-log-port* "fullcmd: " fullcmd)
      (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible.
      (let* ((commonprevvals (alist->env-vars
			      (hash-table-ref/default *configdat* "env-override" '())))
	     (miscprevvals   (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
			      (append (list (list "MT_TEST_RUN_DIR" work-area)
					    (list "MT_TEST_NAME" test-name)
					    (list "MT_ITEM_INFO" (conc itemdat)) 
					    (list "MT_RUNNAME"   runname)
					    (list "MT_TARGET"    mt_target)
					    (list "MT_ITEMPATH"  item-path)
      (let* ((env-override-vars  (hash-table-ref/default *configdat* "env-override" '()))
	     (commonprevvals     (alist->env-vars env-override-vars))
	     (misc-vars          (append (list (list "MT_TEST_RUN_DIR" work-area)
					       (list "MT_TEST_NAME" test-name)
					       (list "MT_ITEM_INFO" (conc itemdat)) 
					       (list "MT_RUNNAME"   runname)
					       (list "MT_TARGET"    mt_target)
					       (list "MT_ITEMPATH"  item-path))
					    )
				      itemdat)))
	     (testprevvals   (alist->env-vars
			      (hash-table-ref/default tconfig "pre-launch-env-overrides" '())))
					 itemdat))
	     (miscprevvals   (alist->env-vars misc-vars));; consolidate this code with the code in megatest.scm for "-execute"
	     (test-vars      (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))
	     (testprevvals   (alist->env-vars test-vars))
			      
	     ;; Launchwait defaults to true, must override it to turn off wait
	     (launchwait     (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t))
	     (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed.
					     process:cmd-run-with-stderr-and-exitcode->list
					     process-run)
					 (if useshell
					     (let ((cmdstr (string-intersperse fullcmd " ")))
					       (if launchwait
						   cmdstr
						   (conc cmdstr " >> mt_launch.log 2>&1 &")))
					     (car fullcmd))
					 (if useshell
					     '()
					     (cdr fullcmd))))
	     ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed.
	     (launch-results-prev (if (eq? launcher-mode 'adjutant)
				      '(#t 0) ;; just some fake data to fool downstream but non-applicable code
				      (apply (if launchwait
						 process:cmd-run-with-stderr-and-exitcode->list
						 process-run)
					     (if useshell
						 (let ((cmdstr (string-intersperse fullcmd " ")))
						   (if launchwait
						       cmdstr
						       (conc cmdstr " >> mt_launch.log 2>&1 &")))
						 (car fullcmd))
					     (if useshell
						 '()
						 (cdr fullcmd)))))
             (success        (if launchwait (equal? 0 (cadr launch-results-prev)) #t))
             (launch-results (if launchwait (car launch-results-prev) launch-results-prev)))

	(launch:ajt-add-vars ajtdat env-override-vars)
	(launch:ajt-add-vars ajtdat misc-vars)
	(launch:ajt-add-vars ajtdat test-vars)

	;; if in adjutant mode we register the job in the jobs_queue
	;; then fire off an adjutant runner
	;;
	(if (eq? launcher-mode 'adjutant)
	    (let* ((adjutant-runner-cmd (append (cdr launcher)
						(list remote-megatest "-adjutant"
						      (launch:ajt-host-type ajtdat)
						      "-start-dir" *toppath*)))
		   (adj-cmd     (conc (string-intersperse (map conc adjutant-runner-cmd) " ")
				      "&")))         
	      (rmt:no-sync-add-job
	       (launch:ajt-host-type  ajtdat)
	       (launch:ajt-vars ajtdat)
	       (launch:ajt-exekey     ajtdat)
	       (launch:ajt-cmdline    ajtdat))
	      (print "adj-cmd: " adj-cmd)
	      (system adj-cmd)
	      ))
	
        (if (not success)
	(if (not success)
            (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED"))
        (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.
	;; (rmt:no-sync-del! lock-key)         ;; release the lock for starting this test
	(if (not launchwait) ;; give the OS a little time to allow the process to start
	    (thread-sleep! 0.01))
	(with-output-to-file "mt_launch.log"
	  (lambda ()
	    (print "LAUNCHCMD: " (string-intersperse fullcmd " "))
	    (if (list? launch-results)
1738
1739
1740
1741
1742
1743
1744




1745
1746
1747
1748
1749
1750
1751
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771







+
+
+
+







	      ;; but this hack will work! Thanks go to Alan Post of the Chicken email list
	      ;; NB// Is this still needed? Should be safe to go back to "exit" now?
	      (process-signal (current-process-id) signal/kill)
	      ))
	(alist->env-vars miscprevvals)
	(alist->env-vars testprevvals)
	(alist->env-vars commonprevvals)
	;; yes, really should mutex all the way to here. Need to put this entire process into a fork.
	;; the unlock previously was further up. This seemed wrong as we should not proceed until the
	;; vars have been reset.
	(mutex-unlock! *launch-setup-mutex*)
	launch-results))
    (change-directory *toppath*)
    (thread-sleep! (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.0))))

;; recover a test where the top controlling mtest may have died
;;
(define (launch:recover-test run-id test-id)
1767
1768
1769
1770
1771
1772
1773
















1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
		     (read-symbolic-link (conc "/proc/" pid "/cwd"))
		     #f)))
    ;; now wait on that process if all is correct
    ;; periodically update the db with runtime
    ;; when the process exits look at the db, if still RUNNING after 10 seconds set
    ;; state/status appropriately
    (process-wait pid)))


 ;; (lock-key        (conc "test-" test-id))
	;; (got-lock        (let loop ((lock        (rmt:no-sync-get-lock lock-key))
	;; 			     (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
	;; 		    (if (car lock)
	;; 			#t
	;; 			(if (> (current-seconds) expire-time)
	;; 			    (begin
	;; 			      (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path)
	;; 			      (rmt:no-sync-del! lock-key) ;; destroy the lock
	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; 
	;; 			    (begin
	;; 			      (thread-sleep! 1)
	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time))))))