Megatest

Changes On Branch 2c853b3d8d727513
Login

Changes In Branch v1.64-farmedout-runtest Through [2c853b3d8d] Excluding Merge-Ins

This is equivalent to a diff from 5be250d6fc to 2c853b3d8d

2018-03-29
16:43
error check MTESTHASH check-in: cb3bbc9d2e user: bjbarcla tags: v1.64
2018-02-06
18:33
wip check-in: 8f16df638a user: bb tags: v1.64-forked-launch
12:34
instrumented runtestsqueue and test launcher for profiling Leaf check-in: 9b8a609dee user: bjbarcla tags: v1.64-instrumented-runtest
2018-02-02
17:54
wip check-in: d3905e798c user: bjbarcla tags: v1.64-farmedout-runtest
17:28
wip check-in: 2c853b3d8d user: bjbarcla tags: v1.64-farmedout-runtest
2018-01-17
21:03
Merged in some of Jeff's changes to Makefile.deploy check-in: 6275b9b5c5 user: matt tags: v1.65
2017-12-14
13:32
Updated deploy Makefile check-in: 5be250d6fc user: jmoon18 tags: v1.64
2017-12-12
14:34
updated itemmap section in manual; hopefully it is clearer now check-in: c607976150 user: bjbarcla tags: v1.64

Modified megatest.scm from [4ccc1620b9] to [50f303f608].

543
544
545
546
547
548
549
550

551
552
553
554
555
556
557
543
544
545
546
547
548
549

550
551
552
553
554
555
556
557







-
+







					    (printf "Sending signal/term to ~A\n" pid)
					    (process-signal pid signal/term))))))
		       (process:children #f))
		      (original-exit exit-code)))))

;; for some switches always print the command to stderr
;;
(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status")
(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-run-test")
    (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))

;; some switches imply homehost. Exit here if not on homehost
;;
(let ((homehost-required  (list "-cleanup-db" "-server")))
  (if (apply args:any? homehost-required)
      (if (not (common:on-homehost?))
1555
1556
1557
1558
1559
1560
1561
1562


















1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580

1581
1582
1583
1584
1585
1586
1587
1555
1556
1557
1558
1559
1560
1561

1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596

1597
1598
1599
1600
1601
1602
1603
1604







-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

















-
+







;; 	 (launch:setup))
;;     (let* ((since-time (string->number (args:get-arg "-since")))
;; 	   (run-ids    (db:get-changed-run-ids since-time)))
;;       ;; (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)
;;       (print (sort run-ids <))
;;       (set! *didsomething* #t)))
      
      

;;======================================================================
;; run single test
;;======================================================================
;; launch test; executed from runloop
;; should only be called by megatest, not user.
(if (args:get-arg "-internal-run-test")
    (general-run-call
     "-internal-run-test"
     "run single test; internal use only"
     (lambda (target runname keys keyvals)
       (let* ((flags args:arg-hash))
         (run:test-standalone target runname keys keyvals flags)


         )
       )))

;;======================================================================
;; full run
;;======================================================================

;; get lock in db for full run for this directory
;; for all tests with deps
;;   walk tree of tests to find head tasks
;;   add head tasks to task queue
;;   add dependant tasks to task queue 
;;   add remaining tasks to task queue
;; for each task in task queue
;;   if have adequate resources
;;     launch task
;;   else
;;     put task in deferred queue
;; if still ok to run tasks
;;   process deferred tasks per above steps

     
;; run all tests are are Not COMPLETED and PASS or CHECK
(if (or (args:get-arg "-runall")
	(args:get-arg "-run")
	(args:get-arg "-rerun-clean")
	(args:get-arg "-rerun-all")
	(args:get-arg "-runtests"))
    (let ((need-clean (or (args:get-arg "-rerun-clean")

Modified runs.scm from [cd4ddfd17d] to [6b5db4b196].

331
332
333
334
335
336
337
338


339
340
341
342
343
344
345
331
332
333
334
335
336
337

338
339
340
341
342
343
344
345
346







-
+
+







	 (test-names         #f)  ;; Generated by a call to (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;; Put fully qualified test/testpath names in this list to be done
         (waitors-upon       (make-hash-table)) ;; given a test, return list of tests waiting upon this test.
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 ;; (tdbdat             (tasks:open-db))
	 (config-reruns      (let ((x (configf:lookup *configdat* "setup" "reruns")))
			       (if x (string->number x) #f)))
	 (allowed-tests      #f))
	 (allowed-tests      #f)
         (last-loop-top-time (current-seconds)))

    ;; check if readonly
    (when readonly-mode
      (debug:print-error 0 *default-log-port* "megatest.db is readonly.  Cannot proceed.")
      (exit 1))

    ;; per user request. If less than 100Meg space on dbdir partition, bail out with error
482
483
484
485
486
487
488

489




490
491
492
493
494
495
496
483
484
485
486
487
488
489
490

491
492
493
494
495
496
497
498
499
500
501







+
-
+
+
+
+







	  (setenv "MT_TEST_NAME" hed) ;; 
	  (let*-values (((waitons waitors config) (tests:get-waitons hed all-tests-registry))
                        ((hed-mode)
                         (let ((m (config-lookup config "requirements" "mode")))
                           (if m (map string->symbol (string-split m)) '(normal))))
                        ((hed-itemized-waiton) ;; are items in hed waiting on items of waiton?
                         (not (null? (lset-intersection eq? hed-mode '(itemmatch itemwait)))))
                        ((loop-delta-time) (- (current-seconds) last-loop-top-time))
                        )
                        
                        )
            (set! last-loop-top-time (current-seconds))
            (BB> "RUNLOOPTOP | cycletime="loop-delta-time" depth="(add1 (length tal))" hed="hed)
	    (debug:print-info 8 *default-log-port* "waitons: " waitons)
	    ;; check for hed in waitons => this would be circular, remove it and issue an
	    ;; error
	    (if (or (member hed waitons)
		    (member hed waitors))
		(begin
		  (debug:print-error 0 *default-log-port* "test " hed " has listed itself as a waiton or waitor, please correct this!")
1237
1238
1239
1240
1241
1242
1243
1244

1245
1246
1247
1248
1249
1250
1251
1242
1243
1244
1245
1246
1247
1248

1249
1250
1251
1252
1253
1254
1255
1256







-
+







  ;; At this point the list of parent tests is expanded 
  ;; NB// Should expand items here and then insert into the run queue.
  (debug:print 5 *default-log-port* "test-records: " test-records ", flags: " (hash-table->alist flags))

  ;; Do mark-and-find clean up of db before starting runing of quue
  ;;
  ;; (rmt:find-and-mark-incomplete)

  (BB> "entered run-tests-queue")
  (let* ((run-info             (rmt:get-run-info run-id))
	(tests-info            (mt:get-tests-for-run run-id #f '() '())) ;;  qryvals: "id,testname,item_path"))
	(sorted-test-names     (tests:sort-by-priority-and-waiton test-records)) 
	(test-registry         (make-hash-table))
	(registry-mutex        (make-mutex))
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
1278
1279
1280
1281
1282
1283
1284
1285



1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303



1304
1305
1306
1307
1308
1309
1310
1283
1284
1285
1286
1287
1288
1289

1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309

1310
1311
1312
1313
1314
1315
1316
1317
1318
1319







-
+
+
+

















-
+
+
+







		  keyvals: keyvals
		  run-info: run-info
		  ;; newtal: newtal
		  all-tests-registry: all-tests-registry
		  ;; itemmaps: itemmaps
		  ;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)
		  ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running
		  )))
		  ))

        (rtq-looptop-lastvisit-time (current-seconds)))

    ;; Initialize the test-registery hash with tests that already have a record
    ;; convert state to symbol and use that as the hash value
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))
		      (st (db:test-get-state     trec)))
		  (if (not (equal? st "DELETED"))
		      (hash-table-set! test-registry (db:test-make-full-name tn ip) (string->symbol st)))))
	      tests-info)
    (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100))

    (let loop ((hed         (car sorted-test-names))
	       (tal         (cdr sorted-test-names)) 
	       (reg         '()) ;; registered, put these at the head of tal 
	       (reruns      '()))

      (let ((cycletime (- (current-seconds) rtq-looptop-lastvisit-time)))
        (BB> "RTQ LOOP TOP | cycletime="cycletime"| qdepth="(add1 (length tal))" hed="hed))
      (set! rtq-looptop-lastvisit-time (current-seconds))
      (runs:incremental-print-results run-id)

      (if (not (null? reruns))(debug:print-info 4 *default-log-port* "reruns=" reruns))

      ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
      ;; moving this to a parallel thread and just run it once.
      ;;
1600
1601
1602
1603
1604
1605
1606










1607
1608
1609

1610

1611
1612
1613
1614
1615
1616
1617



1618
1619
1620
1621
1622
1623
1624
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635



1636
1637
1638
1639
1640
1641
1642
1643
1644
1645







+
+
+
+
+
+
+
+
+
+



+

+




-
-
-
+
+
+







(define (runs:pretty-string lst)
  (map (lambda (t)
	 (if (not (vector? t))
	     (conc t)
	     (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t))))
       lst))




(define (run:test-standalone target runname keys keyvals flags)
  (let* ((run-id ...)
         (run-info ...)
         

  )

;; parent-test is there as a placeholder for when parent-tests can be run as a setup step
;;
(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry)
  ;; all-tests-registry - used to determine test path
  ;; All these vars might be referenced by the testconfig file reader
  ;; flags are a hash of k/v pairs that could become commandline switches to a standalone version
  (let* ((test-name    (tests:testqueue-get-testname   test-record))
	 (test-waitons (tests:testqueue-get-waitons    test-record))
	 (test-conf    (tests:testqueue-get-testconfig test-record))
	 (itemdat      (tests:testqueue-get-itemdat    test-record))
	 (test-path    (hash-table-ref all-tests-registry test-name)) ;; (conc *toppath* "/tests/" test-name)) ;; could use tests:get-testconfig here ...
	 (force        (hash-table-ref/default flags "-force" #f))
	 (rerun        (hash-table-ref/default flags "-rerun" #f))
	 (test-path    (hash-table-ref all-tests-registry test-name)) ;; path to test definition (conc *toppath* "/tests/" test-name)) ;; could use tests:get-testconfig here ...
	 (force        (hash-table-ref/default flags "-force" #f)) ;; ignore some red flags; should be removed
	 (rerun        (hash-table-ref/default flags "-rerun" #f)) ;; used to decide to rerun if test already exists, always rerun (pretend not started) 
	 (keepgoing    (hash-table-ref/default flags "-keepgoing" #f))
	 (incomplete-timeout (string->number (or (configf:lookup *configdat* "setup" "incomplete-timeout") "x")))
	 (item-path     "")
	 (db           #f)
	 (full-test-name #f))

    ;; setting itemdat to a list if it is #f
1647
1648
1649
1650
1651
1652
1653
1654

1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670

1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686


1687

1688
1689
1690
1691
1692

1693
1694
1695
1696
1697
1698
1699
1668
1669
1670
1671
1672
1673
1674

1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690

1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709

1710
1711
1712
1713
1714

1715
1716
1717
1718
1719
1720
1721
1722







-
+















-
+
















+
+
-
+




-
+







    (if (not (hash-table-ref/default *test-meta-updated* test-name #f))
        (begin
	   (hash-table-set! *test-meta-updated* test-name #t)
           (runs:update-test_meta test-name test-conf)))
    
    ;; itemdat => ((ripeness "overripe") (temperature "cool") (season "summer"))
    (let* ((new-test-path (string-intersperse (cons test-path (map cadr itemdat)) "/"))
	   (test-id       (rmt:get-test-id run-id test-name item-path))
	   (test-id       (rmt:get-test-id run-id test-name item-path)) ;; ?? necessary?
	   (testdat       (if test-id (rmt:get-test-info-by-id run-id test-id) #f)))
      (if (not testdat)
	  (let loop ()
	    ;; ensure that the path exists before registering the test
	    ;; NOPE: Cannot! Don't know yet which disk area will be assigned....
	    ;; (system (conc "mkdir -p " new-test-path))
	    ;;
	    ;; (open-run-close tests:register-test db run-id test-name item-path)
	    ;;
	    ;; NB// for the above line. I want the test to be registered long before this routine gets called!
	    ;;
	    (if (not test-id)(set! test-id (rmt:get-test-id run-id test-name item-path)))
	    (if (not test-id)
		(begin
		  (debug:print 2 *default-log-port* "WARN: Test not pre-created? test-name=" test-name ", item-path=" item-path ", run-id=" run-id)
		  (rmt:register-test run-id test-name item-path)
		  (rmt:register-test run-id test-name item-path) ;; 
		  (set! test-id (rmt:get-test-id run-id test-name item-path))))
	    (debug:print-info 4 *default-log-port* "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"")
	    (set! testdat (rmt:get-test-info-by-id run-id test-id))
	    (if (not testdat)
		(begin
		  (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in one second")
		  (thread-sleep! 1)
		  (loop)))))
      (if (not testdat) ;; should NOT happen
	  (debug:print-error 0 *default-log-port* "failed to get test record for test-id " test-id))
      (set! test-id (db:test-get-id testdat))
      (if (common:file-exists? test-path)
	  (change-directory test-path)
	  (begin
	    (debug:print-error 0 *default-log-port* "test run path not created before attempting to run the test. Perhaps you are running -remove-runs at the same time?")
	    (change-directory *toppath*)))


      (case (if force ;; (args:get-arg "-force")
      (case (if force ;; (args:get-arg "-force") 
		'NOT_STARTED
		(if testdat
		    (string->symbol (test:get-state testdat))
		    'failed-to-insert))
	((failed-to-insert)
	((failed-to-insert) ;; should never happen now
	 (debug:print-error 0 *default-log-port* "Failed to insert the record into the db"))
	((NOT_STARTED COMPLETED DELETED INCOMPLETE)
	 (let ((runflag #f))
	   (cond
	    ;; -force, run no matter what
	    (force (set! runflag #t))
	    ;; NOT_STARTED, run no matter what
1761
1762
1763
1764
1765
1766
1767
1768

1769
1770
1771
1772
1773
1774
1775
1784
1785
1786
1787
1788
1789
1790

1791
1792
1793
1794
1795
1796
1797
1798







-
+







			     (> numseconds time-since-last))
			 (set! skip-test (conc "Skipping due to previous test run less than " (configf:lookup test-conf "skip" "rundelay") " ago"))))))
		 
		 (if skip-test
		     (begin
		       (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test)
		       (debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test))
		     (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
		     (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) ;; HERE WE GO
			 (begin
			   (print "ERROR: Failed to launch the test. Exiting as soon as possible")
			   (set! *globalexitstatus* 1) ;; 
			   (process-signal (current-process-id) signal/kill))))))))
	((KILLED) 
	 (debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.")
	 (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED))