Megatest

Check-in [44919a2a7f]
Login
Overview
Comment:Try locking transactions in no-sync db
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-transaction-locking
Files: files | file ages | folders
SHA1: 44919a2a7f97a127682b09fd8a8e692d46b26f8d
User & Date: mrwellan on 2020-09-24 23:28:36
Other Links: branch diff | manifest | tags
Context
2020-09-25
21:24
Turn off WAL mode on no-sync db Closed-Leaf check-in: 300c036c99 user: mrwellan tags: v1.65-transaction-locking
2020-09-24
23:28
Try locking transactions in no-sync db check-in: 44919a2a7f user: mrwellan tags: v1.65-transaction-locking
15:50
Missed case of relying on run_duration field - fixed. ==/FAIL/orion,mars/== Closed-Leaf check-in: 08b1dfe720 user: mrwellan tags: v1.65-failed-testdat
Changes

Modified db.scm from [2c7b396933] to [a1e4ee1a15].

1778
1779
1780
1781
1782
1783
1784
1785

1786
1787
1788
1789
1790
1791
1792

1793
1794
1795
1796
1797
1798
1799
1800
1801


1802

1803
1804
1805
1806
1807
1808
1809
1778
1779
1780
1781
1782
1783
1784

1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804

1805
1806
1807
1808
1809
1810
1811
1812







-
+







+









+
+
-
+







        (with-input-from-file infile read-lines)
	)))

;; check duration against test-run.dat file if it exists and update the value in
;; the db if necessary
;;
(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
  (let* ((datf             (conc run-dir ".mt_data/test-run.dat"))
  (let* ((datf             (conc run-dir "/.mt_data/test-run.dat"))
	 (modt             (if (and (file-exists? datf)
				    (file-read-access? datf))
			       (file-modification-time datf)
			       #f)) ;; (+ event-time run-duration))))
	 (alt-run-duration (if modt
			       (- modt event-time)
			       #f)))
    (debug:print 0 *default-log-port* "Test " test-id " datf " datf " modt " modt " alt-run-duration " alt-run-duration)
    (if (and alt-run-duration
	     (> alt-run-duration run-duration))
	(begin
	  (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
	  (db:with-db
	   dbstruct #f #f
	   (lambda (db)
	     (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
	     #t)))
	(begin
	  (debug:print 0 *default-log-port* "Test " test-id " run duration correct. No adjustment needed.")
	#f))) ;; #f = we did NOT adjust the time
	  #f)))) ;; #f = we did NOT adjust the time
	      
(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
          ;; The default running-deadtime is 720 seconds = 12 minutes.
          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
4017
4018
4019
4020
4021
4022
4023
4024

4025
4026

4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037














4038
4039
4040
4041
4042
4043
4044
4020
4021
4022
4023
4024
4025
4026

4027
4028
4029
4030











4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051







-
+


+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+







    (db:with-db
     dbstruct #f #f
     (lambda (db)
       (let ((tr-res
              (sqlite3:with-transaction
               db
               (lambda ()
                 ;; NB// Pass the db so it is part fo the transaction
                 ;; NB// Pass the db so it is part of the transaction
                 (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status
                 (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
		     ;;                           item-path is used to exclude current state/status of THIS test
                     (let* ((state-status-counts  (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
			      						  (state-stauses (db:roll-up-rules state-status-counts state status))
                          (newstate (car state-stauses))
                          (newstatus (cadr state-stauses)))
                       (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
							(apply conc
                  (map (lambda (x)
                     (conc
                     		(with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
                              state-status-counts))); end debug:print
   
                     (let* ((state-status-counts  (db:get-all-state-status-counts-for-test
						   dbstruct run-id test-name item-path state status))
			    (state-stauses        (db:roll-up-rules state-status-counts state status))
                            (newstate             (car state-stauses))
                            (newstatus            (cadr state-stauses)))
                       (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name
				    ":"item-path" newstate="newstate" newstatus="newstatus
				    " len(sscs)="(length state-status-counts)  " state-status-counts: "
				    (apply conc
					   (map (lambda (x)
						  (conc
                     				   (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
						state-status-counts))); end debug:print
		       
                       (if tl-test-id
			   (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
		       ))))))
         (mutex-unlock! *db-transaction-mutex*)
         (if (and test-id state status (equal? status "AUTO")) 
             (db:test-data-rollup dbstruct run-id test-id status))
         tr-res)))))
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114



4115
4116

4117
4118
4119
4120
4121
4122
4123
4112
4113
4114
4115
4116
4117
4118



4119
4120
4121
4122

4123
4124
4125
4126
4127
4128
4129
4130







-
-
-
+
+
+

-
+







     dbstruct #f #f
     (lambda (db)
       (let ((tr-res
              (sqlite3:with-transaction
               db
               (lambda ()
                   (let* ((state-status-counts  (db:get-all-state-status-counts-for-run dbstruct run-id))
													(state-stauses (db:roll-up-rules state-status-counts #f #f ))
                          (newstate (car state-stauses))
                          (newstatus (cadr state-stauses))) 
			  (state-stauses        (db:roll-up-rules state-status-counts #f #f ))
                          (newstate             (car state-stauses))
                          (newstatus            (cadr state-stauses))) 
                    (if (or (not (eq? newstate curr-state)) (not (eq?  newstatus curr-status)))
                   (db:set-run-state-status dbstruct run-id newstate newstatus )))))))
			(db:set-run-state-status dbstruct run-id newstate newstatus )))))))
         (mutex-unlock! *db-transaction-mutex*)
         tr-res))))


(define (db:get-all-state-status-counts-for-run dbstruct run-id)
 (let* ((test-count-recs (db:with-db
                                  dbstruct #f #f

Modified launch.scm from [fb7acc7e32] to [363e2a7eb1].

103
104
105
106
107
108
109


110


111
112
113
114
115
116
117
103
104
105
106
107
108
109
110
111

112
113
114
115
116
117
118
119
120







+
+
-
+
+







  ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f)
  ;; Since we should have a clean slate at this time there is no need to do 
  ;; any of the other stuff that tests:test-set-status! does. Let's just 
  ;; force RUNNING/n/a

  ;; (thread-sleep! 0.3)
  ;; (tests:test-force-state-status! run-id test-id "RUNNING" "n/a")
  (common:soft-lock
   (lambda ()
  (rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f) 
     (rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f))
    "set-state-status-roll-up")
  ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here

  ;; if there is a runscript do it first
  (if fullrunscript
      (let ((pid (process-run fullrunscript)))
	(rmt:test-set-top-process-pid run-id test-id pid)
	(let loop ((i 0))
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360























1361
1362
1363
1364
1365
1366
1367
1354
1355
1356
1357
1358
1359
1360



1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390







-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







            #t)
           (else #f))))
    (when do-scan?
      (debug:print 1 *default-log-port* "INFO: search and mark zombie tests")
      (rmt:set-var key (current-seconds))
      (rmt:find-and-mark-incomplete run-id #f))))




;; weak locking - slow down concurrent activities but don't entirely stop them from
;; overlapping. wait for up to 5 seconds
;;
(define (common:soft-lock proc keyname #!key (wait-time 5))
  (let* ((start-time (current-seconds))
	 (my-key     (conc (current-process-id) "-" (get-host-name))))
    (let loop ((key-val (rmt:no-sync-get/default keyname #f)))
      (if (and key-val
	       (not (equal? key-val my-key))
	       (< (- (current-seconds) start-time) wait-time))
	  (begin
	    (thread-sleep! 0.5)
	    (debug:print-info 0 *default-log-port* "Still trying to lock for " keyname)
	    (loop (current-seconds))) ;; try again
	  (begin ;; either we got the lock or we timed out - proceed as if 
	    (rmt:no-sync-set keyname my-key)
	    (if (equal? (rmt:no-sync-get/default keyname #f) my-key)
		(begin
		  (debug:print-info 0 *default-log-port* "Got lock for " keyname)
		  (proc)
		  (rmt:no-sync-del! keyname)
		  (debug:print-info 0 *default-log-port* "Released lock for " keyname))
		(common:soft-lock proc keyname wait-time: wait-time)))))))

;; 1. look though disks list for disk with most space
;; 2. create run dir on disk, path name is meaningful
;; 3. create link from run dir to megatest runs area 
;; 4. remotely run the test on allocated host
;;    - could be ssh to host from hosts table (update regularly with load)
;;    - could be netbatch
1464
1465
1466
1467
1468
1469
1470


1471


1472
1473
1474
1475
1476
1477
1478
1487
1488
1489
1490
1491
1492
1493
1494
1495

1496
1497
1498
1499
1500
1501
1502
1503
1504







+
+
-
+
+







	    (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path)
	    (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record
      
      ;; prevent overlapping actions - set to LAUNCHED as early as possible
      ;;
      ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail
      (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED"))
      (common:soft-lock
       (lambda ()
      (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f)
	 (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f))
       "set-state-status-roll-up")
      ;; (pp (hash-table->alist tconfig))
      (set! diskpath (get-best-disk *configdat* tconfig))
      (if diskpath
	  (let ((dat  (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
	    (set! work-area (car dat))
	    (set! toptest-work-area (cadr dat))
	    (debug:print-info 2 *default-log-port* "Using work area " work-area))

Modified runs.scm from [e5ce9acbf0] to [11725dad72].

745
746
747
748
749
750
751

752

753
754
755
756
757
758
759
745
746
747
748
749
750
751
752

753
754
755
756
757
758
759
760







+
-
+







					    ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
					    (let ((run-ids (rmt:get-all-run-ids)))
					      (for-each (lambda (run-id)
							  (if keep-going
							      (handle-exceptions
							       exn
							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
							       (if (common:low-noise-print 900 (conc "mark-incomplete-" run-id))
							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
								   (rmt:find-and-mark-incomplete run-id #f))))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
							run-ids)))
					  "runs: mark-incompletes")))
	    ;; (thread-start! th1)
	    (thread-start! th2)
	    ;; (thread-join! th1)
	    ;; just do the main stuff in the main thread
	    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
1844
1845
1846
1847
1848
1849
1850

1851

1852
1853
1854
1855
1856
1857
1858
1845
1846
1847
1848
1849
1850
1851
1852

1853
1854
1855
1856
1857
1858
1859
1860







+
-
+







	    ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
	    ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
	    (if (> (current-seconds)(+ last-time-incomplete 900))
		(let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no
		  (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
				    ". Running as pid " (current-process-id) " on " (get-host-name))
		  (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
		  (if (common:low-noise-print 900 (conc "mark-incomplete-" run-id))
		  (rmt:find-and-mark-incomplete run-id #f)
		      (rmt:find-and-mark-incomplete run-id #f))
		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
				    (time->string (seconds->local-time (current-seconds))))))
	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
	    (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
		       num-running))))