Changes In Branch v1.65-transaction-locking Through [44919a2a7f] Excluding Merge-Ins
This is equivalent to a diff from 08b1dfe720 to 44919a2a7f
2020-10-05
| ||
06:43 | Removed suspect change in server that seemed to be causing the obfusc-skel fails check-in: 134d87e018 user: matt tags: v1.71_old | |
2020-10-03
| ||
21:25 | Pull in all v1.65 before patching with prereq fix. check-in: b50ac24a01 user: matt tags: v1.70-defunct | |
20:20 | Patched in fix for prereqs-not-met issue. Closed-Leaf check-in: 7ee0b77302 user: matt tags: v1.65-refactor-fix-2, v1.65-failed-testdat (unpublished) | |
2020-09-25
| ||
21:24 | Turn off WAL mode on no-sync db Closed-Leaf check-in: 300c036c99 user: mrwellan tags: v1.65-transaction-locking | |
2020-09-24
| ||
23:28 | Try locking transactions in no-sync db check-in: 44919a2a7f user: mrwellan tags: v1.65-transaction-locking | |
15:50 | Missed case of relying on run_duration field - fixed. ==/FAIL/orion,mars/== Closed-Leaf check-in: 08b1dfe720 user: mrwellan tags: v1.65-failed-testdat | |
2020-09-23
| ||
15:16 | Do not exit on failure to create directory - race conditons on NFS cause false fail scenarios - just keep going and cross your fingers... ==/FAIL/orion,mars/== check-in: c5b1d00b22 user: mrwellan tags: v1.6572, v1.65-failed-testdat | |
Modified db.scm from [2c7b396933] to [a1e4ee1a15].
︙ | ︙ | |||
1778 1779 1780 1781 1782 1783 1784 | (with-input-from-file infile read-lines) ))) ;; check duration against test-run.dat file if it exists and update the value in ;; the db if necessary ;; (define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration) | | > > > | | 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 | (with-input-from-file infile read-lines) ))) ;; check duration against test-run.dat file if it exists and update the value in ;; the db if necessary ;; (define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration) (let* ((datf (conc run-dir "/.mt_data/test-run.dat")) (modt (if (and (file-exists? datf) (file-read-access? datf)) (file-modification-time datf) #f)) ;; (+ event-time run-duration)))) (alt-run-duration (if modt (- modt event-time) #f))) (debug:print 0 *default-log-port* "Test " test-id " datf " datf " modt " modt " alt-run-duration " alt-run-duration) (if (and alt-run-duration (> alt-run-duration run-duration)) (begin (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration) (db:with-db dbstruct #f #f (lambda (db) (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id) #t))) (begin (debug:print 0 *default-log-port* "Test " test-id " run duration correct. No adjustment needed.") #f)))) ;; #f = we did NOT adjust the time (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) ;; The default running-deadtime is 720 seconds = 12 minutes. ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) |
︙ | ︙ | |||
4017 4018 4019 4020 4021 4022 4023 | (db:with-db dbstruct #f #f (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () | | > | > | | | | > > | | | | | | | 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 | (db:with-db dbstruct #f #f (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () ;; NB// Pass the db so it is part of the transaction (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item ;; item-path is used to exclude current state/status of THIS test (let* ((state-status-counts (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) (state-stauses (db:roll-up-rules state-status-counts state status)) (newstate (car state-stauses)) (newstatus (cadr state-stauses))) (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name ":"item-path" newstate="newstate" newstatus="newstatus " len(sscs)="(length state-status-counts) " state-status-counts: " (apply conc (map (lambda (x) (conc (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) state-status-counts))); end debug:print (if tl-test-id (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct )))))) (mutex-unlock! *db-transaction-mutex*) (if (and test-id state status (equal? status "AUTO")) (db:test-data-rollup dbstruct run-id test-id status)) tr-res))))) |
︙ | ︙ | |||
4105 4106 4107 4108 4109 4110 4111 | dbstruct #f #f (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () (let* ((state-status-counts (db:get-all-state-status-counts-for-run dbstruct run-id)) | | | | | | 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 | dbstruct #f #f (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () (let* ((state-status-counts (db:get-all-state-status-counts-for-run dbstruct run-id)) (state-stauses (db:roll-up-rules state-status-counts #f #f )) (newstate (car state-stauses)) (newstatus (cadr state-stauses))) (if (or (not (eq? newstate curr-state)) (not (eq? newstatus curr-status))) (db:set-run-state-status dbstruct run-id newstate newstatus ))))))) (mutex-unlock! *db-transaction-mutex*) tr-res)))) (define (db:get-all-state-status-counts-for-run dbstruct run-id) (let* ((test-count-recs (db:with-db dbstruct #f #f |
︙ | ︙ |
Modified launch.scm from [fb7acc7e32] to [363e2a7eb1].
︙ | ︙ | |||
103 104 105 106 107 108 109 | ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) ;; Since we should have a clean slate at this time there is no need to do ;; any of the other stuff that tests:test-set-status! does. Let's just ;; force RUNNING/n/a ;; (thread-sleep! 0.3) ;; (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") | > > | > | 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) ;; Since we should have a clean slate at this time there is no need to do ;; any of the other stuff that tests:test-set-status! does. Let's just ;; force RUNNING/n/a ;; (thread-sleep! 0.3) ;; (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") (common:soft-lock (lambda () (rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f)) "set-state-status-roll-up") ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here ;; if there is a runscript do it first (if fullrunscript (let ((pid (process-run fullrunscript))) (rmt:test-set-top-process-pid run-id test-id pid) (let loop ((i 0)) |
︙ | ︙ | |||
1351 1352 1353 1354 1355 1356 1357 | #t) (else #f)))) (when do-scan? (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) | | | | > > > > > > > > > > > > > > > > > > > > | 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 | #t) (else #f)))) (when do-scan? (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) ;; weak locking - slow down concurrent activities but don't entirely stop them from ;; overlapping. wait for up to 5 seconds ;; (define (common:soft-lock proc keyname #!key (wait-time 5)) (let* ((start-time (current-seconds)) (my-key (conc (current-process-id) "-" (get-host-name)))) (let loop ((key-val (rmt:no-sync-get/default keyname #f))) (if (and key-val (not (equal? key-val my-key)) (< (- (current-seconds) start-time) wait-time)) (begin (thread-sleep! 0.5) (debug:print-info 0 *default-log-port* "Still trying to lock for " keyname) (loop (current-seconds))) ;; try again (begin ;; either we got the lock or we timed out - proceed as if (rmt:no-sync-set keyname my-key) (if (equal? (rmt:no-sync-get/default keyname #f) my-key) (begin (debug:print-info 0 *default-log-port* "Got lock for " keyname) (proc) (rmt:no-sync-del! keyname) (debug:print-info 0 *default-log-port* "Released lock for " keyname)) (common:soft-lock proc keyname wait-time: wait-time))))))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch |
︙ | ︙ | |||
1464 1465 1466 1467 1468 1469 1470 | (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) | > > | > | 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 | (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (common:soft-lock (lambda () (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f)) "set-state-status-roll-up") ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk *configdat* tconfig)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 *default-log-port* "Using work area " work-area)) |
︙ | ︙ |
Modified runs.scm from [e5ce9acbf0] to [11725dad72].
︙ | ︙ | |||
745 746 747 748 749 750 751 | ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) | > | | 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 | ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) (if (common:low-noise-print 900 (conc "mark-incomplete-" run-id)) (rmt:find-and-mark-incomplete run-id #f))))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) ;; (thread-join! th1) ;; just do the main stuff in the main thread (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests |
︙ | ︙ | |||
1844 1845 1846 1847 1848 1849 1850 | ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! | > | | 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 | ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (if (common:low-noise-print 900 (conc "mark-incomplete-" run-id)) (rmt:find-and-mark-incomplete run-id #f)) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) |
︙ | ︙ |