Overview
Comment: | Lots of little changes |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.80-revolution |
Files: | files | file ages | folders |
SHA1: |
fa88f0abd7a0e20f05fcaa1302727f05 |
User & Date: | mrwellan on 2023-11-21 15:42:59 |
Other Links: | branch diff | manifest | tags |
Context
2023-11-21
| ||
20:13 | Removing some of the mutex enabling around db:with-db seems to improve performance check-in: 7e63ac6bde user: matt tags: v1.80-revolution | |
15:42 | Lots of little changes check-in: fa88f0abd7 user: mrwellan tags: v1.80-revolution | |
2023-11-20
| ||
19:18 | Chipping away at server issues check-in: 30862628e2 user: mrwellan tags: v1.80-revolution | |
Changes
Modified TODO from [7c16b69c3b] to [e1f7aa3311].
︙ | ︙ | |||
14 15 16 17 18 19 20 21 22 23 24 25 26 27 | # # You should have received a copy of the GNU General Public License # along with Megatest. If not, see <http://www.gnu.org/licenses/>. TODO ==== 23WW45 - the revolution branch . Add "fast" db start option (no handshaking over NFS) . Add server-ro to server types (just "server" is fine for read/write). . Create pause-server and resume-server calls . Create rsync or cp sync to MTRAH function . Change rmt:send-receive to divert calls to read-only server when possible . Change start server to call main.db server for 1..N.db servers, block until server is read for use. | > > > | 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 | # # You should have received a copy of the GNU General Public License # along with Megatest. If not, see <http://www.gnu.org/licenses/>. TODO ==== 23WW46 - v1.80 branch . Use file semaphore to kill tests, eliminate db load of the KILLREQ query . Merge this change to revolution branch 23WW45 - the revolution branch . Add "fast" db start option (no handshaking over NFS) . Add server-ro to server types (just "server" is fine for read/write). . Create pause-server and resume-server calls . Create rsync or cp sync to MTRAH function . Change rmt:send-receive to divert calls to read-only server when possible . Change start server to call main.db server for 1..N.db servers, block until server is read for use. |
︙ | ︙ |
Modified api.scm from [7a64d5e9a2] to [b08fe263c7].
︙ | ︙ | |||
224 225 226 227 228 229 230 | (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct)))) (case cmd ((ping) #t) ;; we are fine (else (assert ok "FATAL: database file and run-id not aligned."))))) (ttdat *server-info*) (server-state (tt-state ttdat)) | | | | 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 | (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct)))) (case cmd ((ping) #t) ;; we are fine (else (assert ok "FATAL: database file and run-id not aligned."))))) (ttdat *server-info*) (server-state (tt-state ttdat)) (maxthreads 20) ;; make this a parameter? (status (cond ((and (> numthreads maxthreads) (> (random 100) 70)) ;; allow a 30% probability to go through so we can figure out what is going wrong in main.db server. 'busy) ;; ((> numthreads 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down. (else 'ok))) (errmsg (case status ((busy) (conc "Server overloaded, "numthreads" threads in flight")) ((loaded) (conc "Server loaded, "numthreads" threads in flight")) (else #f))) (result (case status ((busy) (if (eq? cmd 'ping) (normal-proc cmd run-id params) ;; numthreads must be greater than 5 for busy (* 0.1 (- numthreads maxthreads)) ;; was 15 )) ;; (- numthreads 29)) ;; call back in as many seconds ((loaded) ;; (if (eq? (rmt:transport-mode) 'tcp) ;; (thread-sleep! 0.5)) (normal-proc cmd run-id params)) (else (normal-proc cmd run-id params)))) |
︙ | ︙ |
Modified db.scm from [4cfe34e3d4] to [70a7ff684e].
︙ | ︙ | |||
2225 2226 2227 2228 2229 2230 2231 | '() db qry run-id (or last-update 0)))))) (define (db:get-testinfo-state-status dbstruct run-id test-id) | | | | > > | | | | | | > | | | 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 | '() db qry run-id (or last-update 0)))))) (define (db:get-testinfo-state-status dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (dbdat db) (let* ((res #f) (stmth (db:get-cache-stmth dbdat db "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;"))) (sqlite3:for-each-row (lambda (run-id testname item-path state status) ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) ;; db ;; "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" stmth test-id run-id) res)))) ;; get a useful subset of the tests data (used in dashboard ;; use db:mintest-get-{id ,run_id,testname ...} ;; (define (db:get-tests-for-run-mindata dbstruct run-id testpatt states statuses not-in) (db:get-tests-for-run dbstruct run-id testpatt states statuses #f #f not-in #f #f "id,run_id,testname,state,status,event_time,item_path" 0 #f)) |
︙ | ︙ | |||
2628 2629 2630 2631 2632 2633 2634 | ;; (define (db:get-test-state-status-by-id dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (dbdat db) | | | > > | | | | | | | 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 | ;; (define (db:get-test-state-status-by-id dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (dbdat db) (let ((res (cons #f #f)) (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=? AND run_id=?;"))) (db:with-mutex-for-stmth (lambda() (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test (lambda (state status) (cons state status)) ;; db stmth ;;"SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue test-id run-id))) res)))) ;; Use db:test-get* to access ;; Get test data using test_ids. NB// Only works within a single run!! ;; (define (db:get-test-info-by-ids dbstruct run-id test-ids) (db:with-db |
︙ | ︙ |
Modified dbfile.scm from [b5eea0764a] to [f0738065cd].
︙ | ︙ | |||
524 525 526 527 528 529 530 | dbname TEXT, mtversion TEXT, reason TEXT DEFAULT 'none', CONSTRAINT no_sync_processes UNIQUE (host,pid));" )))))) (on-tmp (equal? (car (string-split dbpath "/")) "tmp")) (db (if on-tmp | | | > | | 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 | dbname TEXT, mtversion TEXT, reason TEXT DEFAULT 'none', CONSTRAINT no_sync_processes UNIQUE (host,pid));" )))))) (on-tmp (equal? (car (string-split dbpath "/")) "tmp")) (db (if on-tmp (dbfile:cautious-open-database dbname init-proc 1 "WAL" force-init: #t) ;; WAL MODE should use syncronous=1 ;; (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t) (dbfile:cautious-open-database dbname init-proc 0 "MEMORY" force-init: #t) ;; (sqlite3:open-database dbname) ))) (if on-tmp ;; done in cautious-open-database (begin ;; (sqlite3:execute db "PRAGMA synchronous = 0;") ;; why was this here when is is handled by cautious-open-database? (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)))) db)) ;; mtest processes registry calls (define (dbfile:insert-or-update-process nsdb dat) (let* ((host (procinf-host dat)) |
︙ | ︙ | |||
1577 1578 1579 1580 1581 1582 1583 1584 | (result (or stmth (let* ((newstmth (sqlite3:prepare db stmt))) ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) (mutex-unlock! *get-cache-stmth-mutex*) result)) | > | > > > > > > > | 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 | (result (or stmth (let* ((newstmth (sqlite3:prepare db stmt))) ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) (mutex-unlock! *get-cache-stmth-mutex*) result)) (define *mutex-stmth-call* (make-mutex)) (define (db:with-mutex-for-stmth proc) (mutex-lock! *mutex-stmth-call*) (let* ((res (proc))) (mutex-unlock! *mutex-stmth-call*) res)) ) |
Modified launch.scm from [470997d4b0] to [4674140b46].
︙ | ︙ | |||
234 235 236 237 238 239 240 | ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) | < > | < < < | < < | 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 | ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds (get-df (current-directory)) disk-free)) (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-state-status-by-id run-id test-id)) (state (car test-info));; (db:test-get-state test-info)) (status (cdr test-info));; (db:test-get-status test-info)) (killreq (equal? state "KILLREQ")) (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond (killreq (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty (launch:handle-zombie-tests run-id)) (when do-sync (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second |
︙ | ︙ | |||
329 330 331 332 333 334 335 | (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. But run end of run before exiting? (launch:end-of-run-check run-id) (exit))) | | | < | | | | | 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 | (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. But run end of run before exiting? (launch:end-of-run-check run-id) (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (begin (thread-sleep! 6) ;; was 3 (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync)))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) |
︙ | ︙ |
Modified tcp-transportmod.scm from [d161e3f81c] to [46ac0a4bbf].
︙ | ︙ | |||
254 255 256 257 258 259 260 | (> delay-wait 0)) (begin (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay (let* ((raw-dly (if (number? result) result 0.1)) | | | 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 | (> delay-wait 0)) (begin (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay (let* ((raw-dly (if (number? result) result 0.1)) (dly (+ raw-dly (/ attemptnum 10)))) ;; (* raw-dly (/ attemptnum 2)))) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1)) (thread-sleep! dly) (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))) ((loaded) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.") (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)) |
︙ | ︙ |
Modified tests.scm from [6fa611f761] to [97b0ba1ab3].
︙ | ︙ | |||
1962 1963 1964 1965 1966 1967 1968 | ;;====================================================================== ;; test steps ;;====================================================================== ;; teststep-set-status! used to be here | > | | 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 | ;;====================================================================== ;; test steps ;;====================================================================== ;; teststep-set-status! used to be here ;; NOT NEEDED #;(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) (let* ((testdat (rmt:get-test-state-status-by-id run-id test-id))) (and testdat (equal? (car testdat) "KILLREQ")))) (define (test:tdb-get-rundat-count tdb) (if tdb (let ((res 0)) |
︙ | ︙ |