Overview
Comment: | merged changes to reduce load of brute force syner |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65 | v1.6528 |
Files: | files | file ages | folders |
SHA1: |
f61052be3c1cb795806f4c8895add22d |
User & Date: | bjbarcla on 2019-03-29 13:17:59 |
Other Links: | branch diff | manifest | tags |
Context
2019-04-01
| ||
08:32 | Merged trim sleeps check-in: db1d22eadb user: mrwellan tags: v1.65 | |
2019-03-29
| ||
18:26 | wip Closed-Leaf check-in: 8e9712a5d3 user: bjbarcla tags: v1.65-dont-preqfail-completed | |
17:59 | Trim couple sleeps that seem unnecessary Closed-Leaf check-in: 1550ea7ddf user: mrwellan tags: v1.65-trim-sleeps | |
13:17 | merged changes to reduce load of brute force syner check-in: f61052be3c user: bjbarcla tags: v1.65, v1.6528 | |
11:36 | merged v1.65-nosleep Closed-Leaf check-in: 2ea9ce81b4 user: bjbarcla tags: v1.65-integ-19-03-29 | |
2019-03-28
| ||
16:10 | updated version check-in: 8293650fcd user: pjhatwal tags: v1.65 | |
Changes
Modified common.scm from [c41ac723cd] to [b1d85b703a].
︙ | ︙ | |||
316 317 318 319 320 321 322 | (define (common:version-changed?) (not (equal? (common:get-last-run-version) (common:version-signature)))) (define (common:api-changed?) (not (equal? (substring (->string megatest-version) 0 4) (substring (conc (common:get-last-run-version)) 0 4)))) | | > > > > > > | 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 | (define (common:version-changed?) (not (equal? (common:get-last-run-version) (common:version-signature)))) (define (common:api-changed?) (not (equal? (substring (->string megatest-version) 0 4) (substring (conc (common:get-last-run-version)) 0 4)))) (define (common:get-sync-lock-filepath) (let* ((tmp-area (common:get-db-tmp-area)) (lockfile (conc tmp-area "/megatest.db.sync-lock"))) lockfile)) ;; Move me elsewhere ... ;; RADT => Why do we meed the version check here, this is called only if version misma ;; (define (common:cleanup-db dbstruct #!key (full #f)) (apply db:multi-db-sync dbstruct 'schema |
︙ | ︙ | |||
597 598 599 600 601 602 603 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) | | | | | > > | 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) ; (handle-exceptions ; exn ; #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail. (if (common:file-exists? fname) (if (> (- (current-seconds)(file-modification-time fname)) expire-time) (begin (delete-file* fname) (common:simple-file-lock fname expire-time: expire-time)) #f) (let ((key-string (conc (get-host-name) "-" (current-process-id)))) (with-output-to-file fname (lambda () (print key-string))) (thread-sleep! 0.25) (if (common:file-exists? fname) (with-input-from-file fname (lambda () (equal? key-string (read-line)))) #f))) ; ) ) (define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) (let ((end-time (+ expire-time (current-seconds)))) (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) (if got-lock #t (if (> end-time (current-seconds)) |
︙ | ︙ | |||
935 936 937 938 939 940 941 | ) ) 0) (define (std-signal-handler signum) ;; (signal-mask! signum) | | | 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 | ) ) 0) (define (std-signal-handler signum) ;; (signal-mask! signum) (set! *time-to-exit* #t) ;;(debug:print-info 13 *default-log-port* "got signal "signum) (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly") ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway (exit)) (define (special-signal-handler signum) ;; (signal-mask! signum) |
︙ | ︙ |
Modified db.scm from [73e0450409] to [a146d876b8].
︙ | ︙ | |||
311 312 313 314 315 316 317 | ;; This routine creates the db if not already present. It is only called if the db is not already opened ;; (define (db:open-db dbstruct #!key (areapath #f)(do-sync #t)) ;; TODO: actually use areapath (let ((tmpdb-stack (dbr:dbstruct-dbstack dbstruct))) ;; RA => Returns the first reference in dbstruct (if (stack? tmpdb-stack) (db:get-db tmpdb-stack) ;; get previously opened db (will create new db handle if all in the stack are already used | > | | 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 | ;; This routine creates the db if not already present. It is only called if the db is not already opened ;; (define (db:open-db dbstruct #!key (areapath #f)(do-sync #t)) ;; TODO: actually use areapath (let ((tmpdb-stack (dbr:dbstruct-dbstack dbstruct))) ;; RA => Returns the first reference in dbstruct (if (stack? tmpdb-stack) (db:get-db tmpdb-stack) ;; get previously opened db (will create new db handle if all in the stack are already used (let* ((max-stale-tmp (configf:lookup-number *configdat* "server" "filling-db-max-stale-seconds" default: 10)) (dbpath (db:dbfile-path )) ;; path to tmp db area (dbexists (common:file-exists? dbpath)) (tmpdbfname (conc dbpath "/megatest.db")) (dbfexists (common:file-exists? tmpdbfname)) ;; (conc dbpath "/megatest.db"))) (mtdbexists (common:file-exists? (conc *toppath* "/megatest.db"))) (mtdbmodtime (if mtdbexists (common:lazy-sqlite-db-modification-time (conc *toppath* "/megatest.db")) #f)) (tmpdbmodtime (if dbfexists (common:lazy-sqlite-db-modification-time tmpdbfname) #f)) |
︙ | ︙ | |||
349 350 351 352 353 354 355 | (dbr:dbstruct-tmpdb-set! dbstruct tmpdb) (dbr:dbstruct-dbstack-set! dbstruct (make-stack)) ;; BB: why a stack? Why would the number of db's be indeterminate? Is this a legacy of 1.db 2.db .. ? (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb) ;; olddb is already a (cons db path) (dbr:dbstruct-refndb-set! dbstruct refndb) ;; (mutex-unlock! *rundb-mutex*) (if (and (or (not dbfexists) (and modtimedelta | | | 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | (dbr:dbstruct-tmpdb-set! dbstruct tmpdb) (dbr:dbstruct-dbstack-set! dbstruct (make-stack)) ;; BB: why a stack? Why would the number of db's be indeterminate? Is this a legacy of 1.db 2.db .. ? (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb) ;; olddb is already a (cons db path) (dbr:dbstruct-refndb-set! dbstruct refndb) ;; (mutex-unlock! *rundb-mutex*) (if (and (or (not dbfexists) (and modtimedelta (> modtimedelta max-stale-tmp))) ;; if db in tmp is over ten seconds older than the file in MTRA then do a sync back do-sync) (begin (debug:print 1 *default-log-port* "filling db " (db:dbdat-get-path tmpdb) " with data \n from " (db:dbdat-get-path mtdb) " mod time delta: " modtimedelta) (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb refndb tmpdb) ;touch tmp db to avoid wal mode wierdness (set! (file-modification-time tmpdbfname) (current-seconds)) (debug:print-info 13 *default-log-port* "db:sync-all-tables-list done.") |
︙ | ︙ | |||
1046 1047 1048 1049 1050 1051 1052 | ;; kill servers ((killservers) (for-each (lambda (server) (match-let (((mod-time host port start-time pid) server)) (if (and host pid) (tasks:kill-server host pid)))) | | > > > > | 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 | ;; kill servers ((killservers) (for-each (lambda (server) (match-let (((mod-time host port start-time pid) server)) (if (and host pid) (tasks:kill-server host pid)))) servers) ;; /tmp/bjbarcla/megatest_localdb/fullrun/.nfs.pdx.disks.icf_env_disk001.bjbarcla.gwa.issues.mtdev.matt-bisect.megatest.ext-tests.runs.all.v1.65.1.6524.dev.bb-24-justrollup-f8.rollup.fullrun/megatest.db.lock (delete-file* (common:get-sync-lock-filepath)) ) ;; clear out junk records ;; ((dejunk) (db:delay-if-busy mtdb) ;; ok to delay on mtdb (db:clean-up mtdb) (db:clean-up tmpdb) |
︙ | ︙ |
Modified megatest.scm from [a7dc9766fe] to [8964e71ae0].
︙ | ︙ | |||
424 425 426 427 428 429 430 431 432 433 434 435 436 437 | "-create-megatest-area" "-mark-incompletes" "-convert-to-norm" "-convert-to-old" "-import-megatest.db" "-sync-to-megatest.db" "-logging" "-v" ;; verbose 2, more than normal (normal is 1) "-q" ;; quiet 0, errors/warnings only "-diff-rep" ) args:arg-hash | > | 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 | "-create-megatest-area" "-mark-incompletes" "-convert-to-norm" "-convert-to-old" "-import-megatest.db" "-sync-to-megatest.db" "-sync-brute-force" "-logging" "-v" ;; verbose 2, more than normal (normal is 1) "-q" ;; quiet 0, errors/warnings only "-diff-rep" ) args:arg-hash |
︙ | ︙ | |||
1242 1243 1244 1245 1246 1247 1248 | (db:get-value-by-header run header x)) keys) "/")) (statuses (string-split (or (args:get-arg "-status") "") ",")) (run-id (db:get-value-by-header run header "id")) (runname (db:get-value-by-header run header "runname")) (states (string-split (or (args:get-arg "-state") "") ",")) (tests (if tests-spec | | | 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 | (db:get-value-by-header run header x)) keys) "/")) (statuses (string-split (or (args:get-arg "-status") "") ",")) (run-id (db:get-value-by-header run header "id")) (runname (db:get-value-by-header run header "runname")) (states (string-split (or (args:get-arg "-state") "") ",")) (tests (if tests-spec (rmt:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc ;; use qryvals if test-spec provided (if tests-spec (string-intersperse adj-tests-spec ",") ;; db:test-record-fields #f) #f 'normal) |
︙ | ︙ | |||
1378 1379 1380 1381 1382 1383 1384 | (hash-table-set! data "targets" (cons targetstr (hash-table-ref/default data "targets" '()))) ))) (let* ((run-id (db:get-value-by-header run header "id")) (runname (db:get-value-by-header run header "runname")) (states (string-split (or (args:get-arg "-state") "") ",")) (statuses (string-split (or (args:get-arg "-status") "") ",")) (tests (if tests-spec | | | 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 | (hash-table-set! data "targets" (cons targetstr (hash-table-ref/default data "targets" '()))) ))) (let* ((run-id (db:get-value-by-header run header "id")) (runname (db:get-value-by-header run header "runname")) (states (string-split (or (args:get-arg "-state") "") ",")) (statuses (string-split (or (args:get-arg "-status") "") ",")) (tests (if tests-spec (rmt:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc ;; use qryvals if test-spec provided (if tests-spec (string-intersperse adj-tests-spec ",") ;; db:test-record-fields #f) #f 'normal) |
︙ | ︙ | |||
1503 1504 1505 1506 1507 1508 1509 | (conc "\n rundir: " (get-value-by-fieldname test test-field-index "rundir")) ;; (db:test-get-rundir test) "") ;; "\n rundir: " (get-value-by-fieldname test test-field-index "") ;; (sdb:qry 'getstr ;; (filedb:get-path *fdb* ;; (db:test-get-rundir test) ;; ) ) ;; Each test ;; DO NOT remote run | | | 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 | (conc "\n rundir: " (get-value-by-fieldname test test-field-index "rundir")) ;; (db:test-get-rundir test) "") ;; "\n rundir: " (get-value-by-fieldname test test-field-index "") ;; (sdb:qry 'getstr ;; (filedb:get-path *fdb* ;; (db:test-get-rundir test) ;; ) ) ;; Each test ;; DO NOT remote run (let ((steps (rmt:get-steps-for-test run-id (db:test-get-id test)))) ;; (db:get-steps-for-test dbstruct run-id (db:test-get-id test)))) (for-each (lambda (step) (format #t " Step: ~20a State: ~10a Status: ~10a Time ~22a\n" (tdb:step-get-stepname step) (tdb:step-get-state step) (tdb:step-get-status step) |
︙ | ︙ | |||
2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 | 'killservers 'dejunk 'adj-testids 'old2new ;; 'new2old ) (set! *didsomething* #t))) (if (args:get-arg "-sync-to-megatest.db") (let* ((dbstruct (db:setup #f)) (tmpdbpth (cdr (dbr:dbstruct-tmpdb dbstruct))) (lockfile (conc tmpdbpth ".lock")) (locked (common:simple-file-lock lockfile)) (res (if locked | > > > > | 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 | 'killservers 'dejunk 'adj-testids 'old2new ;; 'new2old ) (set! *didsomething* #t))) (when (args:get-arg "-sync-brute-force") ((server:get-bruteforce-syncer (db:setup #t) persist-until-sync: #t)) (set! *didsomething* #t)) (if (args:get-arg "-sync-to-megatest.db") (let* ((dbstruct (db:setup #f)) (tmpdbpth (cdr (dbr:dbstruct-tmpdb dbstruct))) (lockfile (conc tmpdbpth ".lock")) (locked (common:simple-file-lock lockfile)) (res (if locked |
︙ | ︙ |
Modified server.scm from [8ce184eea5] to [a266765465].
︙ | ︙ | |||
498 499 500 501 502 503 504 | ;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) ;; (server:release-sync-lock) ;; (server:have-sync-lock?)) ;; (else #f)))) ;; moving this here as it needs access to db and cannot be in common. ;; | | | < | | | > > > > | < < | < > > > > | > > > > > | | > > > > > > > | | | | | | > | | | | | | < < < < < < < < < < < | > | < < < < | < < | > | | < < < < < < | < < < < < < < < < | < < | | | < < | > | | > > > | | > > | | < < < < < < < < < | | | | | | < < < < < > | < < < < > | < < | > > > > | | > | < < | < < < < < < < > > | < < < < < < < < < < < < < < < < < < < < < | < < < < < < < < > | < < < < < < < < | < | < < < < < < < < < < < < < < < | | < < < < < < < < < < | > | | | > > | 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 | ;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) ;; (server:release-sync-lock) ;; (server:have-sync-lock?)) ;; (else #f)))) ;; moving this here as it needs access to db and cannot be in common. ;; (define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) (let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log"))) (tmp-area (common:get-db-tmp-area)) (tmp-db (conc tmp-area "/megatest.db")) (staging-file (conc *toppath* "/.megatest.db")) (mtdbfile (conc *toppath* "/megatest.db")) (lockfile (common:get-sync-lock-filepath)) (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) (sync-cmd (if fork-to-background (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"") sync-cmd-core)) (default-min-intersync-delay 2) (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay)) (default-duty-cycle 0.1) (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle)) (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle) (calculate-off-time (lambda (work-duration duty-cycle) (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds))) (off-time min-intersync-delay) ;; adjusted in closure below. (do-a-sync (lambda () (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync) (let* ((finalres (let retry-loop ((num-tries 0)) (if (common:simple-file-lock lockfile) (begin (cond ((not (or fork-to-background persist-until-sync)) (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay " , off-time="off-time" seconds ]") (thread-sleep! (max off-time min-intersync-delay))) (else (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd)) (res2 (cond ((eq? 0 res) (delete-file* (conc mtdbfile ".backup")) (if (eq? 0 (file-size sync-log)) (delete-file sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) (set! off-time (calculate-off-time last-sync-seconds (cond ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) duty-cycle) (else (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle) default-duty-cycle)))) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) 'sync-completed) (else (system (conc "/bin/cp "sync-log" "sync-log".fail")) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") (if (file-exists? (conc mtdbfile ".backup")) (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) #f)))) (common:simple-file-release-lock lockfile) (BB> "released lockfile: " lockfile) (when (common:file-exists? lockfile) (BB> "DID NOT ACTUALLY RELEASE LOCKFILE")) res2) ;; end let );; end begin ;; else (cond (persist-until-sync (thread-sleep! 1) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)") (retry-loop (add1 num-tries))) (else (thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay))) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.") 'parallel-sync-in-progress)) ) ;; end if got lockfile ) )) (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres) finalres) ) ;; end lambda )) do-a-sync)) (define (server:writable-watchdog dbstruct) (thread-sleep! 10) ;; delay for startup (let* ((do-a-sync (server:get-bruteforce-syncer dbstruct)) (final-sync (server:get-bruteforce-syncer dbstruct fork-to-background: #t persist-until-sync: #t))) (when (and (not (args:get-arg "-sync-to-megatest.db")) ;; conditions under which we do not run the sync (args:get-arg "-server")) (let loop () (do-a-sync) (if (not *time-to-exit*) (loop))) ;; keep going unless time to exit ;; time to exit, close the no-sync db here (final-sync) (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))) |
Modified tests.scm from [c7490d0df8] to [3fce4840b2].
︙ | ︙ | |||
483 484 485 486 487 488 489 | tol "," units "," dcomment ",," ;; extra comma for status type ))) ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment. (rmt:csv->test-data run-id test-id dat) | > > > | | 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 | tol "," units "," dcomment ",," ;; extra comma for status type ))) ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment. (rmt:csv->test-data run-id test-id dat) ;; This was added in check-in a5adfa3f9a. Message was: "...added delay in set-values to allow for delayed write on server start" ;; I'm inserting an arbitrary rmt: call to force/ensure that the server is available to (hopefully) prevent a communication issue. (rmt:get-var "MEGATEST_VERSION") ;; this does NOTHING but ensure the server is reachable. This is almost certainly NOT needed :) ;; BB - commentiong out arbitrary 10 second wait (thread-sleep! 10) ;; add 10 second delay before quit incase rmt needs time to start a server. ))) ;; need to update the top test record if PASS or FAIL and this is a subtest ;;;;;; (if (not (equal? item-path "")) ;;;;;; (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status #f) ;;;;;) (if (or (and (string? comment) |
︙ | ︙ |