Overview
Comment: | wip |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.80 |
Files: | files | file ages | folders |
SHA1: |
abb2c7397456dc2a1867d0ebfe4458d8 |
User & Date: | mrwellan on 2023-03-03 17:37:11 |
Other Links: | branch diff | manifest | tags |
Context
2023-03-03
| ||
18:32 | no crashes (yet) check-in: 349234dd61 user: mrwellan tags: v1.80 | |
17:37 | wip check-in: abb2c73974 user: mrwellan tags: v1.80 | |
02:16 | Speculative fix for wrong server access issue check-in: 0f348aa84b user: mrwellan tags: v1.80 | |
Changes
Modified api.scm from [1736f4bcb3] to [9b20a45ffe].
︙ | ︙ | |||
241 242 243 244 245 246 247 | (if (not *server-signature*) (set! *server-signature* (tt:mk-signature *toppath*))) (lambda () (let* ((indat (deserialize)) (newcount (+ *api-process-request-count* 1)) (delay-wait (if (> newcount 10) (- newcount 10) | | > > > > > | | | | < | < | | | 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 | (if (not *server-signature*) (set! *server-signature* (tt:mk-signature *toppath*))) (lambda () (let* ((indat (deserialize)) (newcount (+ *api-process-request-count* 1)) (delay-wait (if (> newcount 10) (- newcount 10) 0)) (normal-proc (lambda (cmd run-id params) (case cmd ((ping) *server-signature*) (else (api:dispatch-request dbstruct cmd run-id params)))))) (set! *api-process-request-count* newcount) (set! *db-last-access* (current-seconds)) (match indat ((cmd run-id params meta) (let* ((ttdat *server-info*) (server-state (tt-state ttdat)) (status (cond ;; ((> newcount 600) 'busy) ;; ((> newcount 300) 'loaded) (else 'ok))) (errmsg (case status ((busy) (conc "Server overloaded, "newcount" threads in flight")) ((loaded) (conc "Server loaded, "newcount" threads in flight")) (else #f))) (result (case status ((busy) (- newcount 29)) ;; call back in as many seconds ((loaded) (normal-proc cmd run-id params)) (else (normal-proc cmd run-id params)))) (meta (case cmd ((ping) `((sstate . ,server-state))) (else `((wait . ,delay-wait))))) (payload (list status errmsg result meta))) (set! *api-process-request-count* (- *api-process-request-count* 1)) (serialize payload))) (else |
︙ | ︙ |
Modified db.scm from [5ed64563c8] to [5255866c2a].
︙ | ︙ | |||
2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 | ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 (set! res (cons (apply vector a b) res))) db (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id in (" (string-intersperse (map conc test-ids) ",") ");")) res)))) (define (db:get-test-info dbstruct run-id test-name item-path) (db:with-db dbstruct run-id #f (lambda (dbdat db) (db:get-test-info-db db run-id test-name item-path)))) | > > > > > > > > > > > > > > | 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 | ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 (set! res (cons (apply vector a b) res))) db (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id in (" (string-intersperse (map conc test-ids) ",") ");")) res)))) ;; try every second until tries times proc ;; (define (db:keep-trying-until-true proc params tries) (let* ((res (apply proc params))) (if res res (if (> tries 0) (begin (thread-sleep! 1) (db:keep-trying-until-true proc params (- tries 1))) (begin (debug:print-info 0 *default-log-port* "proc never returned true, params="params) #f))))) (define (db:get-test-info dbstruct run-id test-name item-path) (db:with-db dbstruct run-id #f (lambda (dbdat db) (db:get-test-info-db db run-id test-name item-path)))) |
︙ | ︙ | |||
3314 3315 3316 3317 3318 3319 3320 | ;; if test-name is an integer work off that as test-id instead of test-name test-path ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) ;; establish info on incoming test followed by info on top level test ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met (let* ((testdat (if (number? test-name) (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id | > > | > | 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 | ;; if test-name is an integer work off that as test-id instead of test-name test-path ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) ;; establish info on incoming test followed by info on top level test ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met (let* ((testdat (if (number? test-name) (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id (db:keep-trying-until-true ;; in our threaded stuff this call could happen before the test is registered (maybe?) db:get-test-info (list dbstruct run-id test-name item-path) 10))) (test-id (db:test-get-id testdat)) (test-name (if (number? test-name) (db:test-get-testname testdat) test-name)) (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) (tl-test-id (if tl-testdat |
︙ | ︙ |
Modified dbfile.scm from [776ffe336b] to [e6a2908d99].
︙ | ︙ | |||
15 16 17 18 19 20 21 | ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;;====================================================================== (declare (unit dbfile)) | | | | 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;;====================================================================== (declare (unit dbfile)) (declare (uses debugprint)) (declare (uses commonmod)) (module dbfile * (import scheme chicken data-structures extras matchable) (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18 srfi-1 srfi-69 stack files ports commonmod debugprint ) (define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic (define num-run-dbs (make-parameter 10)) ;; number of db's in .megatest (define dbfile:testsuite-name (make-parameter #f)) (define dbfile:sync-method (make-parameter 'attach)) ;; 'attach or 'original |
︙ | ︙ | |||
405 406 407 408 409 410 411 412 413 414 415 416 417 418 | (system cmd))) ;; opens and returns handle and nothing else ;; (define (dbfile:raw-open-no-sync-db dbpath) (if (not (file-exists? dbpath)) (create-directory dbpath #t)) (let* ((dbname (conc dbpath "/no-sync.db")) (db-exists (file-exists? dbname)) (init-proc (lambda (db) (if (not db-exists) (begin (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));")) ))) | > | 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 | (system cmd))) ;; opens and returns handle and nothing else ;; (define (dbfile:raw-open-no-sync-db dbpath) (if (not (file-exists? dbpath)) (create-directory dbpath #t)) (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db") (let* ((dbname (conc dbpath "/no-sync.db")) (db-exists (file-exists? dbname)) (init-proc (lambda (db) (if (not db-exists) (begin (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));")) ))) |
︙ | ︙ | |||
466 467 468 469 470 471 472 | ;; transaction protected lock aquisition ;; either: ;; fails returns (#f . lock-creation-time) ;; succeeds (returns (#t . lock-creation-time) ;; use (db:no-sync-del! db keyname) to release the lock ;; | > > | | 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 | ;; transaction protected lock aquisition ;; either: ;; fails returns (#f . lock-creation-time) ;; succeeds (returns (#t . lock-creation-time) ;; use (db:no-sync-del! db keyname) to release the lock ;; ;; ;; (define (db:no-sync-get-lock db keyname . identification) (sqlite3:with-transaction db (lambda () (condition-case (let* ((curr-val (db:no-sync-get/default db keyname #f))) (if curr-val `(#f . ,curr-val) ;; (sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname)) |
︙ | ︙ |
Modified launch.scm from [6fda936e21] to [b3f7d09843].
︙ | ︙ | |||
1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 | ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ( ;; (lock-key (conc "test-" test-id)) ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds ;; (if (car lock) ;; #t ;; (if (> (current-seconds) expire-time) | > | 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 | ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (assert runname "FATAL: launch-test called with no runname") (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ( ;; (lock-key (conc "test-" test-id)) ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds ;; (if (car lock) ;; #t ;; (if (> (current-seconds) expire-time) |
︙ | ︙ | |||
1563 1564 1565 1566 1567 1568 1569 | ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f) ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk *configdat* tconfig)) (debug:print 2 *default-log-port* "best disk path = " diskpath) (if diskpath | | | 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 | ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f) ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk *configdat* tconfig)) (debug:print 2 *default-log-port* "best disk path = " diskpath) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 *default-log-port* "Using work area " work-area)) (begin (set! work-area (conc test-path "/tmp_run")) (create-directory work-area #t) (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run"))) |
︙ | ︙ |
Modified megatest.scm from [a9076c4fb8] to [c6a7ef8a82].
︙ | ︙ | |||
93 94 95 96 97 98 99 100 101 102 103 104 105 106 | (define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file ;; set some parameters here - these need to be put in something that can be loaded from other ;; executables such as dashboard and mtutil ;; (include "transport-mode.scm") (dbfile:db-init-proc db:initialize-main-db) ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file ;; (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc"))) (if (common:file-exists? debugcontrolf) (load debugcontrolf))) | > | 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 | (define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file ;; set some parameters here - these need to be put in something that can be loaded from other ;; executables such as dashboard and mtutil ;; (include "transport-mode.scm") (dbfile:db-init-proc db:initialize-main-db) (debug:enable-timestamp #t) ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file ;; (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc"))) (if (common:file-exists? debugcontrolf) (load debugcontrolf))) |
︙ | ︙ |
Modified tcp-transportmod.scm from [2182e473e2] to [8018e361e3].
︙ | ︙ | |||
206 207 208 209 210 211 212 | (if (and (number? delay-wait) (> delay-wait 0)) (begin (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay | | | | 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 | (if (and (number? delay-wait) (> delay-wait 0)) (begin (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is overloaded, will try again in "result" seconds.") (thread-sleep! (if (number? result) result 2)) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) ((loaded) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, will try again in a 1/4 second.") (thread-sleep! 0.25) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) (else (if (not res) (let* ((host (tt-conn-host conn)) |
︙ | ︙ | |||
265 266 267 268 269 270 271 | ;; returns list of (host port startseconds server-id servinfofile) ;; (define (tt:get-server-info-sorted ttdat dbfname) (let* ((areapath (tt-areapath ttdat)) (sfiles (tt:find-server areapath dbfname)) (sdats (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read (sorted (sort sdats (lambda (a b) | | > > > > > > | 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | ;; returns list of (host port startseconds server-id servinfofile) ;; (define (tt:get-server-info-sorted ttdat dbfname) (let* ((areapath (tt-areapath ttdat)) (sfiles (tt:find-server areapath dbfname)) (sdats (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read (sorted (sort sdats (lambda (a b) (< (list-ref a 2)(list-ref b 2))))) (count 0)) (for-each (lambda (rec) (debug:print 0 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", ")) (set! count (+ count 1))) sorted) sorted)) (define (tt:get-current-server-info ttdat dbfname) (assert (tt-areapath ttdat) "FATAL: areapath not set in ttdat.") ;; ;; TODO - replace most of below with tt;get-server-info-sorted ;; |
︙ | ︙ | |||
385 386 387 388 389 390 391 | (thread-sleep! 0.05) ;; any real need for delay here? (let loop () (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) | < | | | > | | | | | | | > | | | | | > > > > | > > > | | | | | > | > | > | 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 | (thread-sleep! 0.05) ;; any real need for delay here? (let loop () (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) (let* ((res (if db-locked-in #t (let* ((lockinfo (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (db:no-sync-get-lock db dbfname)))) (success (car lockinfo))) (if success (begin (tt-state-set! ttdat 'running) (debug:print 0 *default-log-port* "Got server lock for " dbfname) (set! db-locked-in #t) #t) (begin (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname) #f)))))) (if res (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) res)) (else (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) (let* ((leadsrv (car servers))) (match leadsrv ((host port startseconds server-id pid dbfname servinfofile) (let* ((res (tt:ping host port server-id))) (debug:print-info 0 *default-log-port* "Ping to "host":"port", with server-id "server-id ", and file "servinfofile" returned "res) (if res #f ;; not the server, but all good, want to exit (if (and (file-exists? servinfofile) (> (- (current-seconds)(file-modification-time servinfofile)) 30)) (begin ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile) (delete-file* servinfofile) #t) ;; not the server but the server is not reachable (begin (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", trying again.") (thread-sleep! 1) ;; just because #t))))) (else ;; should never get here (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) (assert #f "Bad server record "leadsrv)))))))) (if ok ;; (if (> *api-process-request-count* 0) ;; have requests in flight ;; (tt-last-access-set! ttdat (current-seconds))) (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access (begin (cleanup) (exit))) (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (curr-secs (current-seconds))) (if (and (eq? (tt-state ttdat) 'running) (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem? (begin (set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds)) ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) (if (< (- (current-seconds) (tt-last-access ttdat)) 60) (begin (thread-sleep! 5) (loop))))) |
︙ | ︙ |