Overview
Comment: | Fixed param order issue that was causing some server crashes. Cleaned up some exception handing |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.60 |
Files: | files | file ages | folders |
SHA1: |
3a0e887fa84a31cc2684fedafb292e38 |
User & Date: | matt on 2014-11-18 00:15:30 |
Other Links: | branch diff | manifest | tags |
Context
2014-11-18
| ||
09:32 | Force start on servers before doing write heavy stuff. The transition when doing writes from direct access -> server needs improving. This is a work-around (but a good feature anyway. check-in: c101fd0ec9 user: mrwellan tags: v1.60 | |
00:15 | Fixed param order issue that was causing some server crashes. Cleaned up some exception handing check-in: 3a0e887fa8 user: matt tags: v1.60 | |
2014-11-17
| ||
20:32 | Adding back some tests check-in: faa19d1d5f user: matt tags: v1.60 | |
Changes
Modified common_records.scm from [785b03b237] to [08c9f6257d].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | ;;====================================================================== ;; Copyright 2006-2012, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;;====================================================================== ;; (use trace) (define (debug:calc-verbosity vstr) (cond ((number? vstr) vstr) ((not (string? vstr)) 1) ;; ((string-match "^\\s*$" vstr) 1) (vstr (let ((debugvals (filter number? (map string->number (string-split vstr ","))))) (cond | > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | ;;====================================================================== ;; Copyright 2006-2012, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;;====================================================================== ;; (use trace) ;; Some of these routines use: ;; ;; http://www.cs.toronto.edu/~gfb/scheme/simple-macros.html ;; ;; Syntax for defining macros in a simple style similar to function definiton, ;; when there is a single pattern for the argument list and there are no keywords. ;; ;; (define-simple-syntax (name arg ...) body ...) ;; (define-syntax define-simple-syntax (syntax-rules () ((_ (name arg ...) body ...) (define-syntax name (syntax-rules () ((name arg ...) (begin body ...))))))) (define-syntax common:handle-exceptions (syntax-rules () ((_ exn-in errstmt ...)(handle-exceptions exn-in errstmt ...)))) (define (debug:calc-verbosity vstr) (cond ((number? vstr) vstr) ((not (string? vstr)) 1) ;; ((string-match "^\\s*$" vstr) 1) (vstr (let ((debugvals (filter number? (map string->number (string-split vstr ","))))) (cond |
︙ | ︙ |
Modified db.scm from [f79d03493a] to [d047d1e2d5].
︙ | ︙ | |||
30 31 32 33 34 35 36 37 38 39 40 41 42 43 | (include "key_records.scm") (include "run_records.scm") (define *rundb-mutex* (make-mutex)) ;; prevent problems opening/closing rundb's (define *number-of-writes* 0) (define *number-non-write-queries* 0) ;; Get/open a database ;; if run-id => get run specific db ;; if #f => get main db ;; if db already open - return inmem ;; if db not open, open inmem, rundb and sync then return inmem ;; inuse gets set automatically for rundb's ;; | > > > > > > > > > > > > > > > > > > | 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | (include "key_records.scm") (include "run_records.scm") (define *rundb-mutex* (make-mutex)) ;; prevent problems opening/closing rundb's (define *number-of-writes* 0) (define *number-non-write-queries* 0) ;;====================================================================== ;; SQLITE3 HELPERS ;;====================================================================== ;; convert to -inline (define (db:first-result-default db stmt default . params) (handle-exceptions exn (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn) (if (eq? err-status 'done) default (begin (debug:print 0 "ERROR: query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain (current-error-port)) default))) (apply sqlite3:first-result db stmt params))) ;; Get/open a database ;; if run-id => get run specific db ;; if #f => get main db ;; if db already open - return inmem ;; if db not open, open inmem, rundb and sync then return inmem ;; inuse gets set automatically for rundb's ;; |
︙ | ︙ | |||
427 428 429 430 431 432 433 434 435 436 437 438 439 440 | ;; (define (db:sync-tables tbls fromdb todb . slave-dbs) (mutex-lock! *db-sync-mutex*) (handle-exceptions exn (begin (debug:print 0 "EXCEPTION: database probably overloaded or unreadable in db:sync-tables.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (debug:print 0 " src db: " (db:dbdat-get-path fromdb)) (for-each (lambda (dbdat) (debug:print 0 " dbpath: " (db:dbdat-get-path dbdat))) (cons todb slave-dbs)) | > | > | | 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 | ;; (define (db:sync-tables tbls fromdb todb . slave-dbs) (mutex-lock! *db-sync-mutex*) (handle-exceptions exn (begin (debug:print 0 "EXCEPTION: database probably overloaded or unreadable in db:sync-tables.") (print-call-chain (current-error-port)) (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (debug:print 0 " src db: " (db:dbdat-get-path fromdb)) (for-each (lambda (dbdat) (debug:print 0 " dbpath: " (db:dbdat-get-path dbdat))) (cons todb slave-dbs)) (if *server-run* ;; we are inside a server (set! *time-to-exit* #t) ;; let watch dog know that it is time to die. (exit 1))) (cond ((not fromdb) (debug:print 3 "WARNING: db:sync-tables called with fromdb missing") -1) ((not todb) (debug:print 3 "WARNING: db:sync-tables called with todb missing") -2) ((not (sqlite3:database? (db:dbdat-get-db fromdb))) (debug:print 0 "ERROR: db:sync-tables called with fromdb not a database " fromdb) -3) ((not (sqlite3:database? (db:dbdat-get-db todb))) (debug:print 0 "ERROR: db:sync-tables called with todb not a database " todb) -4) |
︙ | ︙ | |||
1861 1862 1863 1864 1865 1866 1867 | (define (db:test-get-top-process-pid dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (db) | > > | > | | 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 | (define (db:test-get-top-process-pid dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (db) (db:first-result-default db "SELECT attemptnum FROM tests WHERE id=?;" #f test-id)))) (define db:test-record-fields '("id" "run_id" "testname" "state" "status" "event_time" "host" "cpuload" "diskfree" "uname" "rundir" "item_path" "run_duration" "final_logf" "comment" "shortdir" "attemptnum")) ;; fields *must* be a non-empty list ;; |
︙ | ︙ | |||
2658 2659 2660 2661 2662 2663 2664 | ;; if the test is not found then clearly the waiton is not met... ;; (if (not ever-seen)(set! result (cons waitontest-name result))))) (if (not ever-seen) (set! result (append (if (null? tests)(list waitontest-name) tests) result))))) waitons) (delete-duplicates result)))) | < < < < < < < < < < < < < < < < < < | 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 | ;; if the test is not found then clearly the waiton is not met... ;; (if (not ever-seen)(set! result (cons waitontest-name result))))) (if (not ever-seen) (set! result (append (if (null? tests)(list waitontest-name) tests) result))))) waitons) (delete-duplicates result)))) ;;====================================================================== ;; Extract ods file from the db ;;====================================================================== ;; NOT REWRITTEN YET!!!!! ;; runspatt is a comma delimited list of run patterns |
︙ | ︙ |
Modified http-transport.scm from [0ec84bc71c] to [068fb6dcc3].
︙ | ︙ | |||
461 462 463 464 465 466 467 | ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state)) | > | > | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 | ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state)) (http-transport:server-shutdown server-id port))))) (define (http-transport:server-shutdown server-id port) (debug:print-info 0 "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) (set! *time-to-exit* #t) (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) ;; ;; start_shutdown ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (portlogger:open-run-close portlogger:set-port port "released") (thread-sleep! 5) (debug:print-info 0 "Max cached queries was " *max-cache-size*) (debug:print-info 0 "Number of cached writes " *number-of-writes*) (debug:print-info 0 "Average cached write time " (if (eq? *number-of-writes* 0) "n/a (no writes)" (/ *writes-total-delay* *number-of-writes*)) " ms") (debug:print-info 0 "Number non-cached queries " *number-non-write-queries*) (debug:print-info 0 "Average non-cached time " (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running") (exit)) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) (let* ((tdbdat (tasks:open-db))) |
︙ | ︙ |
Modified launch.scm from [3011b582fa] to [e538ecba1a].
︙ | ︙ | |||
62 63 64 65 66 67 68 | (top-path (assoc/default 'toppath cmdinfo)) (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (ezsteps (assoc/default 'ezsteps cmdinfo)) ;; (runremote (assoc/default 'runremote cmdinfo)) (transport (assoc/default 'transport cmdinfo)) | | | 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | (top-path (assoc/default 'toppath cmdinfo)) (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (ezsteps (assoc/default 'ezsteps cmdinfo)) ;; (runremote (assoc/default 'runremote cmdinfo)) (transport (assoc/default 'transport cmdinfo)) ;; (serverinf (assoc/default 'serverinf cmdinfo)) (port (assoc/default 'port cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) (test-id (assoc/default 'test-id cmdinfo)) (target (assoc/default 'target cmdinfo)) (itemdat (assoc/default 'itemdat cmdinfo)) (env-ovrd (assoc/default 'env-ovrd cmdinfo)) (set-vars (assoc/default 'set-vars cmdinfo)) ;; pre-overrides from -setvar |
︙ | ︙ | |||
785 786 787 788 789 790 791 | (debug:print 0 "WARNING: No disk work area specified - running in the test directory under tmp_run"))) (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) (list 'transport (conc *transport-type*)) | | | 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 | (debug:print 0 "WARNING: No disk work area specified - running in the test directory under tmp_run"))) (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) (list 'transport (conc *transport-type*)) ;; (list 'serverinf *server-info*) (list 'toppath *toppath*) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript) (list 'run-id run-id ) (list 'test-id test-id ) ;; (list 'item-path item-path ) |
︙ | ︙ |
Modified tasks.scm from [f329b3e32b] to [d12c4149c9].
︙ | ︙ | |||
26 27 28 29 30 31 32 | ;; (define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f)) (if (not (string? path)) (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)") (let ((fullpath (conc path "-journal"))) (handle-exceptions exn | > > > > > | | 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | ;; (define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f)) (if (not (string? path)) (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)") (let ((fullpath (conc path "-journal"))) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 " exn=" (condition->list exn)) (debug:print 0 "tasks:wait-on-journal failed. Continuing on, you can ignore this call-chain") #t) ;; if stuff goes wrong just allow it to move on (let loop ((journal-exists (file-exists? fullpath)) (count n)) ;; wait ten times ... (if journal-exists (begin (if (and waiting-msg (eq? (modulo n 30) 0)) (debug:print 0 waiting-msg)) |
︙ | ︙ | |||
306 307 308 309 310 311 312 | (vector header res))) (define (tasks:get-server mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn | | > | | | | | | | | | | 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 | (vector header res))) (define (tasks:get-server mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 "WARNING: tasks:get-server db access error.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 " for run " run-id) (print-call-chain (current-error-port)) (if (> retries 0) (begin (debug:print 0 " trying call to tasks:get-server again in 10 seconds") (thread-sleep! 10) (tasks:get-server mdb run-id retries: (- retries 0))) (debug:print 0 "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\""))) (sqlite3:for-each-row (lambda (id interface port pubport transport pid hostname) (set! res (vector id interface port pubport transport pid hostname))) mdb ;; removed: ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ? "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers |
︙ | ︙ | |||
373 374 375 376 377 378 379 | (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) | < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 379 380 381 382 383 384 385 386 387 388 389 390 391 392 | (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) ;;====================================================================== ;; Tasks and Task monitors ;;====================================================================== ;;====================================================================== |
︙ | ︙ |
Modified tdb.scm from [13705ac22c] to [575d5c7ba8].
︙ | ︙ | |||
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | (dbexists (file-exists? dbpath)) (handler (make-busy-timeout (if (args:get-arg "-override-timeout") (string->number (args:get-arg "-override-timeout")) 136000)))) (handle-exceptions exn (begin (debug:print 2 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test" ((condition-property-accessor 'exn 'message) exn)) (set! db (sqlite3:open-database ":memory:")) ;; open an in-memory db to allow readonly access (set! dbexists #f)) ;; must force re-creation of tables, more tom-foolery (set! db (sqlite3:open-database dbpath))) (if *db-write-access* (sqlite3:set-busy-handler! db handler)) (if (not dbexists) (begin (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = FULL;") (debug:print-info 11 "Initialized test database " dbpath) (tdb:testdb-initialize db))) ;; (sqlite3:execute db "PRAGMA synchronous = 0;") (debug:print-info 11 "open-test-db END (sucessful)" work-area) ;; now let's test that everything is correct (handle-exceptions exn (begin (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test or remove the file " dbpath ".\n " ((condition-property-accessor 'exn 'message) exn)) #f) ;; Is there a cheaper single line operation that will check for existance of a table ;; and raise an exception ? (sqlite3:execute db "SELECT id FROM test_data LIMIT 1;")) | > > | 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | (dbexists (file-exists? dbpath)) (handler (make-busy-timeout (if (args:get-arg "-override-timeout") (string->number (args:get-arg "-override-timeout")) 136000)))) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 2 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test" ((condition-property-accessor 'exn 'message) exn)) (set! db (sqlite3:open-database ":memory:")) ;; open an in-memory db to allow readonly access (set! dbexists #f)) ;; must force re-creation of tables, more tom-foolery (set! db (sqlite3:open-database dbpath))) (if *db-write-access* (sqlite3:set-busy-handler! db handler)) (if (not dbexists) (begin (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = FULL;") (debug:print-info 11 "Initialized test database " dbpath) (tdb:testdb-initialize db))) ;; (sqlite3:execute db "PRAGMA synchronous = 0;") (debug:print-info 11 "open-test-db END (sucessful)" work-area) ;; now let's test that everything is correct (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test or remove the file " dbpath ".\n " ((condition-property-accessor 'exn 'message) exn)) #f) ;; Is there a cheaper single line operation that will check for existance of a table ;; and raise an exception ? (sqlite3:execute db "SELECT id FROM test_data LIMIT 1;")) |
︙ | ︙ |
Modified tests.scm from [3b87b51253] to [2a580a2e0e].
︙ | ︙ | |||
645 646 647 648 649 650 651 | (define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (uname (get-uname "-srvpio")) (hostname (get-host-name))) | < < < < < < < < < < < < < < < < < < < < < > > | 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 | (define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (uname (get-uname "-srvpio")) (hostname (get-host-name))) (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) ;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) (define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (remtries 10)) (handle-exceptions exn (if (> remtries 0) (begin (print-call-chain (current-error-port)) (debug:print-info 0 "WARNING: failed to set meta info. Will try " remtries " more times") (set! remtries (- remtries 1)) (thread-sleep! 10) (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up") (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) |
︙ | ︙ |