Overview
Comment: | Improved messages in couple noisy cases. Add flag for turning off touching of server logs when max api calls situation occurs |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.64 |
Files: | files | file ages | folders |
SHA1: |
bc2a1779bc75107f4d6bcee57846da7c |
User & Date: | matt on 2017-05-31 23:05:24 |
Other Links: | branch diff | manifest | tags |
Context
2017-06-01
| ||
00:21 | Corrected issue with server overload handling. Switch to using a less burdensome server ping instead of server:check-if-running check-in: fd6bcc8b75 user: matt tags: v1.64 | |
2017-05-31
| ||
23:05 | Improved messages in couple noisy cases. Add flag for turning off touching of server logs when max api calls situation occurs check-in: bc2a1779bc user: matt tags: v1.64 | |
2017-05-30
| ||
10:33 | bumped version check-in: 9dbae6fda1 user: bjbarcla tags: v1.64, v1.6416 | |
Changes
Modified api.scm from [5e34e05ad5] to [656c55c8ea].
︙ | ︙ | |||
125 126 127 128 129 130 131 132 133 134 135 136 137 138 | (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) | > | 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) |
︙ | ︙ |
Modified common.scm from [f90660c3ad] to [ebe13dceae].
︙ | ︙ | |||
43 44 45 46 47 48 49 | (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val) (setenv key val)) (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)))) (define home (getenv "HOME")) (define user (getenv "USER")) | | | 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val) (setenv key val)) (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)))) (define home (getenv "HOME")) (define user (getenv "USER")) ;; GLOBALS ;; CONTEXTS (defstruct cxt (taskdb #f) (cmutex (make-mutex))) ;; (define *contexts* (make-hash-table)) ;; (define *context-mutex* (make-mutex)) |
︙ | ︙ | |||
126 127 128 129 130 131 132 133 134 135 136 137 138 139 | (define *run-id* #f) (define *server-kind-run* (make-hash-table)) (define *home-host* #f) ;; (define *total-non-write-delay* 0) (define *heartbeat-mutex* (make-mutex)) (define *api-process-request-count* 0) (define *max-api-process-requests* 0) ;; client (define *rmt-mutex* (make-mutex)) ;; remote access calls mutex ;; RPC transport (define *rpc:listener* #f) | > | 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | (define *run-id* #f) (define *server-kind-run* (make-hash-table)) (define *home-host* #f) ;; (define *total-non-write-delay* 0) (define *heartbeat-mutex* (make-mutex)) (define *api-process-request-count* 0) (define *max-api-process-requests* 0) (define *server-overloaded* #f) ;; client (define *rmt-mutex* (make-mutex)) ;; remote access calls mutex ;; RPC transport (define *rpc:listener* #f) |
︙ | ︙ |
Modified db.scm from [4838550ec8] to [de6a9e2632].
︙ | ︙ | |||
400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 | (mutex-unlock! *db-multi-sync-mutex*) (db:sync-tables (db:sync-all-tables-list dbstruct) update_info tmpdb refndb mtdb)) (mutex-lock! *db-multi-sync-mutex*) (set! *db-last-sync* start-t) (set! *db-last-access* start-t) (mutex-unlock! *db-multi-sync-mutex*) (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))) ;; close all opened run-id dbs (define (db:close-all dbstruct) (if (dbr:dbstruct? dbstruct) (handle-exceptions exn (begin (debug:print 0 *default-log-port* "WARNING: Finalizing failed, " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain *default-log-port*)) ;; (db:sync-touched dbstruct 0 force-sync: #t) ;; NO. Do not do this here. Instead we rely on a server to be started when there are writes, even if the server itself is not going to be used as a server. (let ((tdbs (map db:dbdat-get-db (stack->list (dbr:dbstruct-dbstack dbstruct)))) (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb dbstruct))) (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct)))) (map (lambda (db) | > > > > > > > > > > > > > > > > | | | | | 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 | (mutex-unlock! *db-multi-sync-mutex*) (db:sync-tables (db:sync-all-tables-list dbstruct) update_info tmpdb refndb mtdb)) (mutex-lock! *db-multi-sync-mutex*) (set! *db-last-sync* start-t) (set! *db-last-access* start-t) (mutex-unlock! *db-multi-sync-mutex*) (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))) (define (db:safely-close-sqlite3-db db #!key (try-num 3)) (if (<= try-num 0) #f (handle-exceptions exn (begin (thread-sleep! 3) (sqlite3:interrupt! db) (db:safely-close-sqlite3-db db try-num: (- try-num 1))) (if (sqlite3:database? db) (begin (sqlite3:finalize! db) #t) #f)))) ;; close all opened run-id dbs (define (db:close-all dbstruct) (if (dbr:dbstruct? dbstruct) (handle-exceptions exn (begin (debug:print 0 *default-log-port* "WARNING: Finalizing failed, " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain *default-log-port*)) ;; (db:sync-touched dbstruct 0 force-sync: #t) ;; NO. Do not do this here. Instead we rely on a server to be started when there are writes, even if the server itself is not going to be used as a server. (let ((tdbs (map db:dbdat-get-db (stack->list (dbr:dbstruct-dbstack dbstruct)))) (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb dbstruct))) (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct)))) (map (lambda (db) (db:safely-close-sqlite3-db db)) ;; (if (sqlite3:database? db) ;; (sqlite3:finalize! db))) tdbs) (db:safely-close-sqlite3-db mdb) ;; (if (sqlite3:database? mdb) (sqlite3:finalize! mdb)) (db:safely-close-sqlite3-db rdb))))) ;; (if (sqlite3:database? rdb) (sqlite3:finalize! rdb)))))) ;; (let ((locdbs (dbr:dbstruct-locdbs dbstruct))) ;; (if (hash-table? locdbs) ;; (for-each (lambda (run-id) ;; (db:close-run-db dbstruct run-id)) ;; (hash-table-keys locdbs))))) |
︙ | ︙ |
Modified http-transport.scm from [bb11c9d077] to [2c955425b7].
︙ | ︙ | |||
432 433 434 435 436 437 438 439 440 441 442 443 444 445 | (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* (> (+ last-access server-timeout) (current-seconds)) (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions | > | 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 | (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* (not *server-overloaded*) (> (+ last-access server-timeout) (current-seconds)) (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions |
︙ | ︙ |
Modified launch.scm from [66b5d11122] to [98f81b97d0].
︙ | ︙ | |||
550 551 552 553 554 555 556 | (set-signal-handler! signal/term sighand) ) ;; (set-signal-handler! signal/stop sighand) ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let* ((test-info (rmt:get-test-info-by-id run-id test-id)) | > | > > > | 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 | (set-signal-handler! signal/term sighand) ) ;; (set-signal-handler! signal/stop sighand) ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let* ((test-info (rmt:get-test-info-by-id run-id test-id)) (test-host (if test-info (db:test-get-host test-info) (begin (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running |
︙ | ︙ | |||
1224 1225 1226 1227 1228 1229 1230 | (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ((item-path (item-list->path itemdat)) (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1))) (if (> launch-delay delta) (begin | > | | 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 | (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ((item-path (item-list->path itemdat)) (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1))) (if (> launch-delay delta) (begin (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) (change-directory *toppath*) (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) (append (list (list "MT_RUN_AREA_HOME" *toppath*) |
︙ | ︙ |
Modified tests/fdktestqa/testqa/megatest.config from [d32541500d] to [a5c70b7363].
1 2 3 | [setup] testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log # launchwait no | | | > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | [setup] testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log # launchwait no launch-delay 1 [server] # runtime 180 # timeout is in hours timeout 100 # All these are overridden in ../fdk.config # [jobtools] # launcher nbfake # launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log [include ../fdk.config] |
︙ | ︙ |
tests/fdktestqa/testqa/tests/bigrun/step1.sh became executable with contents [e700391a61].
︙ | ︙ |
tests/fdktestqa/testqa/tests/bigrun2/step1.sh became executable with contents [f90152c7af].
︙ | ︙ |
Modified tests/fdktestqa/testqa/tests/bigrun2/testconfig from [ccc63b9335] to [37225a43b3].
1 2 3 4 5 6 7 8 9 10 11 12 13 | # Add additional steps here. Format is "stepname script" [ezsteps] step1 step1.sh # Test requirements are specified here [requirements] waiton bigrun priority 0 mode itemwait itemmap .*/ # Iteration for your tests are controlled by the items section [items] | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | # Add additional steps here. Format is "stepname script" [ezsteps] step1 step1.sh # Test requirements are specified here [requirements] waiton bigrun priority 0 mode itemwait itemmap .*/ # Iteration for your tests are controlled by the items section [items] NUMBER #{scheme (string-intersperse (map (lambda (x)(conc (if (getenv "USEBLAH") "blah/" "") x)) \ (map number->string (sort (let loop ((a 0)(res '())) \ (if (<= a (or (any->number (get-environment-variable "NUMTESTS")) 2500)) \ (loop (+ a 1)(cons a res)) res)) <))) " ")} # test_meta is a section for storing additional data on your test [test_meta] |
︙ | ︙ |
tests/fdktestqa/testqa/tests/bigrun3/step1.sh became executable with contents [f90152c7af].
︙ | ︙ |