Overview
Comment: | http transport with try-nanomsg changes completed first pass Validation 100% |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | try-nanomsg |
Files: | files | file ages | folders |
SHA1: |
b260c4c5d969da46f3a01ba6188986d4 |
User & Date: | matt on 2014-11-29 20:33:57 |
Other Links: | branch diff | manifest | tags |
Context
2014-11-29
| ||
21:50 | Added a little time spread on client starting servers - try to avoid startup storms check-in: b3a83a0cec user: matt tags: try-nanomsg | |
20:33 | http transport with try-nanomsg changes completed first pass Validation 100% check-in: b260c4c5d9 user: matt tags: try-nanomsg | |
13:43 | Fixed breakage caused by enabling newdashboard. Sigh. check-in: 2f12af3ef0 user: matt tags: try-nanomsg | |
Changes
Modified client.scm from [eb2cd66f13] to [8f8fe7296f].
︙ | ︙ | |||
51 52 53 54 55 56 57 | ;; 2. We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; | | | 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | ;; 2. We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; (define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) (debug:print-info 2 "client:setup remaining-tries=" remaining-tries) (let* ((tdbdat (tasks:open-db))) (if (<= remaining-tries 0) (begin (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) (exit 1)) (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) |
︙ | ︙ | |||
84 85 86 87 88 89 90 91 92 93 94 95 96 97 | (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (case *transport-type* ((http)(http-transport:close-connections run-id))) (hash-table-delete! *runremote* run-id) (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat) " client:setup (server-dat = #t)") (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up | > | 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (case *transport-type* ((http)(http-transport:close-connections run-id))) (hash-table-delete! *runremote* run-id) (tasks:kill-server-run-id run-id) (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat) " client:setup (server-dat = #t)") (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up |
︙ | ︙ |
Modified common.scm from [7a1802c2d4] to [5faac0528e].
︙ | ︙ | |||
64 65 66 67 68 69 70 | (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) ;; SERVER (define *my-client-signature* #f) ;; (define *transport-type* 'nmsg) | | | 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 | (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) ;; SERVER (define *my-client-signature* #f) ;; (define *transport-type* 'nmsg) (define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg (define *runremote* (make-hash-table)) ;; if set up for server communication this will hold <host port> (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *client-non-blocking-mode* #f) (define *server-id* #f) (define *server-info* #f) (define *time-to-exit* #f) |
︙ | ︙ |
Modified db.scm from [e95ebbcbeb] to [ad92b83fd4].
︙ | ︙ | |||
615 616 617 618 619 620 621 622 623 624 625 626 627 628 | ;; ;; run-ids: '(1 2 3 ...) or #f (for all) ;; (define (db:multi-db-sync run-ids . options) (let* ((toppath (launch:setup-for-run)) (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f)) (mtdb (if toppath (db:open-megatest-db))) (run-ids (if run-ids run-ids (if toppath (begin (db:delay-if-busy mtdb) (db:get-all-run-ids mtdb))))) (tdbdat (tasks:open-db)) (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))) | > | 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 | ;; ;; run-ids: '(1 2 3 ...) or #f (for all) ;; (define (db:multi-db-sync run-ids . options) (let* ((toppath (launch:setup-for-run)) (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f)) (mtdb (if toppath (db:open-megatest-db))) (allow-cleanup (if run-ids #f #t)) (run-ids (if run-ids run-ids (if toppath (begin (db:delay-if-busy mtdb) (db:get-all-run-ids mtdb))))) (tdbdat (tasks:open-db)) (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))) |
︙ | ︙ | |||
661 662 663 664 665 666 667 668 | (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))) (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db") (db:replace-test-records dbstruct run-id testrecs) (sqlite3:finalize! (db:dbdat-get-db (dbr:dbstruct-get-rundb dbstruct))))) run-ids))) ;; now ensure all newdb data are synced to megatest.db (if (member 'new2old options) | > > > > > > > > | | > > | | | | > | > | > > > | > > | > > > > > > > > > | 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 | (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))) (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db") (db:replace-test-records dbstruct run-id testrecs) (sqlite3:finalize! (db:dbdat-get-db (dbr:dbstruct-get-rundb dbstruct))))) run-ids))) ;; now ensure all newdb data are synced to megatest.db ;; do not use the run-ids list passed in to the function ;; (if (member 'new2old options) (let* ((maindb (make-dbr:dbstruct path: toppath local: #t)) (src-run-ids (db:get-all-run-ids (db:dbdat-get-db (db:get-db maindb 0)))) (all-run-ids (sort (delete-duplicates (cons 0 src-run-ids)) <)) (count 1) (total (length all-run-ids)) (dead-runs '())) (for-each (lambda (run-id) (debug:print 0 "Processing run " (if (eq? run-id 0) " main.db " run-id) ", " count " of " total) (set! count (+ count 1)) (let* ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)) (frundb (db:dbdat-get-db (db:get-db fromdb run-id)))) ;; (db:delay-if-busy frundb) ;; (db:delay-if-busy mtdb) ;; (db:clean-up frundb) (if (eq? run-id 0) (begin (db:sync-tables (db:sync-main-list dbstruct) (db:get-db fromdb #f) mtdb) (set! dead-runs (db:clean-up-maindb (db:get-db fromdb #f)))) (begin ;; NB// must sync first to ensure deleted tests get marked as such in megatest.db (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb) (db:clean-up-rundb (db:get-db fromdb run-id)) )))) all-run-ids) ;; removed deleted runs (let ((dbdir (tasks:get-task-db-path))) (for-each (lambda (run-id) (let ((fullname (conc dbdir "/" run-id ".db"))) (if (file-exists? fullname) (begin (debug:print 0 "Removing database file for deleted run " fullname) (delete-file fullname))))) dead-runs)))) ;; (db:close-all dbstruct) ;; (sqlite3:finalize! mdb) )) ;; keeping it around for debugging purposes only (define (open-run-close-no-exception-handling proc idb . params) (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params) |
︙ | ︙ | |||
991 992 993 994 995 996 997 | ;; a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown' ;; b. If test dir gone, delete the test record ;; 2. Look at run records ;; a. If have tests that are not deleted, set state='unknown' ;; b. .... ;; (define (db:clean-up dbdat) | | | 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 | ;; a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown' ;; b. If test dir gone, delete the test record ;; 2. Look at run records ;; a. If have tests that are not deleted, set state='unknown' ;; b. .... ;; (define (db:clean-up dbdat) ;; (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db") (let* ((db (db:dbdat-get-db dbdat)) (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests)+(SELECT count(id) FROM runs);")) (statements (map (lambda (stmt) (sqlite3:prepare db stmt)) (list ;; delete all tests that belong to runs that are 'deleted' |
︙ | ︙ | |||
1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 | (debug:print-info 0 "Records count after clean: " tot)) count-stmt))) (map sqlite3:finalize! statements) (sqlite3:finalize! count-stmt) ;; (db:find-and-mark-incomplete db) (db:delay-if-busy dbdat) (sqlite3:execute db "VACUUM;"))) ;;====================================================================== ;; M E T A G E T A N D S E T V A R S ;;====================================================================== ;; returns number if string->number is successful, string otherwise ;; also updates *global-delta* | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 | (debug:print-info 0 "Records count after clean: " tot)) count-stmt))) (map sqlite3:finalize! statements) (sqlite3:finalize! count-stmt) ;; (db:find-and-mark-incomplete db) (db:delay-if-busy dbdat) (sqlite3:execute db "VACUUM;"))) ;; Clean out old junk and vacuum the database ;; ;; Ultimately do something like this: ;; ;; 1. Look at test records either deleted or part of deleted run: ;; a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown' ;; b. If test dir gone, delete the test record ;; 2. Look at run records ;; a. If have tests that are not deleted, set state='unknown' ;; b. .... ;; (define (db:clean-up-rundb dbdat) ;; (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db") (let* ((db (db:dbdat-get-db dbdat)) (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests);")) (statements (map (lambda (stmt) (sqlite3:prepare db stmt)) (list ;; delete all tests that belong to runs that are 'deleted' ;; (conc "DELETE FROM tests WHERE run_id NOT IN (" (string-intersperse (map conc valid-runs) ",") ");") ;; delete all tests that are 'DELETED' "DELETE FROM tests WHERE state='DELETED';" )))) (db:delay-if-busy dbdat) (sqlite3:with-transaction db (lambda () (sqlite3:for-each-row (lambda (tot) (debug:print-info 0 "Records count before clean: " tot)) count-stmt) (map sqlite3:execute statements) (sqlite3:for-each-row (lambda (tot) (debug:print-info 0 "Records count after clean: " tot)) count-stmt))) (map sqlite3:finalize! statements) (sqlite3:finalize! count-stmt) ;; (db:find-and-mark-incomplete db) (db:delay-if-busy dbdat) (sqlite3:execute db "VACUUM;"))) ;; Clean out old junk and vacuum the database ;; ;; Ultimately do something like this: ;; ;; 1. Look at test records either deleted or part of deleted run: ;; a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown' ;; b. If test dir gone, delete the test record ;; 2. Look at run records ;; a. If have tests that are not deleted, set state='unknown' ;; b. .... ;; (define (db:clean-up-maindb dbdat) ;; (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db") (let* ((db (db:dbdat-get-db dbdat)) (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM runs);")) (statements (map (lambda (stmt) (sqlite3:prepare db stmt)) (list ;; delete all tests that belong to runs that are 'deleted' ;; (conc "DELETE FROM tests WHERE run_id NOT IN (" (string-intersperse (map conc valid-runs) ",") ");") ;; delete all tests that are 'DELETED' "DELETE FROM runs WHERE state='deleted';" ))) (dead-runs '())) (sqlite3:for-each-row (lambda (run-id) (set! dead-runs (cons run-id dead-runs))) db "SELECT id FROM runs WHERE state='deleted';") (db:delay-if-busy dbdat) (sqlite3:with-transaction db (lambda () (sqlite3:for-each-row (lambda (tot) (debug:print-info 0 "Records count before clean: " tot)) count-stmt) (map sqlite3:execute statements) (sqlite3:for-each-row (lambda (tot) (debug:print-info 0 "Records count after clean: " tot)) count-stmt))) (map sqlite3:finalize! statements) (sqlite3:finalize! count-stmt) ;; (db:find-and-mark-incomplete db) (db:delay-if-busy dbdat) (sqlite3:execute db "VACUUM;") dead-runs)) ;;====================================================================== ;; M E T A G E T A N D S E T V A R S ;;====================================================================== ;; returns number if string->number is successful, string otherwise ;; also updates *global-delta* |
︙ | ︙ | |||
2321 2322 2323 2324 2325 2326 2327 | (z3:decode-buffer (base64:base64-decode (string-substitute (regexp "_") "=" msg #t))) (lambda ()(deserialize))) (begin (debug:print 0 "ERROR: reception failed. Received " msg " but cannot translate it.") | | | 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 | (z3:decode-buffer (base64:base64-decode (string-substitute (regexp "_") "=" msg #t))) (lambda ()(deserialize))) (begin (debug:print 0 "ERROR: reception failed. Received " msg " but cannot translate it.") msg))) ;; crude reply for when things go awry ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize)))) (else msg))) (define (db:test-set-status-state dbstruct run-id test-id status state msg) (let ((dbdat (db:get-db dbstruct run-id))) (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) (db:general-call dbdat 'set-test-start-time (list test-id))) |
︙ | ︙ |
Modified http-transport.scm from [fc5c6a3a75] to [4218ba432d].
︙ | ︙ | |||
281 282 283 284 285 286 287 | (set! res (vector success (db:string->obj (handle-exceptions exn (begin (set! success #f) | | > | 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 | (set! res (vector success (db:string->obj (handle-exceptions exn (begin (set! success #f) (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (hash-table-delete! *runremote* run-id) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine. #f) (with-input-from-request ;; was dat fullurl (list (cons 'key "thekey") (cons 'cmd cmd) (cons 'params sparams)) |
︙ | ︙ | |||
311 312 313 314 315 316 317 | (thread-join! th1) (thread-terminate! th2) (debug:print-info 11 "got res=" res) (if (vector? res) (if (vector-ref res 0) res (begin ;; note: this code also called in nmsg-transport - consider consolidating it | | | | 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 | (thread-join! th1) (thread-terminate! th2) (debug:print-info 11 "got res=" res) (if (vector? res) (if (vector-ref res 0) res (begin ;; note: this code also called in nmsg-transport - consider consolidating it (debug:print 0 "ERROR: error occured at server, info=" (vector-ref res 2)) (debug:print 0 " client call chain:") (print-call-chain (current-error-port)) (debug:print 0 " server call chain:") (pp (vector-ref res 1) (current-error-port)) (signal (vector-ref result 0)))) (signal (make-composite-condition (make-property-condition 'timeout 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) ;; careful closing of connections stored in *runremote* |
︙ | ︙ |
Modified launch.scm from [34882953c1] to [111f911e42].
︙ | ︙ | |||
456 457 458 459 460 461 462 463 464 465 466 467 468 469 | (set! *configinfo* (find-and-read-config (if (args:get-arg "-config")(args:get-arg "-config") "megatest.config") environ-patt: "env-override" given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") pathenvvar: "MT_RUN_AREA_HOME")) (set! *configdat* (if (car *configinfo*)(car *configinfo*) #f)) (set! *toppath* (if (car *configinfo*)(cadr *configinfo*) #f)) (let ((linktree (configf:lookup *configdat* "setup" "linktree"))) ;; link tree is critical (if linktree (if (not (file-exists? linktree)) (begin (handle-exceptions exn (begin | > > > > > > > | 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 | (set! *configinfo* (find-and-read-config (if (args:get-arg "-config")(args:get-arg "-config") "megatest.config") environ-patt: "env-override" given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") pathenvvar: "MT_RUN_AREA_HOME")) (set! *configdat* (if (car *configinfo*)(car *configinfo*) #f)) (set! *toppath* (if (car *configinfo*)(cadr *configinfo*) #f)) (let* ((tmptransport (configf:lookup *configdat* "server" "transport")) (transport (if tmptransport (string->symbol tmptransport) 'http))) (if (member transport '(http rpc nmsg)) (set! *transport-type* transport) (begin (debug:print 0 "ERROR: Unrecognised transport " transport) (exit)))) (let ((linktree (configf:lookup *configdat* "setup" "linktree"))) ;; link tree is critical (if linktree (if (not (file-exists? linktree)) (begin (handle-exceptions exn (begin |
︙ | ︙ |
Modified megatest.scm from [16af3583d8] to [f155a05495].
︙ | ︙ | |||
281 282 283 284 285 286 287 288 289 290 291 | "-q" ;; quiet 0, errors/warnings only ) args:arg-hash 0)) ;; The watchdog is to keep an eye on things like db sync etc. ;; (define *watchdog* (make-thread (lambda () (thread-sleep! 0.05) ;; delay for startup | > | > > | 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 | "-q" ;; quiet 0, errors/warnings only ) args:arg-hash 0)) ;; The watchdog is to keep an eye on things like db sync etc. ;; (define *time-zero* (current-seconds)) (define *watchdog* (make-thread (lambda () (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (configf:lookup *configdat* "setup" "megatest-db")) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds))) (let loop () ;; sync for filesystem local db writes ;; (let ((start-time (current-seconds)) (servers-started (make-hash-table))) (for-each (lambda (run-id) |
︙ | ︙ | |||
308 309 310 311 312 313 314 | (debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds"))) ;; (if (> sync-time 10) ;; took more than ten seconds, start a server for this run ;; (begin ;; (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id) ;; (server:kind-run run-id))))) (hash-table-delete! *db-local-sync* run-id))) (mutex-unlock! *db-multi-sync-mutex*)) | | > > > > > | | | 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 | (debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds"))) ;; (if (> sync-time 10) ;; took more than ten seconds, start a server for this run ;; (begin ;; (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id) ;; (server:kind-run run-id))))) (hash-table-delete! *db-local-sync* run-id))) (mutex-unlock! *db-multi-sync-mutex*)) (hash-table-keys *db-local-sync*)) (if (and debug-mode (> (- start-time last-time) 14)) (begin (set! last-time start-time) (debug:print-info 0 "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) ;; keep going unless time to exit ;; (if (not *time-to-exit*) (begin (thread-sleep! 5) ;; wait five seconds before syncing again, we'll also sync on exit (loop))))) "Watchdog thread"))) (thread-start! *watchdog*) (if (args:get-arg "-log") (let ((oup (open-output-file (args:get-arg "-log")))) |
︙ | ︙ |
Modified rmt.scm from [7ce19b3436] to [fbd1a17708].
︙ | ︙ | |||
62 63 64 65 66 67 68 69 70 | cinfo ;; NB// can cache the answer for server running for 10 seconds ... ;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id)) (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) (client:setup run-id) #f)))) (define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected ;; clean out old connections | > | | > > > | | | > > > | 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | cinfo ;; NB// can cache the answer for server running for 10 seconds ... ;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id)) (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) (client:setup run-id) #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id (define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected ;; clean out old connections ;; (mutex-lock! *db-multi-sync-mutex*) ;; (let ((expire-time (- (current-seconds) 60))) ;; (for-each ;; (lambda (run-id) ;; (let ((connection (hash-table-ref/default *runremote* run-id #f))) ;; (if (and connection ;; (< (http-transport:server-dat-get-last-access connection) expire-time)) ;; (begin ;; (debug:print-info 0 "Discarding connection to server for run-id " run-id ", too long between accesses") ;; ;; SHOULD CLOSE THE CONNECTION HERE ;; (hash-table-delete! *runremote* run-id))))) ;; (hash-table-keys *runremote*))) ;; (mutex-unlock! *db-multi-sync-mutex*) ;; (mutex-lock! *send-receive-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (rmt:get-connection-info run-id))) ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) (if connection-info ;; use the server if have connection info (let* ((dat (case *transport-type* ((http)(http-transport:client-api-send-receive run-id connection-info cmd params)) ((nmsg)(condition-case (nmsg-transport:client-api-send-receive run-id connection-info cmd params) ((timeout)(vector #f "timeout talking to server")))) (else (exit)))) (success (if (and dat (vector? dat)) (vector-ref dat 0) #f)) (res (if (and dat (vector? dat)) (vector-ref dat 1) #f))) (http-transport:server-dat-update-last-access connection-info) (if success (begin ;; (mutex-unlock! *send-receive-mutex*) (case *transport-type* ((http) res) ;; (db:string->obj res)) ((nmsg) res))) ;; (vector-ref res 1))) (begin ;; let ((new-connection-info (client:setup run-id))) (debug:print 0 "WARNING: Communication failed, trying call to rmt:send-receive again.") ;; (case *transport-type* ;; ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info)))) (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection (if (eq? (modulo attemptnum 5) 0) (tasks:kill-server-run-id run-id tag: "api-send-receive-failed")) ;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications (tasks:start-and-wait-for-server (tasks:open-db) run-id 15) ;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1)))))) ;; no longer killing the server in http-transport:client-api-send-receive ;; may kill it here but what are the criteria? ;; start with three calls then kill server ;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) ;; (thread-sleep! 2) (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))) ;; no connection info? try to start a server (if (and (< attemptnum 15) (tasks:need-server run-id)) (begin (hash-table-delete! *runremote* run-id) ;; (mutex-unlock! *send-receive-mutex*) (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) (client:setup run-id) (thread-sleep! (random 5)) ;; give some time to settle and minimize collison? (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) (begin (debug:print 0 "ERROR: Communication failed!") ;; (mutex-unlock! *send-receive-mutex*) (exit) ;; (rmt:open-qry-close-locally cmd run-id params)))) ))))) (define (rmt:update-db-stats run-id rawcmd params duration) (mutex-lock! *db-stats-mutex*) (handle-exceptions |
︙ | ︙ | |||
333 334 335 336 337 338 339 | (debug:print "ERROR: rmt:get-tests-for-run called with bad run-id=" run-id) (print-call-chain (current-error-port)) '()))) ;; IDEA: Threadify these - they spend a lot of time waiting ... ;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) | > | | > > > > > > > > > > | | | > > > > > > > > > > > > > | 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 | (debug:print "ERROR: rmt:get-tests-for-run called with bad run-id=" run-id) (print-call-chain (current-error-port)) '()))) ;; IDEA: Threadify these - they spend a lot of time waiting ... ;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) (let ((multi-run-mutex (make-mutex)) (run-id-list (if run-ids run-ids (rmt:get-all-run-ids))) (result '())) (if (null? run-id-list) '() (for-each (lambda (th) (thread-join! th)) ;; I assume that joining completed threads just moves on (let loop ((hed (car run-id-list)) (tal (cdr run-id-list)) (threads '())) (let* ((newthread (make-thread (lambda () (let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in)))) (if (list? res) (begin (mutex-lock! multi-run-mutex) (set! result (append result res)) (mutex-unlock! multi-run-mutex)) (debug:print 0 "ERROR: get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in)))) (conc "multi-run-thread for run-id " hed))) (newthreads (cons newthread threads))) (thread-start! newthread) (thread-sleep! 0.5) ;; give that thread some time to start (if (null? tal) newthreads (loop (car tal)(cdr tal) newthreads)))))) result)) (define (rmt:delete-test-records run-id test-id) (rmt:send-receive 'delete-test-records run-id (list run-id test-id))) ;; This is not needed as test steps are deleted on test delete call ;; ;; (define (rmt:delete-test-step-records run-id test-id) |
︙ | ︙ |
Modified tasks.scm from [42971aa928] to [1cac97501f].
︙ | ︙ | |||
359 360 361 362 363 364 365 | (define (tasks:need-server run-id) (let ((forced (configf:lookup *configdat* "server" "required")) (maxqry (cdr (rmt:get-max-query-average run-id))) (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) (cond (forced (if (common:low-noise-print 60 run-id "server required is set") | | | 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 | (define (tasks:need-server run-id) (let ((forced (configf:lookup *configdat* "server" "required")) (maxqry (cdr (rmt:get-max-query-average run-id))) (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) (cond (forced (if (common:low-noise-print 60 run-id "server required is set") (debug:print-info 0 "Server required is set, starting server for run-id " run-id ".")) #t) ((> maxqry threshold) (if (common:low-noise-print 60 run-id "Max query time execeeded") (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, starting server.")) #t) (else #f)))) |
︙ | ︙ |
Modified tests/fullrun/megatest.config from [26d5518426] to [ebc04ead74].
︙ | ︙ | |||
29 30 31 32 33 34 35 | waivercommentpatt ^WW\d+ [a-z].* incomplete-timeout 1 # yes, anything else is no run-wait yes | < < < | 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | waivercommentpatt ^WW\d+ [a-z].* incomplete-timeout 1 # yes, anything else is no run-wait yes # If set to "default" the old code is used. Otherwise defaults to 200 or uses # numeric value given. # runqueue 20 # Default runtimelim 1d 1h 1m 10s |
︙ | ︙ | |||
122 123 124 125 126 127 128 129 130 131 132 133 134 135 | blah MAX_ALLOWED_LOAD 200 # XTERM [system xterm] # RUNDEAD [system exit 56] [server] synchronous 0 # If the server can't be started on this port it will try the next port until # it succeeds port 8080 # This server will keep running this number of hours after last access. | > > > > > > | 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | blah MAX_ALLOWED_LOAD 200 # XTERM [system xterm] # RUNDEAD [system exit 56] [server] # Use http instead of direct filesystem access transport http # transport fs # transport nmsg synchronous 0 # If the server can't be started on this port it will try the next port until # it succeeds port 8080 # This server will keep running this number of hours after last access. |
︙ | ︙ |