Changes In Branch integ-home Through [69d0abc279] Excluding Merge-Ins
This is equivalent to a diff from 4b3bf0b62b to 69d0abc279
2017-02-09
| ||
21:02 | Merged v1.63 to trunk check-in: 31e9f07df0 user: matt tags: trunk | |
2016-12-16
| ||
14:43 | Automated merge of v1.63/ca7ed57c94/integ into integ-home check-in: d05283fa70 user: matt tags: integ-home | |
13:22 | Automated merge of v1.63/473da90c19/integ into integ-home check-in: 69d0abc279 user: matt tags: integ-home | |
13:08 | Fixed two bugs in getting and using cpuload check-in: 473da90c19 user: mrwellan tags: v1.63 | |
2016-12-15
| ||
12:40 | Automated merge of v1.63/c16914fba9/integ into integ-home check-in: 7afe215e9b user: matt tags: integ-home | |
2016-12-14
| ||
15:28 | merged work done on trunk accidentally check-in: 327a91c7af user: bjbarcla tags: v1.63 | |
2016-12-13
| ||
18:01 | Automated merge of trunk/4b3bf0b62b/integ into integ-home check-in: b5b44bddc1 user: matt tags: integ-home | |
17:17 | fixed a couple bugs in common:get-least-loaded-host check-in: 4b3bf0b62b user: bjbarcla tags: trunk | |
2016-12-12
| ||
14:11 | Added info to docs re. scriptinc. check-in: 36aa2d76fe user: mrwellan tags: trunk | |
Modified api.scm from [fe7a2f21be] to [97e3840c02].
︙ | ︙ | |||
237 238 239 240 241 242 243 244 245 246 247 248 249 250 | ((get-steps-data) (apply db:get-steps-data dbstruct params)) ((get-steps-for-test) (apply db:get-steps-for-test dbstruct params)) ;; TEST DATA ((read-test-data) (apply db:read-test-data dbstruct params)) ;; MISC ((have-incompletes?) (apply db:have-incompletes? dbstruct params)) ((login) (apply db:login dbstruct params)) ((general-call) (let ((stmtname (car params)) (run-id (cadr params)) (realparams (cddr params))) (db:with-db dbstruct run-id #t ;; these are all for modifying the db (lambda (db) | > | 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 | ((get-steps-data) (apply db:get-steps-data dbstruct params)) ((get-steps-for-test) (apply db:get-steps-for-test dbstruct params)) ;; TEST DATA ((read-test-data) (apply db:read-test-data dbstruct params)) ;; MISC ((get-latest-host-load) (apply db:get-latest-host-load dbstruct params)) ((have-incompletes?) (apply db:have-incompletes? dbstruct params)) ((login) (apply db:login dbstruct params)) ((general-call) (let ((stmtname (car params)) (run-id (cadr params)) (realparams (cddr params))) (db:with-db dbstruct run-id #t ;; these are all for modifying the db (lambda (db) |
︙ | ︙ |
Modified client.scm from [50265f350f] to [e8cb4a6c94].
︙ | ︙ | |||
44 45 46 47 48 49 50 | ok)) (define (client:connect iface port) (case (server:get-transport) ((rpc) (rpc:client-connect iface port)) ((http) (http:client-connect iface port)) ((zmq) (zmq:client-connect iface port)) | | > > | > > > > | | 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | ok)) (define (client:connect iface port) (case (server:get-transport) ((rpc) (rpc:client-connect iface port)) ((http) (http:client-connect iface port)) ((zmq) (zmq:client-connect iface port)) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (5)") (exit)))) (define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) (case (server:get-transport) ((rpc) (let ((res (client:setup-rpc run-id remaining-tries: remaining-tries))) (remote-conndat-set! *runremote* res) res)) ((http)(client:setup-http run-id remaining-tries: remaining-tries failed-connects: failed-connects)) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (6)") (exit)))) ;; (client:setup-rpc run-id)))) ;; (define (client:login-no-auto-setup server-info run-id) ;; (case (server:get-transport) ;; ((rpc) (rpc:login-no-auto-client-setup server-info run-id)) ;; ((http) (rmt:login-no-auto-client-setup server-info run-id)) ;; (else (rpc:login-no-auto-client-setup server-info run-id)))) ;; |
︙ | ︙ | |||
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | ;; 2. We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; (define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0)) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) (let* ((tdbdat (tasks:open-db))) (if (<= remaining-tries 0) (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) (exit 1)) (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if server-dat (let* ((iface (tasks:hostinfo-get-interface server-dat)) (hostname (tasks:hostinfo-get-hostname server-dat)) (port (tasks:hostinfo-get-port server-dat)) | > > > > > > > > > > > > > > > > > > > > < | < < < | < < < < < < | 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 | ;; 2. 
We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; (define (client:setup-rpc run-id #!key (remaining-tries 10) (failed-connects 0)) (debug:print-info 2 *default-log-port* "client:setup-rpc remaining-tries=" remaining-tries) (let* ((server-dat (tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id)) (num-available (tasks:num-in-available-state (db:delay-if-busy (tasks:open-db)) run-id))) (cond ((<= remaining-tries 0) (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) (exit 1)) (server-dat (debug:print-info 4 *default-log-port* "client:setup-rpc server-dat=" server-dat ", remaining-tries=" remaining-tries) (rpc-transport:client-setup run-id server-dat remaining-tries: remaining-tries)) (else (if (< num-available 2) (server:try-running run-id)) (thread-sleep! (+ 2 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. 
(client:setup run-id remaining-tries: (- remaining-tries 1)))))) (define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0)) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) (let* ((tdbdat (tasks:open-db))) (if (<= remaining-tries 0) (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) (exit 1)) (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if server-dat (let* ((iface (tasks:hostinfo-get-interface server-dat)) (hostname (tasks:hostinfo-get-hostname server-dat)) (port (tasks:hostinfo-get-port server-dat)) (start-res (http-transport:client-connect iface port)) (ping-res (rmt:login-no-auto-client-setup start-res))) (if (and start-res ping-res) (begin (remote-conndat-set! *runremote* start-res) ;; (hash-table-set! *runremote* run-id start-res) (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again |
︙ | ︙ | |||
207 208 209 210 211 212 213 | (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) (if (< num-available 2) (server:try-running run-id)) (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) | | | 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 | (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) (if (< num-available 2) (server:try-running run-id)) (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) ;; keep this as a function to ease future ;; this is unused, not porting for rpc -BB (define (client:start run-id server-info) (http-transport:client-connect (tasks:hostinfo-get-interface server-info) (tasks:hostinfo-get-port server-info))) ;; ;; client:signal-handler ;; (define (client:signal-handler signum) ;; (signal-mask! signum) |
︙ | ︙ |
Modified common.scm from [7404179285] to [d79db433da].
︙ | ︙ | |||
16 17 18 19 20 21 22 23 24 25 26 27 28 29 | (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) (declare (unit common)) (include "common_records.scm") ;; (require-library margs) ;; (include "margs.scm") ;; (define old-exit exit) ;; ;; (define (exit . code) | > > | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) (declare (unit common)) (include "common_records.scm") (include "thunk-utils.scm") ;; (require-library margs) ;; (include "margs.scm") ;; (define old-exit exit) ;; ;; (define (exit . code) |
︙ | ︙ | |||
58 59 60 61 62 63 64 | (mutex-lock! *context-mutex*) (let ((cxt (hash-table-ref/default *contexts* toppath #f))) (if (not cxt) (set! cxt (let ((x (make-cxt)))(hash-table-set! *contexts* toppath x) x))) (let ((cxt-mutex (cxt-mutex cxt))) (mutex-unlock! *context-mutex*) (mutex-lock! cxt-mutex) | > > > > > > | > > | > > | > | 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | (mutex-lock! *context-mutex*) (let ((cxt (hash-table-ref/default *contexts* toppath #f))) (if (not cxt) (set! cxt (let ((x (make-cxt)))(hash-table-set! *contexts* toppath x) x))) (let ((cxt-mutex (cxt-mutex cxt))) (mutex-unlock! *context-mutex*) (mutex-lock! cxt-mutex) ;; here we guard proc with exception handler so ;; no matter how proc succeeds or fails, ;; the cxt-mutex will be unlocked afterward. (let* ((EXCEPTION-SYMBOL (gensym)) ;; use a generated symbol (guarded-proc ;; to avoid collision (lambda args (let* ((res (condition-case (apply proc args) [x () (cons EXCEPTION-SYMBOL x)]))) (mutex-unlock! cxt-mutex) (if (and (pair? res) (eq? (car res) EXCEPTION)) (abort (cdr res)) res))))) (guarded-proc cxt))))) (define *db-keys* #f) (define *configinfo* #f) ;; raw results from setup, includes toppath and table from megatest.config (define *runconfigdat* #f) ;; run configs data (define *configdat* #f) ;; megatest.config data (define *configstatus* #f) ;; status of data; 'fulldata : all processing done, #f : no data yet, 'partialdata : partial read done |
︙ | ︙ | |||
100 101 102 103 104 105 106 | (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) (define *db-cache-path* #f) ;; SERVER (define *my-client-signature* #f) | | > > > > > > > > > > > | 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) (define *db-cache-path* #f) ;; SERVER (define *my-client-signature* #f) (define *transport-type* #f) ;; override with [server] transport http|rpc|nmsg (define *DEFAULT-TRANSPORT* "http") (define (common:set-transport-type) (set! *transport-type* (string->symbol (or (args:get-arg "-transport") (configf:lookup *configdat* "server" "transport") *DEFAULT-TRANSPORT*))) *transport-type*) (define *runremote* #f) ;; if set up for server communication this will hold <host port> (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *server-id* #f) (define *server-info* #f) (define *time-to-exit* #f) (define *server-run* #t) |
︙ | ︙ | |||
607 608 609 610 611 612 613 614 615 616 617 618 619 620 | (thread-sleep! 1) (delay-loop (+ count 1)))) (loop))) (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*))))))) (define (std-exit-procedure) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") (if (and no-hurry (debug:debug-mode 18)) | > | 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 | (thread-sleep! 1) (delay-loop (+ count 1)))) (loop))) (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*))))))) (define (std-exit-procedure) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") (if (and no-hurry (debug:debug-mode 18)) |
︙ | ︙ | |||
635 636 637 638 639 640 641 642 643 644 645 646 647 648 | (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") (if no-hurry (thread-sleep! 5) ;; give the clean up few seconds to do it's stuff (thread-sleep! 2)) (debug:print 4 *default-log-port* " ... done") ) "clean exit"))) (thread-start! th1) (thread-start! th2) (thread-join! th1)))) (define (std-signal-handler signum) ;; (signal-mask! signum) (set! *time-to-exit* #t) | > > > > > > > > > | 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 | (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") (if no-hurry (thread-sleep! 5) ;; give the clean up few seconds to do it's stuff (thread-sleep! 2)) (debug:print 4 *default-log-port* " ... done") ) "clean exit"))) ;; let's try to clean up open sockets (if *runremote* (case (remote-transport *runremote*) ((http) #t) ((rpc) (rpc:close-all-connections!)) (else (debug:print-info 0 *default-log-port* "Transport "(remote-transport *runremote*)" not supported")))) (thread-start! th1) (thread-start! th2) (thread-join! th1)))) (define (std-signal-handler signum) ;; (signal-mask! signum) (set! *time-to-exit* #t) |
︙ | ︙ | |||
1082 1083 1084 1085 1086 1087 1088 | (with-input-from-pipe (conc "ssh " remote-host " cat /proc/loadavg") (lambda ()(list (read)(read)(read))))) (with-input-from-file "/proc/loadavg" (lambda ()(list (read)(read)(read)))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads | | > | 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 | (with-input-from-pipe (conc "ssh " remote-host " cat /proc/loadavg") (lambda ()(list (read)(read)(read))))) (with-input-from-file "/proc/loadavg" (lambda ()(list (read)(read)(read)))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. ;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; (define (common:get-normalized-cpu-load remote-host) (let ((data (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/loadavg;cat /proc/cpuinfo;echo end") read-lines) (append |
︙ | ︙ | |||
1141 1142 1143 1144 1145 1146 1147 | (define (common:unix-ping hostname) (let ((res (system (conc "ping -c 1 " hostname " > /dev/null")))) (eq? res 0))) ;; ideally put all this info into the db, no need to preserve it across moving homehost ;; | > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > < < < < < < < < < < | | | | | | | | < | | | < > | | | > > > > > > > > > > > > > > > > > > > | > | | > > | < < < < | | | > > > | 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 | (define (common:unix-ping hostname) (let ((res (system (conc "ping -c 1 " hostname " > /dev/null")))) (eq? res 0))) ;; ideally put all this info into the db, no need to preserve it across moving homehost ;; ;; return list of ;; ( reachable? 
cpuload update-time ) (define (common:get-host-info hostname) (let* ((loadinfo (rmt:get-latest-host-load hostname)) (load (car loadinfo)) (load-sample-time (cdr loadinfo)) (load-sample-age (- (current-seconds) load-sample-time)) (loadinfo-timeout-seconds 20) (host-last-update-timeout-seconds 10) (host-rec (hash-table-ref/default *host-loads* hostname #f)) ) (cond ((< load-sample-age loadinfo-timeout-seconds) ;;(print "BB> chr - 1") (list #t load-sample-time load)) ((and host-rec (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) ;;(print "BB> chr - 2") (list #t (host-last-update host-rec) (host-last-cpuload host-rec ))) ((common:unix-ping hostname) ;;(print "BB> chr - 3 host-rec="host-rec" lu="(if host-rec (- (current-seconds) (host-last-update host-rec)) "None")) (list #t (current-seconds) (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) (else (list #f 0 -1))))) (define (common:update-host-loads-table hosts-raw) (let* ((hosts (filter (lambda (x) (string-match (regexp "^\\S+$") x)) hosts-raw))) (for-each (lambda (hostname) (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) (if h h (let ((h (make-host))) (hash-table-set! *host-loads* hostname h) h)))) (host-info (common:get-host-info hostname)) (is-reachable (car host-info)) (last-reached-time (cadr host-info)) (load (caddr host-info))) (host-reachable-set! rec is-reachable) (host-last-update-set! rec last-reached-time) (host-last-cpuload-set! rec load))) hosts))) (define (common:get-least-loaded-host hosts-raw) (let* ((hosts (filter (lambda (x) (string-match (regexp "^\\S+$") x)) hosts-raw)) (best-host #f) (best-load 99999) (curr-time (current-seconds))) (common:update-host-loads-table hosts) (for-each (lambda (hostname) (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) (if h h (let ((h (make-host))) (hash-table-set! 
*host-loads* hostname h) h)))) (reachable (host-reachable rec)) (load (host-last-cpuload rec))) (cond ((not reachable) #f) ((< (+ load (/ (random 250) 1000)) ;; add a random factor to keep from getting in a rut (+ best-load (/ (random 250) 1000)) ) (set! best-load load) (set! best-host hostname))))) hosts) best-host)) (define (common:wait-for-cpuload maxload numcpus waitdelay #!key (count 1000) (msg #f)(remote-host #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload numcpus)) (loadjmp (- first next))) |
︙ | ︙ |
Modified db.scm from [b8a881530e] to [eba0b31003].
︙ | ︙ | |||
1495 1496 1497 1498 1499 1500 1501 | ;; (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim ;; incompleted)) (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE") | > > > > > > > | | | | | | | | | | | | | | | | | 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 | ;; (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim ;; incompleted)) (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE") (for-each (lambda (test-id) (db:test-set-status-state dbstruct run-id test-id "COMPLETE" "DEAD" "Test failed to complete")) all-ids)))))) ;; ALL REPLACED BY THE BLOCK ABOVE ;; ;; (sqlite3:execute ;; db ;; (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? 
AND id IN (" ;; (string-intersperse (map conc all-ids) ",") ;; ");") ;; run-id)))) ;; ;; ;; Now do rollups for the toplevel tests ;; ;; ;; ;; (db:delay-if-busy dbdat) ;; (for-each ;; (lambda (toptest) ;; (let ((test-name (list-ref toptest 3))) ;; ;; (run-id (list-ref toptest 5))) ;; (db:top-test-set-per-pf-counts dbstruct run-id test-name))) ;; toplevels))) ;; BUG: Probably broken - does not explicitly use run-id in the query ;; (define (db:top-test-set-per-pf-counts dbstruct run-id test-name) (db:general-call (db:get-db dbstruct run-id) 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name))) |
︙ | ︙ | |||
3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 | (sqlite3:for-each-row (lambda (state status count) (set! res (cons (vector state status count) res))) db "SELECT state,status,count(state) FROM tests WHERE run_id=? AND testname=? AND item_path='' GROUP BY state,status;" run-id testname) res)) (define (db:set-top-level-from-items dbstruct run-id testname) (let* ((dbdat (db:get-db dbstruct run-id)) (db (db:dbdat-get-db dbdat)) (summ (db:get-state-status-summary db run-id testname)) (find (lambda (state status) (if (null? summ) | > > > > > > > > > > > > > > > > > > > | 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 | (sqlite3:for-each-row (lambda (state status count) (set! res (cons (vector state status count) res))) db "SELECT state,status,count(state) FROM tests WHERE run_id=? AND testname=? AND item_path='' GROUP BY state,status;" run-id testname) res)) (define (db:get-latest-host-load dbstruct raw-hostname) (let* ((hostname (string-substitute "\\..*$" "" raw-hostname)) (res (cons -1 0)) (mydb (db:dbdat-get-db (db:get-db dbstruct 0))) ) (print "BB> hostname="hostname" raw-hostname="raw-hostname" dbstruct="dbstruct" mydb="mydb) (db:with-db dbstruct 0 #f (lambda (db) (sqlite3:for-each-row (lambda (cpuload update-time) (set! res (cons cpuload update-time))) db "SELECT tr.cpuload, tr.update_time FROM test_rundat tr, tests t WHERE t.host=? AND tr.cpuload != -1 AND tr.test_id=t.id ORDER BY tr.update_time DESC LIMIT 1;" hostname))) res )) (define (db:set-top-level-from-items dbstruct run-id testname) (let* ((dbdat (db:get-db dbstruct run-id)) (db (db:dbdat-get-db dbdat)) (summ (db:get-state-status-summary db run-id testname)) (find (lambda (state status) (if (null? summ) |
︙ | ︙ |
Modified http-transport.scm from [4d8eecbf3a] to [b18d5a6f65].
︙ | ︙ | |||
82 83 84 85 86 87 88 | (res #f)) (cond ((equal? (uri-path (request-uri (current-request))) '(/ "api")) (send-response body: (api:process-request *dbstruct-db* $) ;; the $ is the request vars proc headers: '((content-type text/plain))) (mutex-lock! *heartbeat-mutex*) | | | 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | (res #f)) (cond ((equal? (uri-path (request-uri (current-request))) '(/ "api")) (send-response body: (api:process-request *dbstruct-db* $) ;; the $ is the request vars proc headers: '((content-type text/plain))) (mutex-lock! *heartbeat-mutex*) (set! *db-lastaccess* (current-seconds)) (mutex-unlock! *heartbeat-mutex*)) ((equal? (uri-path (request-uri (current-request))) '(/ "")) (send-response body: (http-transport:main-page))) ((equal? (uri-path (request-uri (current-request))) '(/ "json_api")) (send-response body: (http-transport:main-page))) |
︙ | ︙ | |||
333 334 335 336 337 338 339 | ;; ;; connect ;; (define (http-transport:client-connect iface port) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) | | | 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 | ;; ;; connect ;; (define (http-transport:client-connect iface port) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) (server-dat (vector iface port api-uri api-url api-req (current-seconds) 'http))) server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; (define (http-transport:keep-running server-id run-id) ;; if none running or if > 20 seconds since |
︙ | ︙ | |||
396 397 398 399 400 401 402 | (begin (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") (server:write-dotserver *toppath* (conc iface ":" port)) | | | | 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 | (begin (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") (server:write-dotserver *toppath* (conc iface ":" port)) (server:dotserver-starting-remove)) (begin ;; gotta exit nicely (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") (http-transport:server-shutdown server-id port)))))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) |
︙ | ︙ | |||
518 519 520 521 522 523 524 | (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) | < | < < | | | > | 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 | (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) (server:dotserver-starting) (let* ((tdbdat (tasks:open-db))) (set! *run-id* run-id) (if (args:get-arg "-daemonize") (begin (daemon:ize) (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it (begin (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) (if (and (server:read-dotserver *toppath*) (server:check-if-running run-id)) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0)) (begin ;; ok, no server detected, clean out any lingering records (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin (thread-sleep! 2) (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http) (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") (server:dotserver-starting-remove) )) (let* ((th2 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server run thread started") (http-transport:run (if (args:get-arg "-server") (args:get-arg "-server") "-") |
︙ | ︙ | |||
636 637 638 639 640 641 642 | " ms</td></tr>" "<tr><td>Number non-cached queries</td> <td>" *number-non-write-queries* "</td></tr>" "<tr><td>Average non-cached time</td> <td>" (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms</td></tr>" | | | 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 | " ms</td></tr>" "<tr><td>Number non-cached queries</td> <td>" *number-non-write-queries* "</td></tr>" "<tr><td>Average non-cached time</td> <td>" (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms</td></tr>" "<tr><td>Last access</td><td>" (seconds->time-string *last-db-access*) "</td></tr>" "</table>"))) (mutex-unlock! *heartbeat-mutex*) res)) (define (http-transport:runs linkpath) (conc "<h3>Runs</h3>" (string-intersperse |
︙ | ︙ |
Modified launch.scm from [759f63b522] to [3726b1b6ee].
︙ | ︙ | |||
120 121 122 123 124 125 126 127 128 129 130 131 132 133 | (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f) ;; now launch the actual process (call-with-environment-variables (list (cons "PATH" (conc (get-environment-variable "PATH") ":."))) (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1") (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 (pid (process-run "/bin/bash" (list "-c" cmd)))) (rmt:test-set-top-process-pid run-id test-id pid) (let processloop ((i 0)) (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) (mutex-lock! m) (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) | > > > > > > > > > > > > | 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f) ;; now launch the actual process (call-with-environment-variables (list (cons "PATH" (conc (get-environment-variable "PATH") ":."))) (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1") (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 (pid (process-run "/bin/bash" (list "-c" cmd)))) (with-output-to-file "Makefile.ezsteps" (lambda () (print stepname ".log :") (print "\t" cmd) (if (file-exists? (conc stepname ".logpro")) (print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log")) (print) (print stepname " : " stepname ".log") (print)) #:append) (rmt:test-set-top-process-pid run-id test-id pid) (let processloop ((i 0)) (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) (mutex-lock! m) (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) (launch:einf-exit-status-set! 
exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) |
︙ | ︙ | |||
314 315 316 317 318 319 320 | (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) | | | | | 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 | (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory)))) (let ((new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (get-df (current-directory))) (delta (abs (- df disk-free)))) (if (> delta 200) ;; ignore changes under 200 Meg df #f)))) |
︙ | ︙ | |||
814 815 816 817 818 819 820 821 822 823 824 825 826 827 | (set! *runconfigdat* rdat) (set! *toppath* toppath) (set! *configstatus* 'partial)) (begin (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") (exit 2)))))) ;; additional house keeping (let* ((linktree (or (getenv "MT_LINKTREE") (if *configdat* (configf:lookup *configdat* "setup" "linktree") #f)))) (if linktree (begin (if (not (file-exists? linktree)) (begin (handle-exceptions | > | 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 | (set! *runconfigdat* rdat) (set! *toppath* toppath) (set! *configstatus* 'partial)) (begin (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") (exit 2)))))) ;; additional house keeping (common:set-transport-type) (let* ((linktree (or (getenv "MT_LINKTREE") (if *configdat* (configf:lookup *configdat* "setup" "linktree") #f)))) (if linktree (begin (if (not (file-exists? linktree)) (begin (handle-exceptions |
︙ | ︙ | |||
859 860 861 862 863 864 865 | (if disks (let ((res (common:get-disk-with-most-free-space disks minspace))) ;; min size of 1000, seems tad dumb (if res (cdr res) (begin (if (common:low-noise-print 20 "No valid disks or no disk with enough space") (debug:print-error 0 *default-log-port* "No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists and has enough space!\n You can change minspace in the [setup] section of megatest.config. Current setting is: " minspace)) | | | 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 | (if disks (let ((res (common:get-disk-with-most-free-space disks minspace))) ;; min size of 1000, seems tad dumb (if res (cdr res) (begin (if (common:low-noise-print 20 "No valid disks or no disk with enough space") (debug:print-error 0 *default-log-port* "No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists and has enough space!\n You can change minspace in the [setup] section of megatest.config. Current setting is: " minspace)) (exit 1))))))) ;; TODO - move the exit to the calling location and return #f ;; Desired directory structure: ;; ;; <linkdir> - <target> - <testname> -. ;; | ;; v ;; <rundir> - <target> - <testname> -|- <itempath(s)> |
︙ | ︙ | |||
1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 | ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (let* ((item-path (item-list->path itemdat))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (string->number (or (configf:lookup *configdat* "setup" "launch-delay") "5")))) (if (> launch-delay delta) (begin (debug:print-info 0 *default-log-port* "Delaying launch of " test-name " for " (- launch-delay delta) " seconds") (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) | > < > | | | | | > | | > > > | | 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 | ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! 
*launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ((item-path (item-list->path itemdat))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (string->number (or (configf:lookup *configdat* "setup" "launch-delay") "5")))) (if (> launch-delay delta) (begin (debug:print-info 0 *default-log-port* "Delaying launch of " test-name " for " (- launch-delay delta) " seconds") (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) (change-directory *toppath*) (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) (append (list (list "MT_RUN_AREA_HOME" *toppath*) (list "MT_TEST_NAME" test-name) (list "MT_RUNNAME" runname) (list "MT_ITEMPATH" item-path) ) itemdat)) (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed ;; for tconfig, why do we allow fallback to test-conf? (tconfig (or (tests:get-testconfig test-name tregistry #t force-create: #t) (begin (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") test-conf))) ;; force re-read now that all vars are set (useshell (let ((ush (config-lookup *configdat* "jobtools" "useshell"))) (if ush (if (equal? ush "no") ;; must use "no" to NOT use shell #f ush) #t))) ;; default is yes (runscript (config-lookup tconfig "setup" "runscript")) |
︙ | ︙ | |||
1110 1111 1112 1113 1114 1115 1116 | (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) (testinfo (rmt:get-test-info-by-id run-id test-id)) (mt_target (string-intersperse (map cadr keyvals) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '())))) | < > | 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 | (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) (testinfo (rmt:get-test-info-by-id run-id test-id)) (mt_target (string-intersperse (map cadr keyvals) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '())))) ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) (if launcher (set! launcher (string-split launcher))) ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! 
run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "LAUNCHED" #f) ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk *configdat* tconfig)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 *default-log-port* "Using work area " work-area)) (begin |
︙ | ︙ | |||
1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 | (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" ""))))) (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) (debug:print 1 *default-log-port* "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done (debug:print 4 *default-log-port* "fullcmd: " fullcmd) (let* ((commonprevvals (alist->env-vars (hash-table-ref/default *configdat* "env-override" '()))) (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" (append (list (list "MT_TEST_RUN_DIR" work-area) (list "MT_TEST_NAME" test-name) (list "MT_ITEM_INFO" (conc itemdat)) (list "MT_RUNNAME" runname) | > | 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 | (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" ""))))) (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) (debug:print 1 *default-log-port* "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done (debug:print 4 *default-log-port* "fullcmd: " fullcmd) (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible. 
(let* ((commonprevvals (alist->env-vars (hash-table-ref/default *configdat* "env-override" '()))) (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" (append (list (list "MT_TEST_RUN_DIR" work-area) (list "MT_TEST_NAME" test-name) (list "MT_ITEM_INFO" (conc itemdat)) (list "MT_RUNNAME" runname) |
︙ | ︙ | |||
1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 | (if launchwait cmdstr (conc cmdstr " >> mt_launch.log 2>&1"))) (car fullcmd)) (if useshell '() (cdr fullcmd))))) (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) (if (list? launch-results) (apply print launch-results) | > | 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 | (if launchwait cmdstr (conc cmdstr " >> mt_launch.log 2>&1"))) (car fullcmd)) (if useshell '() (cdr fullcmd))))) (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) (if (list? launch-results) (apply print launch-results) |
︙ | ︙ |
Modified megatest.scm from [da4e664704] to [37f657694c].
︙ | ︙ | |||
325 326 327 328 329 330 331 332 333 334 335 336 337 338 | "-q" ;; quiet 0, errors/warnings only ) args:arg-hash 0)) ;; Add args that use remargs here ;; (if (and (not (null? remargs)) (not (or (args:get-arg "-runstep") (args:get-arg "-envcap") (args:get-arg "-envdelta") ) )) | > | 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 | "-q" ;; quiet 0, errors/warnings only ) args:arg-hash 0)) ;; Add args that use remargs here ;; (if (and (not (null? remargs)) (not (or (args:get-arg "-runstep") (args:get-arg "-envcap") (args:get-arg "-envdelta") ) )) |
︙ | ︙ | |||
697 698 699 700 701 702 703 | (if (args:get-arg "-server") ;; Server? Start up here. ;; (let ((tl (launch:setup)) ;; (run-id (and (args:get-arg "-run-id") ;; (string->number (args:get-arg "-run-id")))) | | < < | 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 | (if (args:get-arg "-server") ;; Server? Start up here. ;; (let ((tl (launch:setup)) ;; (run-id (and (args:get-arg "-run-id") ;; (string->number (args:get-arg "-run-id")))) (transport-type *transport-type* )) (server:launch 0 transport-type) (set! *didsomething* #t))) ;; ;; (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) ;; ;; ;; Not a server? This section will decide how to communicate ;; ;; ;; ;; Setup client for all expect listed here |
︙ | ︙ | |||
1984 1985 1986 1987 1988 1989 1990 | (debug:print 0 *default-log-port* "Failed to create HTML output in " toppath "/lt/runs-index.html")) (set! *didsomething* #t))) ;;====================================================================== ;; Exit and clean up ;;====================================================================== | < | 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 | (debug:print 0 *default-log-port* "Failed to create HTML output in " toppath "/lt/runs-index.html")) (set! *didsomething* #t))) ;;====================================================================== ;; Exit and clean up ;;====================================================================== (if (not *didsomething*) (debug:print 0 *default-log-port* help)) (set! *time-to-exit* #t) (thread-join! *watchdog*) |
︙ | ︙ |
Modified rmt.scm from [2632e87e3e] to [b80fb1b75c].
1 2 3 4 5 6 7 8 9 10 | ;;====================================================================== ;; Copyright 2006-2013, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;;====================================================================== | | > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | ;;====================================================================== ;; Copyright 2006-2013, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;;====================================================================== ;; (use format typed-records) ;; RADT => purpose of json format?? (declare (unit rmt)) (declare (uses api)) (declare (uses tdb)) (declare (uses http-transport)) (declare (uses rpc-transport)) ;;(declare (uses nmsg-transport)) (include "common_records.scm") ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;; |
︙ | ︙ | |||
131 132 133 134 135 136 137 | ;; if not on homehost ensure we have a connection to a live server ;; NOTE: we *have* a homehost record by now ((and (not (cdr (remote-hh-dat *runremote*))) ;; are we on a homehost? (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) (tasks:start-and-wait-for-server (tasks:open-db) 0 15) | > > > > | > > > > > | | | 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | ;; if not on homehost ensure we have a connection to a live server ;; NOTE: we *have* a homehost record by now ((and (not (cdr (remote-hh-dat *runremote*))) ;; are we on a homehost? (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) (tasks:start-and-wait-for-server (tasks:open-db) 0 15) (let* ((cinfo (rmt:get-connection-info 0)) (transport (if cinfo (vector-ref cinfo 6) (server:get-transport)))) ;; TODO: replace with tasks:server-dat-accessor-?? for transport (remote-conndat-set! *runremote* cinfo) ;; calls client:setup which calls client:setup-http (remote-transport-set! *runremote* transport)) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; all set up if get this far, dispatch the query ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) ;; not on homehost, do server query (else (mutex-unlock! 
*rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") (let* ((conninfo (remote-conndat *runremote*)) (dat (case (remote-transport *runremote*) ((http) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away (http-transport:client-api-send-receive 0 conninfo cmd params) ((commfail)(vector #f "communications fail")) ((exn)(vector #f "other fail" (print-call-chain))))) ((rpc) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away (rpc-transport:client-api-send-receive 0 conninfo cmd params) ((commfail)(vector #f "communications fail")) ((exn)(vector #f "other fail" (print-call-chain))))) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (1)") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat) (if success (case (remote-transport *runremote*) ((http rpc) res) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " is unknown") (exit 1))) (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! *runremote* #f) (remote-server-url-set! *runremote* #f) |
︙ | ︙ | |||
261 262 263 264 265 266 267 268 269 270 | (mutex-lock! *db-multi-sync-mutex*) (set! *db-last-write* start-time) ;; the oldest "write" (mutex-unlock! *db-multi-sync-mutex*))))) res)) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) (res (handle-exceptions exn #f | > > | > > > > > > | 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 | (mutex-lock! *db-multi-sync-mutex*) (set! *db-last-write* start-time) ;; the oldest "write" (mutex-unlock! *db-multi-sync-mutex*))))) res)) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) (transport (or (remote-transport *runremote*) (server:get-transport))) (res (handle-exceptions exn #f (case transport ((http) (http-transport:client-api-send-receive run-id connection-info cmd params)) ((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params)) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (2)") (exit)) )))) (if (and res (vector-ref res 0)) (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! #f))) ;; ;; Wrap json library for strings (why the ports crap in the first place?) ;; (define (rmt:dat->json-str dat) ;; (with-output-to-string |
︙ | ︙ | |||
308 309 310 311 312 313 314 | (rmt:send-receive 'login run-id (list *toppath* megatest-version *my-client-signature*))) ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; (define (rmt:login-no-auto-client-setup connection-info) (case *transport-type* ;; run-id of 0 is just a placeholder | | > > > > > > > > > > | 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 | (rmt:send-receive 'login run-id (list *toppath* megatest-version *my-client-signature*))) ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; (define (rmt:login-no-auto-client-setup connection-info) (case *transport-type* ;; run-id of 0 is just a placeholder ((http rpc)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*))) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (3)") (exit)) ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*))) )) ;; hand off a call to one of the db:queries statements ;; added run-id to make looking up the correct db possible ;; (define (rmt:general-call stmtname run-id . params) (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params))) ;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host (define (rmt:get-latest-host-load hostname) (rmt:send-receive 'get-latest-host-load 0 (list hostname))) ;; (define (rmt:sync-inmem->db run-id) ;; (rmt:send-receive 'sync-inmem->db run-id '())) (define (rmt:sdb-qry qry val run-id) ;; add caching if qry is 'getid or 'getstr (rmt:send-receive 'sdb-qry run-id (list qry val))) |
︙ | ︙ |
Modified rpc-transport.scm from [7aa56cfddc] to [a3a8e96335].
1 |
| | | 1 2 3 4 5 6 7 8 9 | ;; Copyright 2006-2016, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. |
︙ | ︙ | |||
19 20 21 22 23 24 25 26 27 | (declare (uses common)) (declare (uses db)) (declare (uses tests)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. (include "common_records.scm") (include "db_records.scm") ;; procstr is the name of the procedure to be called as a string | > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > > > > | > > | > > > > > | > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > | > > | < < | > > | | > > > > > > | < | | > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | > > > > > | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > | | | > > > > > > > > > > > | > | < > > | > > > > > > > > > > | | > > > | | > > > > > > > > > > > > > > | > > | | | > | > > > > > > | > | > > | | | > > > > | | | | > > > > > > > > > > > > | < < < > > > > | > > > > > > > > | | | > > | > > > > > > | > > > > | > > > > > > > > > > > > > > | | | > > > > > > > > > > > > > > > > > > > > > > > > | | < | > | < > | | > | > > | > > > > > | > > | | > > | > > > > | > > > > > > > > > > > | | > | > > | > > > > > > > > > > | > > | > | | | > > > | > | | < | | | | < < < | < | > > | | | < < | < < < < < < < < < < < | < > | | > | > > | | | < | | < < < < < < < | > | < < < > | < < < < | > > > > | | | | | > > | > > > | | > | 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 
156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 | (declare (uses common)) (declare (uses db)) (declare 
(uses tests)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. (include "common_records.scm") (include "db_records.scm") (define *heartbeat-mutex* (make-mutex)) (define *server-loop-heart-beat* (current-seconds)) ;; procstr is the name of the procedure to be called as a string (define (rpc-transport:autoremote procstr params) ;; may be unused, I think api-exec deprecates this one. (let* ((procsym (if (symbol? procstr) procstr (string->symbol (->string procstr)))) (res (begin (apply (eval procsym) params)))) res)) ;; rpc receiver (define (rpc-transport:api-exec cmd params) (let* ( (resdat (api:execute-requests *dbstruct-db* (vector cmd params))) ;; #( flag result ) (flag (vector-ref resdat 0)) (res (vector-ref resdat 1))) (mutex-lock! *heartbeat-mutex*) (set! *last-db-access* (current-seconds)) ;; bump *last-db-access*; this will renew keep-running thread's lease on life for another (server:get-timeout) seconds ;;(BB> "in api-exec; last-db-access updated to "*last-db-access*) (mutex-unlock! *heartbeat-mutex*) res)) ;; retry an operation (depends on srfi-18) ;; ================== ;; idea here is to avoid spending time on coding retrying something. Trying to be generic here. ;; ;; Exception handling: ;; ------------------- ;; if evaluating the thunk results in exception, it will be retried. ;; on last try, if final-failure-returns-actual is true, the exception will be re-thrown to caller. ;; ;; look at options below #!key to see how to configure behavior ;; ;; (define (retry-thunk the-thunk #!key ;;;; options below (accept-result? 
(lambda (x) x)) ;; retry if predicate applied to thunk's result is false (retries 4) ;; how many tries (failure-value #f) ;; return this on final failure, unless following option is enabled: (final-failure-returns-actual #f) ;; on failure, on the last try, just return the result, not failure-value (retry-delay 0.1) ;; delay between tries (back-off-factor 1) ;; multiply retry-delay by this factor on retry (random-delay 0.1) ;; add a random portion of this value to wait (chatty #f) ;; print status as we go, for debugging. ) (when chatty (print) (print "Entered retry-thunk") (print "-=-=-=-=-=-")) (let* ((guarded-thunk ;; we are guarding the thunk against exceptions. We will record whether result of evaluation is an exception or a regular result. (lambda () (let* ((EXCEPTION (gensym)) ;; using gensym to avoid potential collision (res (condition-case (the-thunk) ;; this is what we are guarding the execution of [x () (cons EXCEPTION x)] ))) (cond ((and (pair? res) (eq? (car res) EXCEPTION)) (if chatty (print " - the-thunk threw exception >"(cdr res)"<")) (cons 'exception (cdr res))) (else (if chatty (print " - the-thunk returned result >"res"<")) (cons 'regular-result res))))))) (let loop ((guarded-res (guarded-thunk)) (retries-left retries) (fail-wait retry-delay)) (if chatty (print " ==========")) (let* ((wait-time (+ fail-wait (+ (* fail-wait back-off-factor) (* random-delay (/ (random 1024) 1024) )))) (res-type (car guarded-res)) (res-value (cdr guarded-res))) (cond ((and (eq? res-type 'regular-result) (accept-result? res-value)) (if chatty (print " + return result that satisfied accept-result? >"res-value"<")) res-value) ((> retries-left 0) (if chatty (print " - sleep "wait-time)) (thread-sleep! wait-time) (if chatty (print " + retry ["retries-left" tries left]")) (loop (guarded-thunk) (sub1 retries-left) wait-time)) ((eq? 
res-type 'regular-result) (if final-failure-returns-actual (begin (if chatty (print " + last try failed- return the result >"res-value"<")) res-value) (begin (if chatty (print " + last try failed- return canned failure value >"failure-value"<")) failure-value))) (else ;; no retries left; result was not accepted and res-type can only be 'exception (if final-failure-returns-actual (begin (if chatty (print " + last try failed with exception- re-throw it >"res-value"<")) (abort res-value)); re-raise the exception. TODO: find a way for call-history to show as though from entry to this function (begin (if chatty (print " + last try failed with exception- return canned failure value >"failure-value"<")) failure-value)))))))) (define (rpc-transport:server-shutdown server-id rpc:listener ) ;;#!key (from-on-exit #f)) ;;(on-exit (lambda () #t)) ;; turn off on-exit stuff ;;(tcp-close rpc:listener) ;; gotta exit nicely ;;(tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "stopped") ;; TODO: (low) the following is extraordinaritly slow. Maybe we don't even need portlogger for rpc anyway?? the exception-based failover when ports are taken is fast! ;;(portlogger:open-run-close portlogger:set-port (rpc:default-server-port) "released") (set! *time-to-exit* #t) ;;(if *dbstruct-db* (db:sync-touched *dbstruct-db* *run-id* force-sync: #t)) (server:remove-dotserver-file *toppath* "anyhost:anyport" force: #t) (tasks:server-delete-record (db:delay-if-busy (tasks:open-db)) server-id " rpc-transport:keep-running complete") (rpc:close-all-connections!) ;;(BB> "Before (exit) (from-on-exit="from-on-exit")") ;;(unless from-on-exit (exit)) ;; sometimes we hang (around) here with 100% cpu. 
;;(BB> "After") ;; strace reveals endless: ;; getrusage(RUSAGE_SELF, {ru_utime={413, 917868}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 9874}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 13874}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 105880}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 109880}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 201886}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 205886}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 297892}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 301892}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 393898}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 397898}, ru_stime={0, 60003}, ...}) = 0 ;; make a post to chicken-users w/ http://paste.call-cc.org/paste?id=60a4b66a29ccf7d11359ea866db642c970735978 ;; (if from-on-exit ;; ;; avoid above condition! End current process externally since 1 in 20 (exit)'s result in hung, 100% cpu zombies. (see above) (system (conc "kill -9 "(current-process-id))) ) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (rpc-transport:launch run-id) (set! 
*run-id* run-id) ;; ;; send to background if requested ;; (when (args:get-arg "-daemonize") ;; (daemon:ize) ;; (when *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it ;; (current-error-port *alt-log-file*) ;; (current-output-port *alt-log-file*))) ;; double check we dont alrady have a running server for this run-id (when (and (server:read-dotserver *toppath*) (server:check-if-running run-id)) (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0)) ;; did not find server running, let's clean up the table of dead servers (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy (tasks:open-db)) run-id "notresponding") (server:dotserver-starting) ;; let's get a server-id for this server ;; if at first we do not suceed, try 3 more times. (let ((server-id (retry-thunk (lambda () (tasks:server-lock-slot (db:delay-if-busy (tasks:open-db)) run-id 'rpc)) chatty: #f final-failure-returns-actual: #t retries: 4))) (when (not server-id) ;; dang we couldn't get a server-id. ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy (tasks:open-db)) " rpc-transport:launch") (server:dotserver-starting-remove) (exit 1)) ;; we got a server-id (and a corresponding entry in servers table in globally shared mdb) ;; all systems go. Proceed to setup rpc server. 
(rpc-transport:run (if (args:get-arg "-server") (args:get-arg "-server") "-") run-id server-id) (exit))) (define *rpc-listener-port* #f) (define *rpc-listener-port-bind-timestamp* #f) (define *on-exit-flag #f) (define (rpc-transport:server-dat-get-iface vec) (vector-ref vec 0)) (define (rpc-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (rpc-transport:server-dat-get-last-access vec) (vector-ref vec 5)) (define (rpc-transport:server-dat-get-transport vec) (vector-ref vec 6)) (define (rpc-transport:server-dat-update-last-access vec) (if (vector? vec) (vector-set! vec 5 (current-seconds)) (begin (print-call-chain (current-error-port)) (debug:print-error 0 *default-log-port* "call to rpc-transport:server-dat-update-last-access with non-vector!!")))) (define *api-exec-ht* (make-hash-table)) (define *api-exec-mutex* (make-mutex)) ;; let's see if caching the rpc stub curbs thread-profusion on server side (define (rpc-transport:get-api-exec iface port) (mutex-lock! *api-exec-mutex*) (let* ((lu (hash-table-ref/default *api-exec-ht* (cons iface port) #f))) (if lu (begin (mutex-unlock! *api-exec-mutex*) lu) (let ((res (rpc:procedure 'api-exec iface port))) (hash-table-set! *api-exec-ht* (cons iface port) res) (mutex-unlock! *api-exec-mutex*) res)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; this client-side procedure makes rpc call to server and returns result ;; (define (rpc-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3)) ;;(BB> "entered rpc-transport:client-api-send-receive with run-id="run-id " serverdat="serverdat" cmd="cmd" params="params" numretries="numretries) (if (not (vector? serverdat)) (begin (BB> "WHAT?? for run-id="run-id", serverdat="serverdat) (print-call-chain) (rpc:close-all-connections!) (exit 1))) (let* ((iface (rpc-transport:server-dat-get-iface serverdat)) (port (rpc-transport:server-dat-get-port serverdat)) (res #f) (api-exec (rpc-transport:get-api-exec iface port)) ;; chached by host/port. 
may need to clear... (send-receive (lambda () (tcp-buffer-size 0) (set! res (retry-thunk (lambda () (condition-case ;;(vector #t (run-remote cmd params)) (vector 'success (api-exec cmd params)) [x (exn i/o net) (vector 'comms-fail (conc "communications fail ["(->string x)"]") x)] [x () (vector 'other-fail "other fail ["(->string x)"]" x)])) chatty: #f accept-result?: (lambda(x) (and (vector? x) (vector-ref x 0))) retries: 8 back-off-factor: 1.5 random-wait: 0.2 retry-delay: 0.1 final-failure-returns-actual: #t)) ;;(BB> "HEY res="res) res )) (th1 (make-thread send-receive "send-receive")) (time-out-reached #f) (time-out (lambda () (thread-sleep! 45) (set! time-out-reached #t) (thread-terminate! th1) #f)) (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) (thread-terminate! th2) ;;(BB> "alt got res="res) (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) (case (vector-ref res 0) ((success) (vector #t (vector-ref res 1))) ( (comms-fail other-fail) ;;(comms-fail) (debug:print 0 *default-log-port* "WARNING: comms failure for rpc request >>"res"<<") ;;(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector-ref res 1))) (else (debug:print-error 0 *default-log-port* "error occured at server, info=" (vector-ref res 1)) (debug:print 0 *default-log-port* " client call chain:") (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " server call chain:") (pp (vector-ref res 1) (current-error-port)) (signal (vector-ref res 2)))) (signal (make-composite-condition (make-property-condition 'timeout 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))) (define (rpc-transport:run hostn run-id server-id) (debug:print 2 *default-log-port* "Attempting to start the rpc server ...") ;; (trace rpc:publish-procedure!) 
;;====================================================================== ;; start of publish-procedure section ;;====================================================================== (rpc:publish-procedure! 'server:login server:login) ;; this allows client to validate it is the same megatest instance as the server. No security here, just making sure we're in the right room. (rpc:publish-procedure! 'testing (lambda () "Just testing")) ;; procedure to receive arbitrary API request from client's rpc:send-receive/rpc-transport:client-api-send-receive (rpc:publish-procedure! 'rpc-transport:autoremote rpc-transport:autoremote) ;; can use this to run most anything at the remote (rpc:publish-procedure! 'api-exec rpc-transport:api-exec) ;;====================================================================== ;; end of publish-procedure section ;;====================================================================== (let* ((db #f) (hostname (let ((res (get-host-name))) res)) (server-start-time (current-seconds)) (server-timeout (server:get-timeout)) (ipaddrstr (let* ((ipstr (if (string=? "-" hostn) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") (server:get-best-guess-address hostname) #f)) (res (if ipstr ipstr hostn))) res)) ;; hostname))) (start-port (let ((res (portlogger:open-run-close portlogger:find-port))) ;; BB> TODO: remove portlogger! res)) (link-tree-path (configf:lookup *configdat* "setup" "linktree")) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rpc:listener is the tcp-listen result from inside the find-free-port-and-open complex. ;; It is our handle on the listening tcp port ;; We will attach this to our rpc server with rpc:make-server in thread th1 . 
(rpc:listener (rpc-transport:find-free-port-and-open start-port)) (th1 (make-thread (lambda () ;;(BB> "BEFORE rpc:make-server") ((rpc:make-server rpc:listener) #t) ;;(BB> "BEFORE rpc:make-server") ) "rpc:server")) (hostname (if (string=? "-" hostn) (get-host-name) hostn)) (ipaddrstr (if (string=? "-" hostn) (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") (string-intersperse (map number->string (u8vector->list (hostname->ip hostn))) ".") )) (portnum (let ((res (rpc:default-server-port))) res)) (host:port (conc (if ipaddrstr ipaddrstr hostname) ":" portnum))) (when (not (equal? ipaddrstr (server:get-best-guess-address (get-host-name)))) (debug:print 0 *default-log-port* "Error: This host "(ip->string (hostname->ip (get-host-name)))" ("(get-host-name)") is not the homehost "ipaddrstr" ("(ip->hostname (string->ip ipaddrstr))"; Cannot proceed.") (server:dotserver-starting-remove) (tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port (exit)) (tasks:server-set-interface-port (db:delay-if-busy (tasks:open-db)) server-id ipaddrstr portnum) ;;============================================================ ;; activate thread th1 to attach opened tcp port to rpc server ;;============================================================= (thread-start! th1) (set! db *dbstruct-db*) (debug:print 0 *default-log-port* "Server started on " host:port) ;;(BB> "before SELF-TEST") (if (retry-thunk (lambda () (rpc-transport:self-test run-id ipaddrstr portnum)) final-failure-returns-actual: #t ;; TODO: remove this line ) (debug:print 0 *default-log-port* "INFO: rpc self test passed!") (begin (debug:print 0 *default-log-port* "Error: rpc listener did not pass self test. Shutting down. On: " host:port) (tasks:server-set-state! 
(db:delay-if-busy (tasks:open-db)) server-id "dead") (tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port (rpc-transport:server-shutdown server-id rpc:listener) (server:dotserver-starting-remove) (exit))) ;;(on-exit (lambda () ;; (rpc-transport:server-shutdown server-id rpc:listener from-on-exit: #t))) ;; check again for running servers for this run-id in case one has snuck in since we checked last in rpc-transport:launch (if (not (equal? server-id (tasks:server-am-i-the-server? (db:delay-if-busy (tasks:open-db)) run-id)));; try to ensure no double registering of servers (begin ;; i am not the server, another server snuck in and beat this one to the punch (tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port (tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "collision") (server:dotserver-starting-remove)) (begin ;; i am the server ;; setup the in-memory db (set! *dbstruct-db* (db:setup run-id)) (db:get-db *dbstruct-db* run-id) ;; at this point, satisfied server has started ;; let's make it official (server:write-dotserver *toppath* (conc ipaddrstr ":" portnum)) (mutex-lock! *heartbeat-mutex*) (set! *last-db-access* (current-seconds)) (mutex-unlock! *heartbeat-mutex*) (set! *rpc:listener* rpc:listener) (tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "running") ;; update our mdb servers entry ;; this let loop will hold open this thread until we want the server to shut down. ;; if no requests received within the last 20 seconds : ;; database hasnt changed in ?? ;; ;; keep-running loop: polls last-db-access to see if we have timed out. keep running if not. (let loop ((count 0) (bad-sync-count 0)) (BB> "keep running: count = "count) ;; Use this opportunity to sync the inmemdb to db (let ((start-time (current-milliseconds)) (sync-time #f) (rem-time #f)) ;; following is now done in common:watchdog, commenting out. sync-time will now be 0; can live with that. 
;; ;; inmemddb is a dbstruct ;; (condition-case ;; (db:sync-touched *dbstruct-db* *run-id* force-sync: #t) ;; ((sync-failed)(cond ;; ((> bad-sync-count 10) ;; time to give up ;; (rpc-transport:server-shutdown server-id rpc:listener)) ;; (else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop ;; (thread-sleep! 5) ;; (loop count (+ bad-sync-count 1))))) ;; ((exn) ;; (debug:print-error 0 *default-log-port* "error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server ") ;; (rpc-transport:server-shutdown server-id rpc:listener))) (set! sync-time (- (current-milliseconds) start-time)) (set! rem-time (quotient (- 4000 sync-time) 1000)) (debug:print 4 *default-log-port* "SYNC: time= " sync-time ", rem-time=" rem-time) (if (and (<= rem-time 4) (> rem-time 0)) (thread-sleep! rem-time) (thread-sleep! 4))) ;; fallback for if the math is changed ... (if (< count 1) ;; 3x3 = 9 secs aprox (loop (+ count 1) bad-sync-count)) ;; BB: don't see how this is possible with RPC ;; ;; Check that iface and port have not changed (can happen if server port collides) ;; (mutex-lock! *heartbeat-mutex*) ;; (set! sdat *server-info*) ;; (mutex-unlock! *heartbeat-mutex*) ;; (if (or (not (equal? sdat (list iface port))) ;; (not server-id)) ;; (begin ;; (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info") ;; (set! iface (car sdat)) ;; (set! port (cadr sdat)))) ;; Transfer *last-db-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *last-db-access*) (mutex-unlock! 
*heartbeat-mutex*) ;; (debug:print 11 *default-log-port* "last-access=" last-access ", server-timeout=" server-timeout) ;; ;; no_traffic, no running tests, if server 0, no running servers ;; ;; (let ((wait-on-running (configf:lookup *configdat* "server" b"wait-on-running"))) ;; wait on running tasks (if not true then exit on time out) ;; (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (if (and *server-run* (> (+ last-access server-timeout) (current-seconds))) (begin (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) ;; ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; (if (tasks:server-am-i-the-server? (db:delay-if-busy (tasks:open-db)) run-id) (tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "running")) ;; (loop 0 bad-sync-count)) (begin ;;(BB> "SERVER SHUTDOWN CALLED! last-access="last-access" current-seconds="(current-seconds)" server-timeout="server-timeout) (rpc-transport:server-shutdown server-id rpc:listener))))) ;; end new loop )))) (define (rpc-transport:find-free-port-and-open port #!key ) (handle-exceptions exn (begin (print "Failed to bind to port " (rpc:default-server-port) ", trying next port") (rpc-transport:find-free-port-and-open (add1 port))) (rpc:default-server-port port) (set! *rpc-listener-port* port) ;; a bit paranoid about rpc:default-server-port parameter not changing across threads (as params are wont to do). keeping this global in my back pocket in case this causes problems (set! 
*rpc-listener-port-bind-timestamp* (current-milliseconds)) ;; may want to test how long it has been since the last bind attempt happened... (tcp-read-timeout 240000) (tcp-buffer-size 0) ;; gotta do this because http-transport undoes it. (tcp-listen (rpc:default-server-port) 10000) )) (define (rpc-transport:ping run-id host port) (handle-exceptions exn (begin (print "SERVER_NOT_FOUND exn="exn) (exit 1)) (let ((login-res ((rpc:procedure 'server:login host port) *toppath*))) (if login-res (begin (print "LOGIN_OK") (exit 0)) (begin (print "LOGIN_FAILED") (exit 1)))))) (define (rpc-transport:self-test run-id host port) (if (not host) (abort "host not set.")) (if (not port) (abort "port not set.")) (tcp-buffer-size 0) ;; gotta do this because http-transport undoes it. (let* ((testing-res ((rpc:procedure 'testing host port))) (login-res ((rpc:procedure 'server:login host port) *toppath*)) (res (and login-res (equal? testing-res "Just testing")))) (if login-res (begin ;;(BB> "Self test PASS. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) #t) (begin (BB> "Self test fail. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) #f)) res)) (define (rpc-transport:client-setup run-id server-dat #!key (remaining-tries 10)) ;;(BB> "entered rpc-transport:client-setup with run-id="run-id" and server-dat="server-dat" and retries="remaining-tries) (tcp-buffer-size 0) (debug:print-info 0 *default-log-port* "rpc-transport:client-setup run-id="run-id" server-dat=" server-dat ", remaining-tries=" remaining-tries) (let* ((iface (tasks:hostinfo-get-interface server-dat)) (hostname (tasks:hostinfo-get-hostname server-dat)) (port (tasks:hostinfo-get-port server-dat)) (runremote-server-dat (vector iface port #f #f #f (current-seconds) 'rpc)) ;; http version := (vector iface port api-uri api-url api-req (current-seconds) 'http ) (ping-res (retry-thunk (lambda () ;; make 3 attempts to ping. 
((rpc:procedure 'server:login iface port) *toppath*)) chatty: #f retries: 3))) ;; we got here from rmt:get-connection-info on the condition that *runremote* has no entry for run-id... (if ping-res (begin (debug:print-info 0 *default-log-port* "rpc-transport:client-setup CONNECTION ESTABLISHED run-id="run-id" server-dat=" server-dat) runremote-server-dat) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 *default-log-port* "rpc-transport:client-setup UNABLE TO CONNECT run-id="run-id" server-dat=" server-dat) (tasks:kill-server-run-id run-id) (tasks:server-force-clean-run-record (db:delay-if-busy (tasks:open-db)) run-id iface port " rpc-transport:client-setup (server-dat = #t)") (if (> remaining-tries 2) (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (sub1 remaining-tries)))))) |
Modified server.scm from [c0f30a061c] to [9cba9551df].
︙ | ︙ | |||
44 45 46 47 48 49 50 | ;; Call this to start the actual server ;; ;; all routes though here end in exit ... ;; ;; start_server ;; | | > > > > > | > | | | | | < < | | | 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | ;; Call this to start the actual server ;; ;; all routes though here end in exit ... ;; ;; start_server ;; (define (server:launch run-id transport-type-raw) (let ((transport-type (cond ((string? transport-type-raw) (string->symbol transport-type-raw)) (else transport-type-raw)))) ;;(BB> "server:launch fired for run-id="run-id" transport-type="transport-type) (case transport-type ((http)(http-transport:launch run-id)) ;;((nmsg)(nmsg-transport:launch run-id)) ((rpc) (rpc-transport:launch run-id)) (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== ;; Get the transport (define (server:get-transport) (if *transport-type* *transport-type* (let ((ttype (string->symbol (or (args:get-arg "-transport") (configf:lookup *configdat* "server" "transport") *DEFAULT-TRANSPORT*)))) (set! *transport-type* ttype) ttype))) ;; Generate a unique signature for this server (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string |
︙ | ︙ | |||
110 111 112 113 114 115 116 | (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) (logfile (conc *toppath* "/logs/server.log")) (cmdln (conc (common:get-megatest-exe) | | > | | > | 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) (logfile (conc *toppath* "/logs/server.log")) (cmdln (conc (common:get-megatest-exe) " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") (conc " -daemonize -log " logfile) "") " -transport " (server:get-transport) " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there (push-directory *toppath*) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) |
︙ | ︙ | |||
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 | (file-read-access? dotfile)) (with-input-from-file dotfile (lambda () (read-line))) #f)))) ;; write a .server file in *toppath* with hostport ;; return #t on success, #f otherwise ;; (define (server:write-dotserver areapath hostport) (let ((lock-file (conc areapath "/.server.lock")) (server-file (conc areapath "/.server"))) (if (common:simple-file-lock lock-file) (let ((res (handle-exceptions exn #f ;; failed for some reason, for the moment simply return #f (with-output-to-file server-file (lambda () (print hostport))) #t))) (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " created") (common:simple-file-release-lock lock-file) res) #f))) | > > > > > > > > > > > > | | | | 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 | (file-read-access? 
dotfile)) (with-input-from-file dotfile (lambda () (read-line))) #f)))) (define (server:dotserver-starting) (with-output-to-file (conc *toppath* "/.starting-server") (lambda () (print (current-process-id) " on " (get-host-name))))) (define (server:dotserver-starting-remove) (delete-file* (conc *toppath* "/.starting-server"))) ;; write a .server file in *toppath* with hostport ;; return #t on success, #f otherwise ;; (define (server:write-dotserver areapath hostport) (let ((lock-file (conc areapath "/.server.lock")) (server-file (conc areapath "/.server"))) (if (common:simple-file-lock lock-file) (let ((res (handle-exceptions exn #f ;; failed for some reason, for the moment simply return #f (with-output-to-file server-file (lambda () (print hostport))) #t))) (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " created") (common:simple-file-release-lock lock-file) res) #f))) (define (server:remove-dotserver-file areapath hostport #!key (force #f)) (let ((dotserver (server:read-dotserver areapath)) (server-file (conc areapath "/.server")) (lock-file (conc areapath "/.server.lock"))) (if (or force (and dotserver (string-match (conc ".*:" hostport "$") dotserver))) ;; port matches, good enough info to decide to remove the file (if (common:simple-file-lock lock-file) (begin (handle-exceptions exn #f (delete-file* server-file)) (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " removed") (common:simple-file-release-lock lock-file)))))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) (let* ((dotserver (server:read-dotserver areapath))) ;; tdbdat (tasks:open-db))) (if dotserver (let* ((res (case *transport-type* ((http rpc)(server:ping-server dotserver)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res dotserver #f)) #f))) |
︙ | ︙ | |||
263 264 265 266 267 268 269 | (begin (if host-port-in (debug:print 0 *default-log-port* "ERROR: bad host:port")) (if do-exit (exit 1)) #f) (let* ((iface (car host-port)) (port (cadr host-port)) | > > | > > > > | 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | (begin (if host-port-in (debug:print 0 *default-log-port* "ERROR: bad host:port")) (if do-exit (exit 1)) #f) (let* ((iface (car host-port)) (port (cadr host-port)) (server-dat (case (remote-transport *runremote*) ((http) (http-transport:client-connect iface port)) ((rpc) (rpc-transport:client-connect iface port)) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (4)") (exit)))) (login-res (rmt:login-no-auto-client-setup server-dat))) (if (and (list? login-res) (car login-res)) (begin (print "LOGIN_OK") (if do-exit (exit 0))) (begin |
︙ | ︙ |
Modified tasks.scm from [b8a3c2af2e] to [5534894005].
︙ | ︙ | |||
168 169 170 171 172 173 174 | (define (tasks:hostinfo-get-interface vec) (vector-ref vec 1)) (define (tasks:hostinfo-get-port vec) (vector-ref vec 2)) (define (tasks:hostinfo-get-pubport vec) (vector-ref vec 3)) (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) | | | | | | 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | (define (tasks:hostinfo-get-interface vec) (vector-ref vec 1)) (define (tasks:hostinfo-get-port vec) (vector-ref vec 2)) (define (tasks:hostinfo-get-pubport vec) (vector-ref vec 3)) (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id transport-type) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id transport-type) (thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? 
mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id transport-type) (sqlite3:execute mdb "INSERT INTO servers (pid,hostname,port,pubport,start_time, priority,state,mt_version,heartbeat, interface,transport,run_id) VALUES(?, ?, ?, ?, strftime('%s','now'), ?, ?, ?,-1,?, ?, ?);" (current-process-id) ;; pid (get-host-name) ;; hostname -1 ;; port -1 ;; pubport (random 1000) ;; priority (used a tiebreaker on get-available) "available" ;; state (common:version-signature) ;; mt_version -1 ;; interface ;; (conc (server:get-transport)) ;; transport (symbol->string transport-type) ;; transport run-id )) (define (tasks:num-in-available-state mdb run-id) (let ((res 0)) (sqlite3:for-each-row (lambda (num-in-queue) |
︙ | ︙ | |||
321 322 323 324 325 326 327 | (let* ((header (list "id" "hostname" "pid" "interface" "port" "pubport" "state" "run_id" "priority" "start_time")) (selstr (string-intersperse header ",")) (res '())) (sqlite3:for-each-row (lambda (a . b) (set! res (cons (apply vector a b) res))) mdb | | < > | 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 | (let* ((header (list "id" "hostname" "pid" "interface" "port" "pubport" "state" "run_id" "priority" "start_time")) (selstr (string-intersperse header ",")) (res '())) (sqlite3:for-each-row (lambda (a . b) (set! res (cons (apply vector a b) res))) mdb (conc "SELECT " selstr " FROM servers WHERE state in ('available','running','dbprep') ORDER BY start_time DESC;") ) (vector header res))) (define (tasks:get-server mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn |
︙ | ︙ |
Modified tests/fullrun/megatest.config from [353b25ebc0] to [eec8c46fb2].
︙ | ︙ | |||
155 156 157 158 159 160 161 | [server] # force use of server always # required yes # Use http instead of direct filesystem access | > | | 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | [server] # force use of server always # required yes # Use http instead of direct filesystem access transport rpc # transport http # transport fs # transport nmsg synchronous 0 # If the server can't be started on this port it will try the next port until # it succeeds |
︙ | ︙ |
Modified tests/fullrun/tests/all_toplevel/testconfig from [3fb72f4d55] to [5a83007156].
1 2 | [ezsteps] calcresults megatest -list-runs $MT_RUNNAME -target $MT_TARGET | | | 1 2 3 4 5 6 7 8 9 10 | [ezsteps] calcresults megatest -list-runs $MT_RUNNAME -target $MT_TARGET check_triggers cat $MT_RUN_AREA_HOME/triggers_$MT_RUNNAME.dat [logpro] check_triggers ;; (expect:error in "LogFileBody" = 0 "No errors" #/error/i) [requirements] waiton #{getenv ALL_TOPLEVEL_TESTS} |
︙ | ︙ |
Added thunk-utils.scm version [e6dc11200a].
(use srfi-18)

;; make-synchronized-proc
;; ======================
;; Wrap PROC with a mutex so that two threads may not be inside PROC simultaneously.
;;
;;   proc          - the procedure to guard; it is applied to whatever arguments the
;;                   returned wrapper receives.
;;   canned-mutex: - optional existing mutex to use as the guard. When #f (the default)
;;                   a fresh mutex is created for this wrapper.
;;
;; Returns a procedure with the same calling convention as PROC. The mutex is locked
;; before PROC is applied and unlocked afterwards. An exception thrown by PROC is
;; caught so the mutex can be unlocked, and is then re-raised to the caller.
;;
;; example 1: (define thread-safe-+ (make-synchronized-proc +))
;; example 2: (define thread-safe-plus
;;              (make-synchronized-proc
;;                (lambda (x y)
;;                  (+ x y))))
(define (make-synchronized-proc proc #!key (canned-mutex #f))
  (let* ((guard-mutex (if canned-mutex canned-mutex (make-mutex)))
         (guarded-proc
          ;; we are guarding the call against exceptions. We record whether the
          ;; result of evaluation is an exception or a regular result.
          (lambda args
            (mutex-lock! guard-mutex)
            (let* ((EXCEPTION (gensym)) ;; gensym avoids collision with a proc that returns a pair whose car looks like our flag
                   (res (condition-case
                         (apply proc args) ;; this is what we are guarding the execution of
                         [x () (cons EXCEPTION x)])))
              ;; unlock on both the normal and the exceptional path
              (mutex-unlock! guard-mutex)
              (cond
               ((and (pair? res) (eq? (car res) EXCEPTION))
                (raise (cdr res))) ;; propagate the original condition now that the lock is released
               (else
                res))))))
    guarded-proc))

;; retry an operation (depends on srfi-18)
;; ==================
;; idea here is to avoid spending time on coding retrying something. Trying to be generic here.
;;
;; THE-THUNK is called once; if its result is not accepted (or it throws) it is called
;; again, up to RETRIES more times, sleeping between attempts.
;;
;; Exception handling:
;; -------------------
;; if evaluating the thunk results in exception, it will be retried.
;; on last try, if final-failure-returns-actual is true, the exception will be re-thrown to caller.
;;
;; Delay between attempts:
;; -----------------------
;; the first sleep is retry-delay plus a random portion of random-delay. After every
;; retry the base delay (not the jitter) is multiplied by back-off-factor, so the
;; default factor of 1 gives a constant base delay and e.g. 1.5 grows it by 50% per retry.
;;
;; Returns the first accepted result; otherwise failure-value, or (when
;; final-failure-returns-actual is true) the last actual result / re-thrown exception.
;;
;; NOTE(review): keyword names must match exactly (e.g. random-delay:, not random-wait:);
;; a misspelled keyword is presumably ignored silently by #!key -- verify callers.
;;
(define (retry-thunk
         the-thunk
         #!key ;;;; options below
         (accept-result? (lambda (x) x)) ;; retry if predicate applied to thunk's result is false
         (retries 4)                     ;; how many retries after the initial try
         (failure-value #f)              ;; return this on final failure, unless following option is enabled:
         (final-failure-returns-actual #f) ;; on failure, on the last try, just return the result (or re-throw), not failure-value
         (retry-delay 0.1)               ;; base delay (seconds) between tries
         (back-off-factor 1)             ;; multiply the base delay by this factor on each retry
         (random-delay 0.1)              ;; add a random portion of this value to each wait
         (chatty #f)                     ;; print status as we go, for debugging.
         )

  (when chatty
    (print)
    (print "Entered retry-thunk")
    (print "-=-=-=-=-=-"))

  (let* ((guarded-thunk
          ;; we are guarding the thunk against exceptions. The result is tagged as
          ;; ('exception . condition) or ('regular-result . value).
          (lambda ()
            (let* ((EXCEPTION (gensym)) ;; using gensym to avoid potential collision
                   (res (condition-case
                         (the-thunk) ;; this is what we are guarding the execution of
                         [x () (cons EXCEPTION x)])))
              (cond
               ((and (pair? res) (eq? (car res) EXCEPTION))
                (if chatty (print " - the-thunk threw exception >"(cdr res)"<"))
                (cons 'exception (cdr res)))
               (else
                (if chatty (print " - the-thunk returned result >"res"<"))
                (cons 'regular-result res)))))))

    (let loop ((guarded-res (guarded-thunk))
               (retries-left retries)
               (fail-wait retry-delay)) ;; base delay for the next sleep, without jitter
      (if chatty (print " =========="))
      (let* ((wait-time (+ fail-wait
                           (* random-delay (/ (random 1024) 1024)))) ;; base + fresh jitter
             (res-type (car guarded-res))
             (res-value (cdr guarded-res)))
        (cond
         ((and (eq? res-type 'regular-result) (accept-result? res-value))
          (if chatty (print " + return result that satisfied accept-result? >"res-value"<"))
          res-value)

         ((> retries-left 0)
          (if chatty (print " - sleep "wait-time))
          (thread-sleep! wait-time)
          (if chatty (print " + retry ["retries-left" tries left]"))
          ;; only the base delay is scaled; jitter is recomputed each round so it
          ;; does not compound into the back-off.
          (loop (guarded-thunk)
                (sub1 retries-left)
                (* fail-wait back-off-factor)))

         ((eq? res-type 'regular-result)
          ;; no retries left; thunk returned normally but the result was not accepted
          (if final-failure-returns-actual
              (begin
                (if chatty (print " + last try failed- return the result >"res-value"<"))
                res-value)
              (begin
                (if chatty (print " + last try failed- return canned failure value >"failure-value"<"))
                failure-value)))

         (else ;; no retries left; result was not accepted and res-type can only be 'exception
          (if final-failure-returns-actual
              (begin
                (if chatty (print " + last try failed with exception- re-throw it >"res-value"<"))
                (abort res-value)) ;; re-raise the exception. TODO: find a way for call-history to show as though from entry to this function
              (begin
                (if chatty (print " + last try failed with exception- return canned failure value >"failure-value"<"))
                failure-value))))))))
Modified utils/installall.sh from [89ae2af8a7] to [1afa4bd805].
︙ | ︙ | |||
8 9 10 11 12 13 14 | # greater. See the accompanying file COPYING for details. # # This program is distributed WITHOUT ANY WARRANTY; without even the # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR # PURPOSE. echo You may need to do the following first: | | < | > > > > > > > > > > > > > > > > | 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | # greater. See the accompanying file COPYING for details. # # This program is distributed WITHOUT ANY WARRANTY; without even the # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR # PURPOSE. echo You may need to do the following first: echo sudo apt-get install libreadline-dev libsqlite3-dev libwebkitgtk-dev echo sudo apt-get install libpangox-1.0-0 zlib1g-dev libfreetype6-dev cmake echo sudo apt-get install libssl-dev uuid-dev libglu1-mesa-dev echo sudo apt-get install libmotif3 -OR- set KTYPE=26g4 echo echo Set OPTION to std, currently OPTION=$OPTION echo echo Additionally, if you want mysql-client, you will need to make sure echo mysql_config is in your path echo echo You are using PREFIX=$PREFIX echo You are using proxy="$proxy" echo echo "Set additional_libpath to help find gtk or other libraries, don't forget a leading :" if [[ "$OPTION"x == "x" ]];then OPTION=std fi SYSTEM_TYPE=$(lsb_release -irs |tr ' ' '_' |tr '\n' '-')$(uname -i)-$OPTION # default chicken version variables. Override in case statement as appropriate CHICKEN_VERSION=4.10.0 CHICKEN_BASEVER=4.10.0 # Set up variables # case $SYSTEM_TYPE in Ubuntu-16.04-x86_64-std) KTYPE=32 CDVER=5.10 IUPVER=3.17 IMVER=3.11 ;; Ubuntu-16.04-x86_64-new) KTYPE=32 CDVER=5.10 IUPVER=3.17 IMVER=3.11 CHICKEN_VERSION=4.10.0 CHICKEN_BASEVER=4.10.0 ;; Ubuntu-16.04-i686-std) KTYPE=32 CDVER=5.10 IUPVER=3.17 IMVER=3.11 ;; |
︙ | ︙ | |||
101 102 103 104 105 106 107 | echo Using KTYPE=$KTYPE fi # Put all the downloaded tar files in tgz mkdir -p tgz # http://code.call-cc.org/releases/4.8.0/chicken-4.8.0.5.tar.gz | < < | 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | echo Using KTYPE=$KTYPE fi # Put all the downloaded tar files in tgz mkdir -p tgz # http://code.call-cc.org/releases/4.8.0/chicken-4.8.0.5.tar.gz chicken_targz=chicken-${CHICKEN_VERSION}.tar.gz if ! [[ -e tgz/$chicken_targz ]]; then wget http://code.call-cc.org/releases/${CHICKEN_BASEVER}/${chicken_targz} mv $chicken_targz tgz fi BUILDHOME=$PWD |
︙ | ︙ | |||
175 176 177 178 179 180 181 | fi fi cd $BUILDHOME # Some eggs are quoted since they are reserved to Bash # for f in matchable readline apropos base64 regex-literals format "regex-case" "test" coops trace csv dot-locking posix-utils posix-extras directory-utils hostinfo tcp rpc csv-xml fmt json md5; do # $CHICKEN_INSTALL $PROX -keep-installed matchable readline apropos base64 regex-literals format "regex-case" "test" coops trace csv dot-locking posix-utils posix-extras directory-utils hostinfo tcp rpc csv-xml fmt json md5 awful http-client spiffy uri-common intarweb http-client spiffy-request-vars md5 message-digest http-client spiffy-directory-listing | | | 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | fi fi cd $BUILDHOME # Some eggs are quoted since they are reserved to Bash # for f in matchable readline apropos base64 regex-literals format "regex-case" "test" coops trace csv dot-locking posix-utils posix-extras directory-utils hostinfo tcp rpc csv-xml fmt json md5; do # $CHICKEN_INSTALL $PROX -keep-installed matchable readline apropos base64 regex-literals format "regex-case" "test" coops trace csv dot-locking posix-utils posix-extras directory-utils hostinfo tcp rpc csv-xml fmt json md5 awful http-client spiffy uri-common intarweb http-client spiffy-request-vars md5 message-digest http-client spiffy-directory-listing for egg in matchable readline apropos dbi base64 regex-literals format "regex-case" "test" \ coops trace csv dot-locking posix-utils posix-extras directory-utils hostinfo \ tcp rpc csv-xml fmt json md5 awful http-client spiffy uri-common intarweb http-client \ spiffy-request-vars s md5 message-digest spiffy-directory-listing ssax sxml-serializer \ sxml-modifications logpro z3 call-with-environment-variables \ pathname-expand typed-records simple-exceptions numbers crypt parley srfi-42 \ alist-lib ansi-escape-sequences args basic-sequences bindings chicken-doc chicken-doc-cmd \ cock condition-utils debug define-record-and-printer 
easyffi easyffi-base \ |
︙ | ︙ | |||
300 301 302 303 304 305 306 | cd opensrc #fossil open ../opensrc.fossil cd histstore $PREFIX/bin/csc histstore.scm -o hs cp -f hs $PREFIX/bin/hs cd ../mutils $PREFIX/bin/chicken-install | | | | 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 | cd opensrc #fossil open ../opensrc.fossil cd histstore $PREFIX/bin/csc histstore.scm -o hs cp -f hs $PREFIX/bin/hs cd ../mutils $PREFIX/bin/chicken-install # cd ../dbi # $PREFIX/bin/chicken-install cd ../margs $PREFIX/bin/chicken-install fi cd $BUILDHOME if ! [[ -e $PREFIX/bin/stmlrun ]] ; then #fossil clone http://www.kiatoa.com/fossils/stml stml.fossil |
︙ | ︙ |