Changes In Branch v1.63-stackdumpfix Through [064872f8bc] Excluding Merge-Ins
This is equivalent to a diff from 65358a4d53 to 064872f8bc
2017-01-10
| ||
00:45 | merged in solid fixes for server stability problems check-in: ebafbfa4a1 user: bjbarcla tags: v1.63 | |
2017-01-09
| ||
12:04 | updated to latest v1.63, WIP check-in: 8e63364af0 user: srehman tags: v1.63-configdbsync | |
2017-01-06
| ||
22:37 | Made mt:process-triggers run exclusively on the server side check-in: bcaec59285 user: matt tags: v1.63-stackdumpfix | |
21:54 | utils/plot-code.scm check-in: 064872f8bc user: matt tags: v1.63-stackdumpfix | |
20:28 | wip check-in: fc4e43d81e user: bjbarcla tags: v1.63-stackdumpfix | |
2017-01-05
| ||
15:43 | addressed stach dump in db:get-run-stats check-in: bd4a33953c user: bjbarcla tags: v1.63-stackdumpfix | |
11:03 | added .server verification against running server ; also touching .server periodically and checking its age to detect and recover from potential server crashes check-in: 65358a4d53 user: bjbarcla tags: v1.63 | |
09:51 | Bumped version to v1.6303 check-in: aa5fb6de80 user: mrwellan tags: v1.63, v1.6303 | |
Modified common.scm from [6953f07d9c] to [d11f96c7d3].
︙ | ︙ | |||
95 96 97 98 99 100 101 102 103 104 105 106 107 108 | (define *db-last-sync* 0) ;; last time the sync to megatest.db happened (define *db-sync-in-progress* #f) ;; if there is a sync in progress do not try to start another (define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* ;; task db (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) (define *db-cache-path* #f) ;; SERVER (define *my-client-signature* #f) (define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg (define *runremote* #f) ;; if set up for server communication this will hold <host port> (define *max-cache-size* 0) | > | 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | (define *db-last-sync* 0) ;; last time the sync to megatest.db happened (define *db-sync-in-progress* #f) ;; if there is a sync in progress do not try to start another (define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* ;; task db (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) (define *db-transaction-mutex* (make-mutex)) (define *db-cache-path* #f) ;; SERVER (define *my-client-signature* #f) (define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg (define *runremote* #f) ;; if set up for server communication this will hold <host port> (define *max-cache-size* 0) |
︙ | ︙ |
Modified db.scm from [31eac1d5ff] to [b31dceb2af].
︙ | ︙ | |||
1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 | "SELECT state,status,count(id) AS count FROM tests WHERE run_id=? AND NOT(uname='n/a' AND item_path='') GROUP BY state,status;;" run-id)))) ;; Update run_stats for given run_id ;; input data is a list (state status count) ;; (define (db:update-run-stats dbstruct run-id stats) (db:with-db dbstruct #f #f (lambda (db) ;; remove previous data (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;")) (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);")) (res (sqlite3:with-transaction db (lambda () (for-each (lambda (dat) (sqlite3:execute stmt1 run-id (car dat)(cadr dat)) (apply sqlite3:execute stmt2 run-id dat)) stats))))) (sqlite3:finalize! stmt1) (sqlite3:finalize! stmt2) res)))) (define (db:get-main-run-stats dbstruct run-id) (db:with-db dbstruct #f ;; this data comes from main #f | > > > > | 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 | "SELECT state,status,count(id) AS count FROM tests WHERE run_id=? AND NOT(uname='n/a' AND item_path='') GROUP BY state,status;;" run-id)))) ;; Update run_stats for given run_id ;; input data is a list (state status count) ;; (define (db:update-run-stats dbstruct run-id stats) (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f #f (lambda (db) ;; remove previous data (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;")) (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);")) (res (sqlite3:with-transaction db (lambda () (for-each (lambda (dat) (sqlite3:execute stmt1 run-id (car dat)(cadr dat)) (apply sqlite3:execute stmt2 run-id dat)) stats))))) (sqlite3:finalize! stmt1) (sqlite3:finalize! stmt2) (mutex-unlock! *db-transaction-mutex*) res)))) (define (db:get-main-run-stats dbstruct run-id) (db:with-db dbstruct #f ;; this data comes from main #f |
︙ | ︙ | |||
3169 3170 3171 3172 3173 3174 3175 | (test-id (db:test-get-id testdat)) (test-name (if (number? test-name) (db:test-get-testname testdat) test-name)) (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) (tl-test-id (db:test-get-id tl-testdat))) | > > | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | > > | 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 | (test-id (db:test-get-id testdat)) (test-name (if (number? test-name) (db:test-get-testname testdat) test-name)) (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) (tl-test-id (db:test-get-id tl-testdat))) (mutex-lock! *db-transaction-mutex*) (let ((tr-res (sqlite3:with-transaction db (lambda () (db:test-set-state-status-by-id dbstruct run-id test-id state status comment) (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item (let* ((state-status-counts (db:get-all-state-status-counts-for-test db run-id test-name item-path)) ;; item-path is used to exclude current state/status of THIS test (running (length (filter (lambda (x) (member (dbr:counts-state x) *common:running-states*)) state-status-counts))) (bad-not-started (length (filter (lambda (x) (and (equal? (dbr:counts-state x) "NOT_STARTED") (not (member (dbr:counts-status x) *common:not-started-ok-statuses*)))) state-status-counts))) (all-curr-states (common:special-sort ;; worst -> best (sort of) (delete-duplicates (cons state (map dbr:counts-state state-status-counts))) *common:std-states* >)) (all-curr-statuses (common:special-sort ;; worst -> best (delete-duplicates (cons status (map dbr:counts-status state-status-counts))) *common:std-statuses* >)) (newstate (if (> running 0) "RUNNING" (if (> bad-not-started 0) "COMPLETED" (car all-curr-states)))) (newstatus (if (> bad-not-started 0) "CHECK" (car all-curr-statuses)))) ;; (print "Setting toplevel to: " newstate "/" newstatus) (db:test-set-state-status-by-id dbstruct run-id tl-test-id newstate newstatus #f)))))) (mutex-unlock! *db-transaction-mutex*)) tr-res))) (define db:roll-up-pass-fail-counts db:set-state-status-and-roll-up-items) ;; call with state = #f to roll up with out accounting for state/status of this item ;; ;; (define (db:roll-up-pass-fail-counts dbstruct run-id test-name item-path state status) ;; (if (not (equal? item-path "")) ;; if asked to do this for a specific item then do an incremental update |
︙ | ︙ |
Modified launch.scm from [580823485a] to [5acd6fa77a].
︙ | ︙ | |||
1119 1120 1121 1122 1123 1124 1125 | (conc (if dir (conc dir "/") "") (case (string->symbol exe) ((dboard) "../megatest") ((mtest) "../megatest") ((dashboard) "megatest") (else exe))))) (launcher (common:get-launcher *configdat* test-name item-path)) ;; (config-lookup *configdat* "jobtools" "launcher")) | | | | 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 | (conc (if dir (conc dir "/") "") (case (string->symbol exe) ((dboard) "../megatest") ((mtest) "../megatest") ((dashboard) "megatest") (else exe))))) (launcher (common:get-launcher *configdat* test-name item-path)) ;; (config-lookup *configdat* "jobtools" "launcher")) (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path (work-area #f) (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) (testinfo (rmt:get-test-info-by-id run-id test-id)) (mt_target (string-intersperse (map cadr keyvals) "/")) |
︙ | ︙ |
Modified rmt.scm from [b725604f3b] to [ef5ad8a7b9].
︙ | ︙ | |||
73 74 75 76 77 78 79 | ;; ensure we have a record for our connection for given area ((not *runremote*) (set! *runremote* (make-remote)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record | | | 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | ;; ensure we have a record for our connection for given area ((not *runremote*) (set! *runremote* (make-remote)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record ((not (pair? (remote-hh-dat *runremote*))) ;; not on homehost (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little (remote-hh-dat-set! *runremote* (common:get-homehost)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a read ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost |
︙ | ︙ | |||
139 140 141 142 143 144 145 | ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15) ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;) ;) ;;;; ;; if not on homehost ensure we have a connection to a live server ;; NOTE: we *have* a homehost record by now | > | | | | | | | | > | 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15) ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;) ;) ;;;; ;; if not on homehost ensure we have a connection to a live server ;; NOTE: we *have* a homehost record by now ;; ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost ;; (not (remote-conndat *runremote*)) ;; and no connection ;; (server:read-dotserver *toppath*)) ;; .server file exists ;; ;; something caused the server entry in tdb to disappear, but the server is still running ;; (server:remove-dotserver-file *toppath* ".*") ;; (mutex-unlock! *rmt-mutex*) ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 20") ;; (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum))) ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) (tasks:start-and-wait-for-server (tasks:open-db) 0 15) (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as |
︙ | ︙ | |||
176 177 178 179 180 181 182 | ((exn)(vector #f "other fail" (print-call-chain))))) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time | | | 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 | ((exn)(vector #f "other fail" (print-call-chain))))) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " *runremote* = "*runremote*) (if success (case (remote-transport *runremote*) ((http) res) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " is unknown") (exit 1))) (begin |
︙ | ︙ |
Modified tasks.scm from [a0c6ff1ee2] to [0bc99f47ad].
︙ | ︙ | |||
446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 | FROM servers WHERE run_id=? AND state NOT LIKE 'defunct%' ORDER BY start_time DESC;" run-id) (reverse res))) ;; no elegance here ... ;; (define (tasks:kill-server hostname pid #!key (kill-switch "")) (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname) (setenv "TARGETHOST" hostname) (setenv "TARGETHOST_LOGF" "server-kills.log") (system (conc "nbfake kill "kill-switch" "pid)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) ;;====================================================================== | > > > | 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 | FROM servers WHERE run_id=? AND state NOT LIKE 'defunct%' ORDER BY start_time DESC;" run-id) (reverse res))) ;; no elegance here ... ;; (define (tasks:kill-server hostname pid #!key (kill-switch "")) (server:remove-dotserver-file *toppath* ".*") (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname) (setenv "TARGETHOST" hostname) (setenv "TARGETHOST_LOGF" "server-kills.log") (system (conc "nbfake kill "kill-switch" "pid)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (server:remove-dotserver-file *toppath* ".*") (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) ;;====================================================================== |
︙ | ︙ |
Modified utils/nbfake from [9de79bbac2] to [df0eb253b8].
︙ | ︙ | |||
68 69 70 71 72 73 74 | __EOF if [[ -z "$MY_NBFAKE_HOST" ]]; then # Run locally sh -c "cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* >> $MY_NBFAKE_LOG 2>&1 &" else # run remotely | | | 68 69 70 71 72 73 74 75 76 | __EOF if [[ -z "$MY_NBFAKE_HOST" ]]; then # Run locally sh -c "cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* >> $MY_NBFAKE_LOG 2>&1 &" else # run remotely ssh -X -n -f $MY_NBFAKE_HOST "sh -c \"cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* >> $MY_NBFAKE_LOG 2>&1 &\"" fi |
Modified utils/plot-code.scm from [cd37a2db38] to [903f29a75e].
1 2 3 4 5 6 7 8 9 10 11 12 | #!/mfs/pkgs/chicken/4.8.0.5/bin/csi -nbq ;; Coming soon (right?) Usage: plot-code file1.scm,file2.scm "fun1,fun2,x*" *.scm > plot.dot ;; Usage: plot-code file1.scm,file2.scm *.scm > plot.dot ;; dot -Tpdf plot.dot > plot.pdf ;; first param is comma separated list of files to include in the map, use - to do all ;; second param is list of regexs for functions to include in the map ;; third param is list of files to scan (use regex srfi-69 srfi-13) (define targs #f) | | > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | #!/mfs/pkgs/chicken/4.8.0.5/bin/csi -nbq ;; Coming soon (right?) Usage: plot-code file1.scm,file2.scm "fun1,fun2,x*" *.scm > plot.dot ;; Usage: plot-code file1.scm,file2.scm *.scm > plot.dot ;; dot -Tpdf plot.dot > plot.pdf ;; first param is comma separated list of files to include in the map, use - to do all ;; second param is list of regexs for functions to include in the map ;; third param is list of files to scan (use regex srfi-69 srfi-13) (define targs #f) (define files (cdr (cddddr (argv)))) (let ((targdat (cadddr (argv)))) (if (equal? targdat "-") (set! targs files) (set! targs (string-split targdat ",")))) (define function-patt (car (cdr (cdddr (argv))))) (define function-rx (regexp function-patt)) (define filedat-defns (make-hash-table)) (define filedat-usages (make-hash-table)) (define defn-rx (regexp "^\\s*\\(define\\s+\\(([^\\s\\)]+).*")) (define all-regexs (make-hash-table)) (define all-fns '()) |
︙ | ︙ | |||
44 45 46 47 48 49 50 | (lambda () (let loop ((inl (read-line))) (if (not (eof-object? inl)) (let ((match (string-match defn-rx inl))) (if match (let ((fnname (cadr match))) ;; (print " " fnname) | > | | 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | (lambda () (let loop ((inl (read-line))) (if (not (eof-object? inl)) (let ((match (string-match defn-rx inl))) (if match (let ((fnname (cadr match))) ;; (print " " fnname) (if (string-match function-rx fnname) (set! all-fns (cons fnname all-fns))) (hash-table-set! filedat-defns fname (cons fnname (hash-table-ref/default filedat-defns fname '()))) )) (loop (read-line)))))))) files) |
︙ | ︙ |