Overview
Comment: | Automated merge of server-log-handshaking/a642f429b1/integ into integ-home |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | integ-home |
Files: | files | file ages | folders |
SHA1: |
ad7df3e1eb7f7ceae9259acd9c5b2292 |
User & Date: | matt on 2017-01-29 16:48:09 |
Other Links: | branch diff | manifest | tags |
Context
2017-02-01
| ||
11:07 | Automated merge of server-log-handshaking/a05b1e5025/integ into integ-home check-in: eb064be6a3 user: matt tags: integ-home | |
2017-01-29
| ||
16:48 | Automated merge of server-log-handshaking/a642f429b1/integ into integ-home check-in: ad7df3e1eb user: matt tags: integ-home | |
16:33 | Deprecate api parallel message check-in: a642f429b1 user: matt tags: server-log-handshaking | |
2017-01-28
| ||
22:48 | Automated merge of server-log-handshaking/2a497f95de/integ into integ-home check-in: ebc39ec805 user: matt tags: integ-home | |
Changes
Modified common.scm from [9ceabb42c1] to [3380145d50].
︙ | ︙ | |||
570 571 572 573 574 575 576 | (define *wdnum* 0) (define *wdnum*mutex (make-mutex)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:watchdog) | < | < | > > < > > > | | > | 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 | (define *wdnum* 0) (define *wdnum*mutex (make-mutex)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:watchdog) (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) (let* ((dbstruct (db:setup)) (mtdb (dbr:dbstruct-mtdb dbstruct)) (mtpath (db:dbdat-get-path mtdb))) (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") (let loop () ;; sync for filesystem local db writes ;; (mutex-lock! *db-multi-sync-mutex*) (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) (should-sync (and (not *time-to-exit*) (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum (start-time (current-seconds)) (mt-mod-time (file-modification-time mtpath)) (recently-synced (> (- start-time mt-mod-time) 4)) (will-sync (and (or need-sync should-sync) (not sync-in-progress) (not recently-synced)))) ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) (mutex-unlock! *db-multi-sync-mutex*) (if will-sync (let ((res (common:sync-to-megatest.db dbstruct))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive (if (> res 0) ;; some records were transferred, keep the db alive (begin |
︙ | ︙ |
Modified db.scm from [0ef09f1d37] to [cf38571740].
︙ | ︙ | |||
11 12 13 14 15 16 17 | ;;====================================================================== ;; Database access ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc | | | | 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | ;;====================================================================== ;; Database access ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc (use (srfi 18) extras tcp stack) (use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records matchable) (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) (declare (unit db)) (declare (uses common)) (declare (uses keys)) (declare (uses ods)) (declare (uses client)) (declare (uses mt)) |
︙ | ︙ | |||
141 142 143 144 145 146 147 | (db:dbdat-get-db dbdat) dbstruct)) (use-mutex (> *api-process-request-count* 25))) (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access")) (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*)) | | | 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | (db:dbdat-get-db dbdat) dbstruct)) (use-mutex (> *api-process-request-count* 25))) (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access")) (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*)) (debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*)) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn)) ;; there is no recovering at this time. exit (exit 50)) |
︙ | ︙ | |||
2038 2039 2040 2041 2042 2043 2044 2045 2046 | (lambda (run-id) (set! run-ids (cons run-id run-ids))) db "SELECT id FROM runs WHERE state != 'deleted' ORDER BY event_time DESC;") (reverse run-ids))))) ;; get some basic run stats ;; ;; ( (runname (( state count ) ... )) | > > | > | 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 | (lambda (run-id) (set! run-ids (cons run-id run-ids))) db "SELECT id FROM runs WHERE state != 'deleted' ORDER BY event_time DESC;") (reverse run-ids))))) ;; get some basic run stats ;; ;; data structure: ;; ;; ( (runname (( state count ) ... )) ;; ( ... ;; (define (db:get-run-stats dbstruct) (let* ((totals (make-hash-table)) (curr (make-hash-table)) (res '()) (runs-info '())) ;; First get all the runname/run-ids (db:with-db |
︙ | ︙ |
Modified dcommon.scm from [8fb41f6f61] to [4355903cc1].
︙ | ︙ | |||
618 619 620 621 622 623 624 | #:numcol 7 #:numcol-visible 7 #:numlin-visible 5 )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () (if (dashboard:monitor-changed? commondat tabdat) | | | | 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 | #:numcol 7 #:numcol-visible 7 #:numlin-visible 5 )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () (if (dashboard:monitor-changed? commondat tabdat) (let ((servers (server:get-list *toppath* limit: 10))) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))) (iup:attribute-set! servers-matrix "NUMLIN" (length servers)) ;; (set! colnum 0) ;; (for-each (lambda (colname) ;; ;; (print "colnum: " colnum " colname: " colname) ;; (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) ;; (set! colnum (+ 1 colnum))) ;; colnames) (set! rownum 1) (for-each (lambda (server) (set! colnum 0) (match-let (((mod-time host port start-time pid) server)) (let* ((uptime (- (current-seconds) mod-time)) (runtime (if start-time (- mod-time start-time) 0)) (vals (list "-" ;; (vector-ref server 0) ;; Id "-" ;; (vector-ref server 9) ;; MT-Ver pid ;; (vector-ref server 1) ;; Pid host ;; (vector-ref server 2) ;; Hostname (conc host ":" port) ;; (conc (vector-ref server 3) ":" (vector-ref server 4)) ;; IP:Port (seconds->hr-min-sec runtime) ;; (- (current-seconds) start-time)) ;; (vector-ref server 6))) |
︙ | ︙ |
Modified http-transport.scm from [05f38775ca] to [e6b844f91c].
︙ | ︙ | |||
377 378 379 380 381 382 383 | (start-time (current-milliseconds))) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-db* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) | | | 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 | (start-time (current-milliseconds))) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-db* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. (thread-start! *watchdog*))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) (if (and (<= rem-time 4) |
︙ | ︙ | |||
423 424 425 426 427 428 429 | (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* | | | > | | 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 | (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* (> (+ last-access server-timeout) (current-seconds)) (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (change-file-times server-log-file curr-time curr-time))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) (define (http-transport:server-shutdown port) (let ((tdbdat (tasks:open-db))) ;;(BB> "http-transport:server-shutdown called") (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; |
︙ | ︙ |
Modified server.scm from [52a242fcdc] to [b68dac663e].
︙ | ︙ | |||
158 159 160 161 162 163 164 | (string->number (cadr dat)) ;; port (string->number (caddr dat)))))) (list #f #f #f))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; | | | > | > | > > > | > > > | | > > > | | | 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | (string->number (cadr dat)) ;; port (string->number (caddr dat)))))) (list #f #f #f))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) #t (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) #f)) (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) (let* ((mod-time (file-modification-time hed)) (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time day-seconds)) (server:logf-get-start-info hed) '())) ;; don't waste time processing server files not touched in the past day if there are more than ten servers to look at (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) (new-res (if (null? serv-dat) res (cons (append serv-rec (list pid)) res)))) (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) (loop (car tal)(cdr tal) new-res))))))))) ;; given a list of servers get a list of valid servers, i.e. at least ;; 10 seconds old, has started and is less than 1 hour old and is ;; active (i.e. mod-time < 10 seconds ;; ;; mod-time host port start-time pid ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 1) ;; been running at least 1 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds (< (- now start-time) 3600) ;; under one hour running time ))) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) |
︙ | ︙ |