Overview
Comment: | Most needed changes made for multi-server with log handshaking. However there is a segfault ... |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | server-log-handshaking |
Files: | files | file ages | folders |
SHA1: |
8a4205f90b698a30371e28480f9e0717 |
User & Date: | matt on 2017-01-27 22:47:55 |
Other Links: | branch diff | manifest | tags |
Context
2017-01-28
| ||
12:25 | Fixed couple missing changes needed for server log handshaking check-in: 5a64939577 user: matt tags: server-log-handshaking | |
2017-01-27
| ||
22:47 | Most needed changes made for multi-server with log handshaking. However there is a segfault ... check-in: 8a4205f90b user: matt tags: server-log-handshaking | |
2017-01-25
| ||
23:48 | Get list of servers now works check-in: f2556f210b user: matt tags: server-log-handshaking | |
Changes
Modified common.scm from [040477db58] to [9ceabb42c1].
︙ | ︙ | |||
139 140 141 142 143 144 145 | (define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) | | | 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | (define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds ;; launching and hosts (defstruct host |
︙ | ︙ |
Modified dashboard.scm from [29cdcf21f3] to [bb7acd661f].
︙ | ︙ | |||
97 98 99 100 101 102 103 104 105 106 107 108 109 110 | (exit))) (if (args:get-arg "-h") (begin (print help) (exit))) ;; TODO: Move this inside (main) ;; (if (not (launch:setup)) (begin (print "Failed to find megatest.config, exiting") (exit 1))) | > > > > | 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | (exit))) (if (args:get-arg "-h") (begin (print help) (exit))) (if (not (common:on-homehost?)) (begin (debug:print 0 "ERROR: Current policy requires running dashboard on homehost: " (common:get-homehost)))) ;; TODO: Move this inside (main) ;; (if (not (launch:setup)) (begin (print "Failed to find megatest.config, exiting") (exit 1))) |
︙ | ︙ |
Modified db.scm from [a6e4ada759] to [0ef09f1d37].
︙ | ︙ | |||
12 13 14 15 16 17 18 | ;;====================================================================== ;; Database access ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc (use (srfi 18) extras tcp stack) ;; RADT => use of require-extension? | | | 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | ;;====================================================================== ;; Database access ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc (use (srfi 18) extras tcp stack) ;; RADT => use of require-extension? (use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records matchable) (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) ;; RADT => prefix?? (declare (unit db)) (declare (uses common)) (declare (uses keys)) (declare (uses ods)) |
︙ | ︙ | |||
823 824 825 826 827 828 829 | (define (db:multi-db-sync dbstruct . options) (if (not (launch:setup)) (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) (tmpdb (db:get-db dbstruct)) (refndb (dbr:dbstruct-refndb dbstruct)) (allow-cleanup #t) ;; (if run-ids #f #t)) | | | | > | | 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 | (define (db:multi-db-sync dbstruct . options) (if (not (launch:setup)) (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) (tmpdb (db:get-db dbstruct)) (refndb (dbr:dbstruct-refndb dbstruct)) (allow-cleanup #t) ;; (if run-ids #f #t)) ;; (tdbdat (tasks:open-db)) (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) (data-synced 0)) ;; count of changed records (I hope) ;; kill servers (if (member 'killservers options) (for-each (lambda (server) (match-let (((mod-time host port start-time pid) server)) (if (and host pid) (tasks:kill-server host pid)))) servers)) ;; clear out junk records ;; (if (member 'dejunk options) (begin (db:delay-if-busy mtdb) ;; ok to delay on mtdb |
︙ | ︙ |
Modified megatest.scm from [d7a15310c1] to [9fc67a21aa].
︙ | ︙ | |||
350 351 352 353 354 355 356 | ;; The watchdog is to keep an eye on things like db sync etc. ;; (define *watchdog* (make-thread common:watchdog "Watchdog thread")) (if (not (args:get-arg "-server")) (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog | < > | > > > | > | | 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 | ;; The watchdog is to keep an eye on things like db sync etc. ;; (define *watchdog* (make-thread common:watchdog "Watchdog thread")) (if (not (args:get-arg "-server")) (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog (if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server (let* ((tl (or (args:get-arg "-log")(launch:setup))) ;; run launch:setup if -server (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log"))) (oup (open-output-file logf))) (if (not (args:get-arg "-log")) (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log (debug:print-info 0 *default-log-port* "Sending log output to " logf) (set! *default-log-port* oup))) (if (or (args:get-arg "-h") (args:get-arg "-help") (args:get-arg "--help")) (begin (print help) |
︙ | ︙ | |||
697 698 699 700 701 702 703 | (debug:print-error 0 *default-log-port* "Parameter to -envdelta should be new=star-end"))))) ;;====================================================================== ;; Start the server - can be done in conjunction with -runall or -runtests (one day...) ;; we start the server if not running else start the client thread ;;====================================================================== | < < | | > < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 | (debug:print-error 0 *default-log-port* "Parameter to -envdelta should be new=star-end"))))) ;;====================================================================== ;; Start the server - can be done in conjunction with -runall or -runtests (one day...) ;; we start the server if not running else start the client thread ;;====================================================================== ;; Server? Start up here. ;; (if (args:get-arg "-server") (let ((tl (launch:setup)) (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) (server:launch 0 transport-type) (set! *didsomething* #t))) (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) (if tl (let* ((tdbdat (tasks:open-db)) |
︙ | ︙ |
Modified rmt.scm from [e2d5384f31] to [0b9221a215].
︙ | ︙ | |||
90 91 92 93 94 95 96 | (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*) ;; have a server | | < < < < < < < < < | < | | | | < < < < < < < < < < < < < < < < < < < < < < | 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*) ;; have a server (not (server:check-if-running *toppath*))) ;; server has died. (set! *runremote* #f) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a write, we already have a server ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost, no server contact made and this is a write, passively start a server ((and (cdr (remote-hh-dat *runremote*)) ; new (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url (remote-server-url-set! *runremote* server-url) ;; the string can be consumed by the client setup if needed (server:kind-run *toppath*))) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") (rmt:open-qry-close-locally cmd 0 params)) ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) (tasks:start-and-wait-for-server (tasks:open-db) 0 15) (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http |
︙ | ︙ |
Modified server.scm from [e489317c03] to [81e0620ae8].
1 2 3 4 5 6 7 8 9 10 11 12 | ;; Copyright 2006-2012, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. (require-extension (srfi 18) extras tcp s11n) | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | ;; Copyright 2006-2012, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. (require-extension (srfi 18) extras tcp s11n) (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras matchable) ;; (use zmq) (use spiffy uri-common intarweb http-client spiffy-request-vars) (declare (unit server)) (declare (uses common)) |
︙ | ︙ | |||
107 108 109 110 111 112 113 | ;; (attempt-in-progress (server:start-attempted? areapath)) ;; (dot-server-url (server:check-if-running areapath)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) | | | | 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | ;; (attempt-in-progress (server:start-attempted? areapath)) ;; (dot-server-url (server:check-if-running areapath)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (cmdln (conc (common:get-megatest-exe) " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") " -daemonize " "") ;; " -log " logfile " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) |
︙ | ︙ | |||
163 164 165 166 167 168 169 | (string->number (caddr dat)))))) (list #f #f #f))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time ) ;; (define (server:get-list areapath) | > | | | | | | | | | | > > | | | | | | | > > > > > > | 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | (string->number (caddr dat)))))) (list #f #f #f))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time ) ;; (define (server:get-list areapath) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))) (if (directory-exists? areapath) (let ((server-logs (glob (conc areapath "/logs/server-*.log")))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) (let* ((mod-time (file-modification-time hed)) (serv-dat (server:logf-get-start-info hed)) (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) (new-res (cons (append serv-rec (list pid)) res))) (if (null? tal) new-res (loop (car tal)(cdr tal) new-res))))))))) ;; given a list of servers get a list of valid servers, i.e. at least ;; 10 seconds old, has started and is less than 1 hour old and is ;; active (i.e. mod-time < 10 seconds ;; ;; mod-time host port start-time pid ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 1) ;; been running at least 1 seconds (< (- now mod-time) 10) ;; still alive - file touched in last 10 seconds (< (- now start-time) 3600) ;; under one hour running time ))) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) (conc host ":" port) #f))) (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) |
︙ | ︙ | |||
358 359 360 361 362 363 364 | ;; (common:simple-file-release-lock lock-file)) ;; (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - could not get lock.")) ;; (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - dotserver-url("dotserver-url") did not match hostport pattern ("hostport")")))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) | > > > > | | > > > > > | | 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 | ;; (common:simple-file-release-lock lock-file)) ;; (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - could not get lock.")) ;; (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - dotserver-url("dotserver-url") did not match hostport pattern ("hostport")")))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) (let* ((servers (server:get-best (server:get-list areapath))) (best-server (if (null? servers) #f (car servers))) (dotserver-url (if best-server (server:record->url best-server) #f))) ;; (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) (if dotserver-url (let* ((res (case *transport-type* ((http)(server:ping-server dotserver-url)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res dotserver-url (begin (server:kill best-server) #f))) #f))) (define (server:kill servr) (match-let (((mod-time hostname port start-time pid) servr)) (tasks:kill-server hostname pid))) ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find (server:check-if-running *toppath*) (if (number? host-port-in) ;; we were handed a server-id (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; (print "srec: " srec " host-port-in: " host-port-in) (if srec (conc (vector-ref srec 3) ":" (vector-ref srec 4)) (conc "no such server-id " host-port-in))) host-port-in)))) |
︙ | ︙ |
Modified utils/mk_wrapper from [de6ec68c4a] to [9bb7f8caf7].
︙ | ︙ | |||
26 27 28 29 30 31 32 | # fi # echo "exec $prefix/bin/.\$(lsb_release -sr)/$cmd \"\$@\"" >> $target echo "#!/bin/bash" > $target if [[ $cmd =~ dboard ]]; then cat >> $target <<'EOF' | | | | | | | | | | | | | 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | # fi # echo "exec $prefix/bin/.\$(lsb_release -sr)/$cmd \"\$@\"" >> $target echo "#!/bin/bash" > $target if [[ $cmd =~ dboard ]]; then cat >> $target <<'EOF' # # disable if not running on homehost # if [[ -e .homehost ]]; then # homehostname=$( host `cat .homehost` | awk '{print $NF}' | sed 's/\.$//' ) # hostname=$( hostname -f ) # # if [[ ! ($homehostname == $hostname) ]]; then # echo "ERROR: this host ($hostname) is not the homehost ($homehostname) for this megatest run area. Cannot start dashboard." # echo " Please log into homehost before launching dashboard." # exit 1 # fi # fi # check that $DISPLAY is set if [[ -z $DISPLAY ]]; then echo 'ERROR: $DISPLAY environment variable is not set; megatest dashboard requires X display address to be set in $DISPLAY.' exit 1 fi |
︙ | ︙ |