Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -12,11 +12,11 @@ ;; C L I E N T S ;;====================================================================== (require-extension (srfi 18) extras tcp s11n) -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) +(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest matchable) ;; (use zmq) (use (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars uri-common intarweb directory-utils) @@ -48,102 +48,16 @@ ((rpc) (rpc:client-connect iface port)) ((http) (http:client-connect iface port)) ((zmq) (zmq:client-connect iface port)) (else (rpc:client-connect iface port)))) -(define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) +(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) (case (server:get-transport) ((rpc) (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id)) - ((http)(client:setup-http run-id remaining-tries: remaining-tries failed-connects: failed-connects)) + ((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) (else (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id)))) -;; (define (client:login-no-auto-setup server-info run-id) -;; (case (server:get-transport) -;; ((rpc) (rpc:login-no-auto-client-setup server-info run-id)) -;; ((http) (rmt:login-no-auto-client-setup server-info run-id)) -;; (else (rpc:login-no-auto-client-setup server-info run-id)))) -;; -;; (define (client:setup-rpc run-id) -;; (debug:print 0 *default-log-port* "INFO: client:setup remaining-tries=" remaining-tries) -;; (if (<= remaining-tries 0) -;; (begin -;; (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) -;; (exit 1)) -;; (let ((host-info (hash-table-ref/default *runremote* run-id #f))) -;; (debug:print-info 0 *default-log-port* "client:setup host-info=" host-info ", remaining-tries=" remaining-tries) -;; (if host-info -;; (let* ((iface (car host-info)) -;; (port (cadr host-info)) -;; (start-res (client:connect iface port)) -;; ;; (ping-res (server:ping-server run-id iface port)) -;; (ping-res (client:login-no-auto-setup start-res run-id))) -;; (if ping-res ;; sucessful login? -;; (begin -;; (hash-table-set! *runremote* run-id start-res) -;; start-res) ;; return the server info -;; (if (member remaining-tries '(3 4 6)) -;; (begin ;; login failed -;; (debug:print 25 *default-log-port* "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" host-info) -;; (hash-table-delete! *runremote* run-id) -;; (open-run-close tasks:server-force-clean-run-record -;; tasks:open-db -;; run-id -;; (car host-info) -;; (cadr host-info) -;; " client:setup (host-info=#t)") -;; (thread-sleep! 5) -;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) -;; (begin -;; (debug:print 25 *default-log-port* "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) -;; (thread-sleep! 5) -;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) -;; ;; YUK: rename server-dat here -;; (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id))) -;; (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) -;; (if server-dat -;; (let* ((iface (tasks:hostinfo-get-interface server-dat)) -;; (port (tasks:hostinfo-get-port server-dat)) -;; (start-res (http-transport:client-connect iface port)) -;; ;; (ping-res (server:ping-server run-id iface port)) -;; (ping-res (rmt:login-no-auto-client-setup start-res run-id))) -;; (if start-res -;; (begin -;; (hash-table-set! *runremote* run-id start-res) -;; start-res) -;; (if (member remaining-tries '(2 5)) -;; (begin ;; login failed -;; (debug:print 25 *default-log-port* "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) -;; (hash-table-delete! *runremote* run-id) -;; (open-run-close tasks:server-force-clean-run-record -;; tasks:open-db -;; run-id -;; (tasks:hostinfo-get-interface server-dat) -;; (tasks:hostinfo-get-port server-dat) -;; " client:setup (server-dat = #t)") -;; (thread-sleep! 2) -;; (server:try-running run-id) -;; (thread-sleep! 10) ;; give server a little time to start up -;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) -;; (begin -;; (debug:print 25 *default-log-port* "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) -;; (thread-sleep! 5) -;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) -;; (begin ;; no server registered -;; (if (eq? remaining-tries 2) -;; (begin -;; ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") -;; (client:setup run-id remaining-tries: 10)) -;; (begin -;; (thread-sleep! 2) -;; (debug:print 25 *default-log-port* "INFO: client:setup start-res (not defined here), run-id=" run-id ", server-dat=" server-dat) -;; (if (< (open-run-close tasks:num-in-available-state tasks:open-db run-id) 3) -;; (begin -;; ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") -;; (server:try-running run-id))) -;; (thread-sleep! 10) ;; give server a little time to start up -;; (client:setup run-id remaining-tries: (- remaining-tries 1))))))))))) - ;; Do all the connection work, look up the transport type and set up the ;; connection if required. ;; ;; There are two scenarios. ;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline @@ -152,100 +66,50 @@ ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; -(define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0)) + +(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) - (let* ((tdbdat (tasks:open-db))) - (if (<= remaining-tries 0) - (begin - (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) - (exit 1)) - (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) - (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if server-dat - (let* ((iface (tasks:hostinfo-get-interface server-dat)) - (hostname (tasks:hostinfo-get-hostname server-dat)) - (port (tasks:hostinfo-get-port server-dat)) - (start-res (case *transport-type* - ((http)(http-transport:client-connect iface port)) - ;;((nmsg)(nmsg-transport:client-connect hostname port)) - )) - (ping-res (case *transport-type* - ((http)(rmt:login-no-auto-client-setup start-res)) - ;; ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id))) - ;; (if logininfo - ;; (car (vector-ref logininfo 1)) - ;; #f))) - - ))) - (if (and start-res - ping-res) - (begin - (remote-conndat-set! *runremote* start-res) ;; (hash-table-set! *runremote* run-id start-res) - (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) - (case *transport-type* - ((http)(http-transport:close-connections run-id))) - (remote-conndat-set! *runremote* #f) ;; (hash-table-delete! *runremote* run-id) - (tasks:kill-server-run-id run-id) - (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) - run-id - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat) - " client:setup (server-dat = #t)") - (if (> remaining-tries 8) - (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little - (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time - (server:try-running *toppath*) - (thread-sleep! 5) ;; give server a little time to start up - (client:setup run-id remaining-tries: (- remaining-tries 1)) - ))) - (begin ;; no server registered - (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) - (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) - (if (< num-available 2) - (server:try-running *toppath*)) - (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) - -;; keep this as a function to ease future -(define (client:start run-id server-info) - (http-transport:client-connect (tasks:hostinfo-get-interface server-info) - (tasks:hostinfo-get-port server-info))) - -;; ;; client:signal-handler -;; (define (client:signal-handler signum) -;; (signal-mask! signum) -;; (set! *time-to-exit* #t) -;; (handle-exceptions -;; exn -;; (debug:print 0 *default-log-port* " ... exiting ...") -;; (let ((th1 (make-thread (lambda () -;; "") ;; do nothing for now (was flush out last call if applicable) -;; "eat response")) -;; (th2 (make-thread (lambda () -;; (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.") -;; (thread-sleep! 1) ;; give the flush one second to do it's stuff -;; (debug:print 0 *default-log-port* " Done.") -;; (exit 4)) -;; "exit on ^C timer"))) -;; (thread-start! th2) -;; (thread-start! th1) -;; (thread-join! th2)))) -;; -;; ;; client:launch -;; ;; Need to set the signal handler somewhere other than here as this -;; ;; routine will go away. -;; ;; -;; (define (client:launch run-id) -;; (set-signal-handler! signal/int client:signal-handler) -;; (set-signal-handler! signal/term client:signal-handler) -;; (if (client:setup run-id) -;; (debug:print-info 2 *default-log-port* "connected as client") -;; (begin -;; (debug:print-error 0 *default-log-port* "Failed to connect as client") -;; (exit)))) -;; + (server:start-and-wait areapath) + (if (<= remaining-tries 0) + (begin + (debug:print-error 0 *default-log-port* "failed to start or connect to server") + (exit 1)) + ;; + ;; Alternatively here, we can get the list of candidate servers and work our way + ;; through them searching for a good one. + ;; + (let* ((server-dat (server:get-first-best areapath))) + (if (not server-dat) ;; no server found + (client:setup-http areapath remaining-tries: (- remaining-tries 1)) + (let ((host (cadr server-dat)) + (port (caddr server-dat))) + (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) + (if (not *runremote*)(set! *runremote* (make-remote))) + (if (and host port) + (let* ((start-res (case *transport-type* + ((http)(http-transport:client-connect host port)))) + (ping-res (case *transport-type* + ((http)(rmt:login-no-auto-client-setup start-res))))) + (if (and start-res + ping-res) + (begin + (remote-conndat-set! *runremote* start-res) ;; (hash-table-set! *runremote* run-id start-res) + (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) + start-res) + (begin ;; login failed but have a server record, clean out the record and try again + (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) + (case *transport-type* + ((http)(http-transport:close-connections run-id))) + (remote-conndat-set! *runremote* #f) ;; (hash-table-delete! *runremote* run-id) + (thread-sleep! 1) + (client:setup-http areapath remaining-tries: (- remaining-tries 1)) + ))) + (begin ;; no server registered + (server:kind-run areapath) + (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) + (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. + (server:start-and-wait areapath) + (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) + Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -141,11 +141,11 @@ (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds @@ -242,19 +242,32 @@ ;; (define (common:rotate-logs) (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) - (if (and (string-match "^.*.log" file) - (> (file-size (conc "logs/" file)) 200000)) - (let ((gzfile (conc "logs/" file ".gz"))) - (if (file-exists? gzfile) - (begin - (debug:print-info 0 *default-log-port* "removing " gzfile) - (delete-file gzfile))) - (debug:print-info 0 *default-log-port* "compressing " file) - (system (conc "gzip logs/" file))))) + (handle-exceptions + exn + (debug:print-info 0 *default-log-port* "failed to rotate log " file ", probably handled by another process.") + (let* ((fullname (conc "logs/" file)) + (file-age (- (current-seconds)(file-modification-time fullname)))) + (if (or (and (string-match "^.*.log" file) + (> (file-size fullname) 200000)) + (and (string-match "^server-.*.log" file) + (> (- (current-seconds) (file-modification-time fullname)) + (* 8 60 60)))) + (let ((gzfile (conc fullname ".gz"))) + (if (file-exists? gzfile) + (begin + (debug:print-info 0 *default-log-port* "removing " gzfile) + (delete-file gzfile))) + (debug:print-info 0 *default-log-port* "compressing " file) + (system (conc "gzip " fullname))) + (if (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600)) + (handle-exceptions + exn + #f + (delete-file fullname))))))) '() "logs")) ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified ;; @@ -572,33 +585,36 @@ (define *wdnum*mutex (make-mutex)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:watchdog) - (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) - (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) - ) + (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) - (let ((dbstruct (db:setup))) + (let* ((dbstruct (db:setup)) + (mtdb (dbr:dbstruct-mtdb dbstruct)) + (mtpath (db:dbdat-get-path mtdb))) (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") (let loop () - ;;(BB> "watchdog loop. pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) ;; sync for filesystem local db writes ;; (mutex-lock! *db-multi-sync-mutex*) (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) (should-sync (and (not *time-to-exit*) (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum + (start-time (current-seconds)) + (mt-mod-time (file-modification-time mtpath)) + (recently-synced (> (- start-time mt-mod-time) 4)) (will-sync (and (or need-sync should-sync) - (not sync-in-progress))) - (start-time (current-seconds))) + (not sync-in-progress) + (not recently-synced)))) + ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) (mutex-unlock! *db-multi-sync-mutex*) (if will-sync (let ((res (common:sync-to-megatest.db dbstruct))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -99,10 +99,14 @@ (if (args:get-arg "-h") (begin (print help) (exit))) +(if (not (common:on-homehost?)) + (begin + (debug:print 0 *default-log-port* "ERROR: Current policy requires running dashboard on homehost: " (common:get-homehost)))) + ;; TODO: Move this inside (main) ;; (if (not (launch:setup)) (begin (print "Failed to find megatest.config, exiting") Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -13,14 +13,14 @@ ;; Database access ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc -(use (srfi 18) extras tcp stack) ;; RADT => use of require-extension? -(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records) +(use (srfi 18) extras tcp stack) +(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records matchable) (import (prefix sqlite3 sqlite3:)) -(import (prefix base64 base64:)) ;; RADT => prefix?? +(import (prefix base64 base64:)) (declare (unit db)) (declare (uses common)) (declare (uses keys)) (declare (uses ods)) @@ -143,11 +143,11 @@ (use-mutex (> *api-process-request-count* 25))) (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access")) (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*)) - (debug:print-info 0 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*)) + (debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*)) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn)) @@ -825,20 +825,21 @@ (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) (tmpdb (db:get-db dbstruct)) (refndb (dbr:dbstruct-refndb dbstruct)) (allow-cleanup #t) ;; (if run-ids #f #t)) - (tdbdat (tasks:open-db)) - (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) + ;; (tdbdat (tasks:open-db)) + (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) (data-synced 0)) ;; count of changed records (I hope) ;; kill servers (if (member 'killservers options) (for-each (lambda (server) - (tasks:server-delete-record (db:delay-if-busy tdbdat) (vector-ref server 0) "dbmigration") - (tasks:kill-server (vector-ref server 2)(vector-ref server 1))) + (match-let (((mod-time host port start-time pid) server)) + (if (and host pid) + (tasks:kill-server host pid)))) servers)) ;; clear out junk records ;; (if (member 'dejunk options) @@ -2039,13 +2040,16 @@ db "SELECT id FROM runs WHERE state != 'deleted' ORDER BY event_time DESC;") (reverse run-ids))))) ;; get some basic run stats +;; +;; data structure: ;; ;; ( (runname (( state count ) ... )) -;; ( ... +;; ( ... +;; (define (db:get-run-stats dbstruct) (let* ((totals (make-hash-table)) (curr (make-hash-table)) (res '()) (runs-info '())) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -12,11 +12,11 @@ (use format) (require-library iup) (import (prefix iup iup:)) (use canvas-draw) (import canvas-draw-iup) -(use regex typed-records) +(use regex typed-records matchable) (declare (unit dcommon)) (declare (uses megatest-version)) (declare (uses gutils)) @@ -620,11 +620,12 @@ #:numlin-visible 5 )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () (if (dashboard:monitor-changed? commondat tabdat) - (let ((servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))) + (let ((servers (server:get-list *toppath* limit: 10))) + ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))) (iup:attribute-set! servers-matrix "NUMLIN" (length servers)) ;; (set! colnum 0) ;; (for-each (lambda (colname) ;; ;; (print "colnum: " colnum " colname: " colname) ;; (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) @@ -632,36 +633,40 @@ ;; colnames) (set! rownum 1) (for-each (lambda (server) (set! colnum 0) - (let* ((vals (list (vector-ref server 0) ;; Id - (vector-ref server 9) ;; MT-Ver - (vector-ref server 1) ;; Pid - (vector-ref server 2) ;; Hostname - (conc (vector-ref server 3) ":" (vector-ref server 4)) ;; IP:Port - (seconds->hr-min-sec (- (current-seconds)(vector-ref server 6))) - ;; (vector-ref server 5) ;; Pubport - ;; (vector-ref server 10) ;; Last beat - ;; (vector-ref server 6) ;; Start time - ;; (vector-ref server 7) ;; Priority - ;; (vector-ref server 8) ;; State - (vector-ref server 8) ;; State - (vector-ref server 12) ;; RunId - ))) - (for-each (lambda (val) - (let* ((row-col (conc rownum ":" colnum)) - (curr-val (iup:attribute servers-matrix row-col))) - (if (not (equal? (conc val) curr-val)) - (begin - (iup:attribute-set! servers-matrix row-col val) - (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)))) - (set! colnum (+ 1 colnum)))) - vals) - (set! rownum (+ rownum 1))) - (iup:attribute-set! servers-matrix "REDRAW" "ALL")) - servers)))))) + (match-let (((mod-time host port start-time pid) + server)) + (let* ((uptime (- (current-seconds) mod-time)) + (runtime (if start-time + (- mod-time start-time) + 0)) + (vals (list "-" ;; (vector-ref server 0) ;; Id + "-" ;; (vector-ref server 9) ;; MT-Ver + pid ;; (vector-ref server 1) ;; Pid + host ;; (vector-ref server 2) ;; Hostname + (conc host ":" port) ;; (conc (vector-ref server 3) ":" (vector-ref server 4)) ;; IP:Port + (seconds->hr-min-sec runtime) ;; (- (current-seconds) start-time)) ;; (vector-ref server 6))) + (cond + ((< uptime 5) "alive") + ((< uptime 16) "probably alive");; less than 15 seconds since mod, call it alive (vector-ref server 8) ;; State + (else "dead")) + "-" ;; (vector-ref server 12) ;; RunId + ))) + (for-each (lambda (val) + (let* ((row-col (conc rownum ":" colnum)) + (curr-val (iup:attribute servers-matrix row-col))) + (if (not (equal? (conc val) curr-val)) + (begin + (iup:attribute-set! servers-matrix row-col val) + (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)))) + (set! colnum (+ 1 colnum)))) + vals) + (set! rownum (+ rownum 1))) + (iup:attribute-set! servers-matrix "REDRAW" "ALL"))) + (sort servers (lambda (a b)(> (car a)(car b)))))))))) (set! colnum 0) (for-each (lambda (colname) (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)) (set! colnum (+ colnum 1))) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -8,11 +8,11 @@ ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. (require-extension (srfi 18) extras tcp s11n) -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) ;; sqlite3 +(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest posix-extras) ;; sqlite3 ;; (import (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing) ;; Configurations for server @@ -47,11 +47,11 @@ ;; Call this to start the actual server ;; (define *db:process-queue-mutex* (make-mutex)) -(define (http-transport:run hostn run-id server-id) +(define (http-transport:run hostn) (debug:print 2 *default-log-port* "Attempting to start the server ...") (let* ((db #f) ;; (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily (hostname (get-host-name)) (ipaddrstr (let ((ipstr (if (string=? "-" hostn) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") @@ -104,18 +104,17 @@ ((equal? (uri-path (request-uri (current-request))) '(/ "hey")) (send-response body: "hey there!\n" headers: '((content-type text/plain)))) (else (continue)))))))) - (http-transport:try-start-server run-id ipaddrstr start-port server-id))) + (http-transport:try-start-server ipaddrstr start-port))) ;; This is recursively run by http-transport:run until sucessful ;; -(define (http-transport:try-start-server run-id ipaddrstr portnum server-id) - (let ((config-hostname (configf:lookup *configdat* "server" "hostname")) - (tdbdat (tasks:open-db))) - (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname) +(define (http-transport:try-start-server ipaddrstr portnum) + (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))) + (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname) (handle-exceptions exn (begin (print-error-message exn) (if (< portnum 64000) @@ -126,34 +125,26 @@ (portlogger:open-run-close portlogger:set-failed portnum) (debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port") (thread-sleep! 0.1) ;; get_next_port goes here - (http-transport:try-start-server run-id - ipaddrstr - (portlogger:open-run-close portlogger:find-port) - server-id)) + (http-transport:try-start-server ipaddrstr + (portlogger:open-run-close portlogger:find-port))) (begin - (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server") (print "ERROR: Tried and tried but could not start the server")))) ;; any error in following steps will result in a retry (set! *server-info* (list ipaddrstr portnum)) - (tasks:server-set-interface-port - (db:delay-if-busy tdbdat) - server-id - ipaddrstr portnum) (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum) ;; This starts the spiffy server ;; NEED WAY TO SET IP TO #f TO BIND ALL ;; (start-server bind-address: ipaddrstr port: portnum) (if config-hostname ;; this is a hint to bind directly (start-server port: portnum bind-address: (if (equal? config-hostname "-") ipaddrstr config-hostname)) (start-server port: portnum)) - ;; (portlogger:open-run-close portlogger:set-port portnum "released") - (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server") + (portlogger:open-run-close portlogger:set-port portnum "released") (debug:print 1 *default-log-port* "INFO: server has been stopped")))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== @@ -341,17 +332,16 @@ server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; -(define (http-transport:keep-running server-id run-id) +(define (http-transport:keep-running) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive - (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server for run-id=" run-id) - (let* ((tdbdat (tasks:open-db)) - (server-start-time (current-seconds)) + (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") + (let* ((server-start-time (current-seconds)) (server-info (let loop ((start-time (current-seconds)) (changed #t) (last-sdat "not this")) (let ((sdat #f)) (thread-sleep! 0.01) @@ -368,48 +358,33 @@ (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes (begin - (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server " server-id " for run " run-id) - (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature") + (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) (server-timeout (server:get-timeout)) - (server-going #f)) + (server-going #f) + (server-log-file (args:get-arg "-log"))) ;; always set when we are a server (let loop ((count 0) (server-state 'available) (bad-sync-count 0) (start-time (current-milliseconds))) - ;;(BB> "http-transport: top of loop; count="count" server-state="server-state" bad-sync-count="bad-sync-count" server-going="server-going) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-db* - ;; Removed code is pasted below (keeping it around until we are clear it is not needed). - ;; no *dbstruct-db* yet, set running after our first pass through and start the db - (if (eq? server-state 'available) - (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers - (if (equal? new-server-id server-id) - (begin - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") - ;;(BB> "http-transport: ->dbprep") - (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access - (set! *dbstruct-db* (db:setup)) ;; run-id)) - (set! server-going #t) - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") - ;;(BB> "http-transport: ->running") - (server:write-dotserver *toppath* iface port (current-process-id) 'http) - (thread-start! *watchdog*) - (server:complete-attempt *toppath*)) - (begin ;; gotta exit nicely - ;;(BB> "http-transport: ->collision") - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") - (http-transport:server-shutdown server-id port)))))) + (begin + (debug:print 0 *default-log-port* "SERVER: dbprep") + (set! *dbstruct-db* (db:setup)) ;; run-id)) + (set! server-going #t) + (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. + (thread-start! *watchdog*))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) @@ -423,183 +398,111 @@ ;; Check that iface and port have not changed (can happen if server port collides) (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) - (if (or (not (equal? sdat (list iface port))) - (not server-id)) - (begin - (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info") - (set! iface (car sdat)) - (set! port (cadr sdat)) - (server:write-dotserver *toppath* iface port (current-process-id) 'http))) + (if (not (equal? sdat (list iface port))) + (let ((new-iface (car sdat)) + (new-port (cadr sdat))) + (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info") + (set! iface new-iface) + (set! port new-port) + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (flush-output *default-log-port*))) ;; Transfer *db-last-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) + + (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) + (begin + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (flush-output *default-log-port*))) - ;; (debug:print 11 *default-log-port* "last-access=" last-access ", server-timeout=" server-timeout) - ;; - ;; no_traffic, no running tests, if server 0, no running servers - ;; - ;; (let ((wait-on-running (configf:lookup *configdat* "server" b"wait-on-running"))) ;; wait on running tasks (if not true then exit on time out) - ;; (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond - ((not (server:confirm-dotserver *toppath* iface port (current-process-id) 'http)) - (debug:print-info 0 *default-log-port* "Server .server file does not exist or contents do not match. Initiate server shutdown.") - (http-transport:server-shutdown server-id port)) ((and *server-run* - (> (+ last-access server-timeout) - (current-seconds))) + (> (+ last-access server-timeout) + (current-seconds)) + (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. (if (common:low-noise-print 120 "server continuing") - (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) - ;; - ;; Consider implementing some smarts here to re-insert the record or kill self is - ;; the db indicates so - ;; - ;; (if (tasks:server-am-i-the-server? tdb run-id) - ;; (tasks:server-set-state! tdb server-id "running")) - ;; + (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) + (let ((curr-time (current-seconds))) + (change-file-times server-log-file curr-time curr-time))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else - (debug:print-info 0 *default-log-port* "Server timeed out. seconds since last db access: " (- (current-seconds) last-access)) - (http-transport:server-shutdown server-id port))))))) - -;; code cut out from above -;; -;; (condition-case -;; ;; (if (and (member (mutex-state *db-sync-mutex*) '(abandoned not-abandoned)) -;; ;; (> (- (current-seconds) *db-last-sync*) 5)) ;; if not currently being synced nor recently synced -;; (db:sync-touched *dbstruct-db* *run-id* force-sync: #t) ;; usually done in the watchdog, not here. -;; ((sync-failed)(cond -;; ((> bad-sync-count 10) ;; time to give up -;; (http-transport:server-shutdown server-id port)) -;; (else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop -;; (thread-sleep! 5) -;; (loop count server-state (+ bad-sync-count 1))))) -;; ((exn) -;; (debug:print-error 0 *default-log-port* "error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server") -;; (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed") -;; (exit))) -;; (set! sync-time (- (current-milliseconds) start-time)) -;; (set! rem-time (quotient (- 4000 sync-time) 1000)) -;; (debug:print 4 *default-log-port* "SYNC: time= " sync-time ", rem-time=" rem-time) -;; -;; (if (and (<= rem-time 4) -;; (> rem-time 0)) -;; (thread-sleep! rem-time) -;; (thread-sleep! 4))) ;; fallback for if the math is changed ... - -(define (http-transport:server-shutdown server-id port) + (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) + (http-transport:server-shutdown port))))))) + +(define (http-transport:server-shutdown port) (let ((tdbdat (tasks:open-db))) ;;(BB> "http-transport:server-shutdown called") (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; ;; start_shutdown ;; - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") + ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up (portlogger:open-run-close portlogger:set-port port "released") - (thread-sleep! 5) -;; (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) -;; (debug:print-info 0 *default-log-port* "Number of cached writes " *number-of-writes*) -;; (debug:print-info 0 *default-log-port* "Average cached write time " -;; (if (eq? *number-of-writes* 0) -;; "n/a (no writes)" -;; (/ *writes-total-delay* -;; *number-of-writes*)) -;; " ms") -;; (debug:print-info 0 *default-log-port* "Number non-cached queries " *number-non-write-queries*) -;; (debug:print-info 0 *default-log-port* "Average non-cached time " -;; (if (eq? *number-non-write-queries* 0) -;; "n/a (no queries)" -;; (/ *total-non-write-delay* -;; *number-non-write-queries*)) - ;; " ms") - + (thread-sleep! 1) + + ;; (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) + ;; (debug:print-info 0 *default-log-port* "Number of cached writes " *number-of-writes*) + ;; (debug:print-info 0 *default-log-port* "Average cached write time " + ;; (if (eq? *number-of-writes* 0) + ;; "n/a (no writes)" + ;; (/ *writes-total-delay* + ;; *number-of-writes*)) + ;; " ms") + ;; (debug:print-info 0 *default-log-port* "Number non-cached queries " *number-non-write-queries*) + ;; (debug:print-info 0 *default-log-port* "Average non-cached time " + ;; (if (eq? *number-non-write-queries* 0) + ;; "n/a (no queries)" + ;; (/ *total-non-write-delay* + ;; *number-non-write-queries*)) + ;; " ms") + (db:print-current-query-stats) (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") - (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete") - ;; if the .server file contained :myport then we can remove it - (server:remove-dotserver-file *toppath* port) - ;;(BB> "http-transport:server-shutdown -> exit") (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; -(define (http-transport:launch run-id) - (server:attempting-start *toppath*) - (let* ((tdbdat (tasks:open-db))) - (set! *run-id* run-id) - (if (args:get-arg "-daemonize") - (begin - (daemon:ize) - (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it - (begin - (current-error-port *alt-log-file*) - (current-output-port *alt-log-file*))))) - (if (and (server:read-dotserver *toppath*) - (server:check-if-running run-id)) - (begin - (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") - (exit 0)) - (begin ;; ok, no server detected, clean out any lingering records - (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) - (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) - (remtries 4)) - (if (not server-id) - (if (> remtries 0) - (begin - (thread-sleep! 2) - (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) - (- remtries 1))) - (begin - ;; since we didn't get the server lock we are going to clean up and bail out - (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") - (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") - (server:complete-attempt *toppath*) - )) - (let* ((th2 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server run thread started") - (http-transport:run - (if (args:get-arg "-server") - (args:get-arg "-server") - "-") - run-id - server-id)) "Server run")) - (th3 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server monitor thread started") - (http-transport:keep-running server-id run-id)) - "Keep running"))) - (thread-start! th2) - (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. - (thread-start! th3) - (set! *didsomething* #t) - (thread-join! th2) - (exit)))))) - -;; (define (http:ping run-id host-port) -;; (let* ((server-dat (http-transport:client-connect (car host-port)(cadr host-port))) -;; (login-res (rmt:login-no-auto-client-setup server-dat run-id))) -;; (if (and (list? login-res) -;; (car login-res)) -;; (begin -;; (print "LOGIN_OK") -;; (exit 0)) -;; (begin -;; (print "LOGIN_FAILED") -;; (exit 1))))) +(define (http-transport:launch) + (if (args:get-arg "-daemonize") + (begin + (daemon:ize) + (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it + (begin + (current-error-port *alt-log-file*) + (current-output-port *alt-log-file*))))) + (let* ((th2 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server run thread started") + (http-transport:run + (if (args:get-arg "-server") + (args:get-arg "-server") + "-") + )) "Server run")) + (th3 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server monitor thread started") + (http-transport:keep-running) + "Keep running")))) + (thread-start! th2) + (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. + (thread-start! th3) + (set! *didsomething* #t) + (thread-join! th2) + (exit))) (define (http-transport:server-signal-handler signum) (signal-mask! signum) (handle-exceptions exn Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -352,15 +352,19 @@ ;; (define *watchdog* (make-thread common:watchdog "Watchdog thread")) (if (not (args:get-arg "-server")) (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog -;;(BB> "thread-start! watchdog") -(if (args:get-arg "-log") - (let ((oup (open-output-file (args:get-arg "-log")))) - (debug:print-info 0 *default-log-port* "Sending log output to " (args:get-arg "-log")) +(if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server + (let* ((tl (or (args:get-arg "-log")(launch:setup))) ;; run launch:setup if -server + (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name + (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log"))) + (oup (open-output-file logf))) + (if (not (args:get-arg "-log")) + (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log + (debug:print-info 0 *default-log-port* "Sending log output to " logf) (set! *default-log-port* oup))) (if (or (args:get-arg "-h") (args:get-arg "-help") (args:get-arg "--help")) @@ -699,50 +703,17 @@ ;;====================================================================== ;; Start the server - can be done in conjunction with -runall or -runtests (one day...) ;; we start the server if not running else start the client thread ;;====================================================================== +;; Server? Start up here. +;; (if (args:get-arg "-server") - - ;; Server? Start up here. - ;; (let ((tl (launch:setup)) - ;; (run-id (and (args:get-arg "-run-id") - ;; (string->number (args:get-arg "-run-id")))) (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) - ;; (if run-id - ;; (begin (server:launch 0 transport-type) (set! *didsomething* #t))) -;; ;; (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) -;; -;; ;; Not a server? This section will decide how to communicate -;; ;; -;; ;; Setup client for all expect listed here -;; (if (null? (lset-intersection -;; equal? -;; (hash-table-keys args:arg-hash) -;; '("-list-servers" -;; "-stop-server" -;; "-kill-server" -;; "-show-cmdinfo" -;; "-list-runs" -;; "-ping"))) -;; (if (launch:setup) -;; (let ((run-id (and (args:get-arg "-run-id") -;; (string->number (args:get-arg "-run-id"))))) -;; ;; (set! *fdb* (filedb:open-db (conc *toppath* "/db/paths.db"))) -;; ;; if not list or kill then start a client (if appropriate) -;; (if (or (args-defined? "-h" "-version" "-create-megatest-area" "-create-test") -;; (eq? (length (hash-table-keys args:arg-hash)) 0)) -;; (debug:print-info 1 *default-log-port* "Server connection not needed") -;; (begin -;; ;; (if run-id -;; ;; (client:launch run-id) -;; ;; (client:launch 0) ;; without run-id we'll start a server for "0" -;; #t -;; )))))) (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -36,12 +36,12 @@ (define (rmt:get-connection-info areapath) ;; TODO: push areapath down. (let ((cinfo (remote-conndat *runremote*)) (run-id 0)) (if cinfo cinfo - (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) - (client:setup run-id) + (if (server:check-if-running areapath) + (client:setup areapath) #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) @@ -62,11 +62,11 @@ (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") (exit 1)) ;; reset the connection if it has been unused too long ((and *runremote* (remote-conndat *runremote*) - (let ((expire-time (- start-time (remote-server-timeout *runremote*)))) + (let ((expire-time (+ (- start-time (remote-server-timeout *runremote*))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts (< (http-transport:server-dat-get-last-access (remote-conndat *runremote*)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! *runremote* #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) @@ -92,12 +92,12 @@ ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*) ;; have a server - (not (server:read-dotserver *toppath*))) ;; server has died. - (set! *runremote* #f) + (not (server:check-if-running *toppath*))) ;; server has died. + (set! *runremote* (make-remote)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a write, we already have a server @@ -106,60 +106,28 @@ (remote-server-url *runremote*)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) - ;; commented by bb; this was blocking server passive start on write on homehost (case 5) - ;; ;; on homehost and this is a write, we have a server (we know because case 4 checked) - ;; ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost - ;; (not (member cmd api:read-only-queries))) - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") - ;; (rmt:open-qry-close-locally cmd 0 params)) - - ;; on homehost, no server contact made and this is a write, passively start a server ((and (cdr (remote-hh-dat *runremote*)) ; new (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (let ((server-url (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url (remote-server-url-set! *runremote* server-url) ;; the string can be consumed by the client setup if needed - (if (not (server:start-attempted? *toppath*)) - (server:kind-run *toppath*)))) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") - (rmt:open-qry-close-locally cmd 0 params)) - - - - ;;; - ;; (begin ;; not on homehost, start server and wait - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") - ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15) - ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;) ;) -;;;; - - ;; if not on homehost ensure we have a connection to a live server - ;; NOTE: we *have* a homehost record by now - - ;; ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost - ;; (not (remote-conndat *runremote*)) ;; and no connection - ;; (server:read-dotserver *toppath*)) ;; .server file exists - ;; ;; something caused the server entry in tdb to disappear, but the server is still running - ;; (server:remove-dotserver-file *toppath* ".*") - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 20") - ;; (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum))) + (server:kind-run *toppath*))) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") + (rmt:open-qry-close-locally cmd 0 params)) ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) - (tasks:start-and-wait-for-server (tasks:open-db) 0 15) + (server:start-and-wait *toppath*) (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost (mutex-unlock! *rmt-mutex*) @@ -198,11 +166,11 @@ (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! *runremote* #f) (remote-server-url-set! *runremote* #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") (mutex-unlock! *rmt-mutex*) - (tasks:start-and-wait-for-server (tasks:open-db) 0 15) + (server:start-and-wait *toppath*) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -8,11 +8,11 @@ ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. (require-extension (srfi 18) extras tcp s11n) -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras) +(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras matchable) ;; (use zmq) (use spiffy uri-common intarweb http-client spiffy-request-vars) (declare (unit server)) @@ -47,35 +47,15 @@ ;; all routes though here end in exit ... ;; ;; start_server ;; (define (server:launch run-id transport-type) - ;;(BB> "server:launch fired for run-id="run-id" transport-type="transport-type) - - (let ((attempt-in-progress (server:start-attempted? *toppath*))) ; check for .server-starting - (when attempt-in-progress - (debug:print-info 0 *default-log-port* "Server start attempt in progress in other process (=> "attempt-in-progress"<=). Aborting server launch attempt in this process ("(current-process-id)")") - (exit))) - - (let ((dotserver-url (server:check-if-running *toppath*))) ;; check for .server - (when dotserver-url - (debug:print-info 0 *default-log-port* "Server already running (=> "dotserver-url"<=). Aborting server launch attempt in this process ("(current-process-id)")") - (exit) - )) - (case transport-type - ((http)(http-transport:launch run-id)) + ((http)(http-transport:launch)) ;;((nmsg)(nmsg-transport:launch run-id)) ((rpc) (rpc-transport:launch run-id)) - (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))) - - ;; is this a good place to print server exit stats? - (debug:print 0 "SERVER: max parallel api requests: " *max-api-process-requests*) - - ) -;; (else (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc") -;; (rpc-transport:launch run-id))))) + (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== @@ -106,14 +86,10 @@ ;; (send-message pubsock target send-more: #t) ;; (send-message pubsock (case (server:get-transport) ((rpc) (db:obj->string (vector success/fail query-sig result))) ((http) (db:obj->string (vector success/fail query-sig result))) - ((zmq) - (let ((pub-socket (vector-ref *runremote* 1))) - (send-message pub-socket return-addr send-more: #t) - (send-message pub-socket (db:obj->string (vector success/fail query-sig result))))) ((fs) result) (else (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) result))) @@ -122,241 +98,256 @@ ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area (let* ((curr-host (get-host-name)) - (attempt-in-progress (server:start-attempted? areapath)) - (dot-server-url (server:check-if-running areapath)) + ;; (attempt-in-progress (server:start-attempted? areapath)) + ;; (dot-server-url (server:check-if-running areapath)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) - (logfile (conc areapath "/logs/server.log")) + (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - (conc " -daemonize -log " logfile) - "") + " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -daemonize " + "") + ;; " -log " logfile " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) - (cond - (attempt-in-progress - (debug:print 0 *default-log-port* "INFO: Not trying to start server because attempt is in progress: "attempt-in-progress)) - (dot-server-url - (debug:print 0 *default-log-port* "INFO: Not trying to start server because one is already running : "dot-server-url)) - (else - (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! log-rotate) - - ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - - (setenv "TARGETHOST_LOGF" logfile) - (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever - (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) - (pop-directory))))) - + (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") + (thread-start! log-rotate) + + ;; host.domain.tld match host? + (if (and target-host + ;; look at target host, is it host.domain.tld or ip address and does it + ;; match current ip or hostname + (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + (not (equal? curr-ip target-host))) + (begin + (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + (setenv "TARGETHOST" target-host))) + + (setenv "TARGETHOST_LOGF" logfile) + (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever + (system (conc "nbfake " cmdln)) + (unsetenv "TARGETHOST_LOGF") + (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + (thread-join! log-rotate) + (pop-directory))) + +;; given a path to a server log return: host port startseconds +;; +(define (server:logf-get-start-info logf) + (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs + (with-input-from-file + logf + (lambda () + (let loop ((inl (read-line)) + (lnum 0)) + (if (not (eof-object? inl)) + (let ((mlst (string-match rx inl))) + (if (not mlst) + (if (< lnum 500) ;; give up if more than 500 lines of server log read + (loop (read-line)(+ lnum 1)) + (list #f #f #f)) + (let ((dat (cdr mlst))) + (list (car dat) ;; host + (string->number (cadr dat)) ;; port + (string->number (caddr dat)))))) + (list #f #f #f))))))) + +;; get a list of servers with all relevant data +;; ( mod-time host port start-time pid ) +;; +(define (server:get-list areapath #!key (limit #f)) + (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) + (day-seconds (* 24 60 60))) + ;; if the directory exists continue to get the list + ;; otherwise attempt to create the logs dir and then + ;; continue + (if (if (directory-exists? (conc areapath "/logs")) + #t + (if (file-write-access? areapath) + (begin + (condition-case + (create-directory (conc areapath "/logs") #t) + (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) + (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) + (directory-exists? (conc areapath "/logs"))) + #f)) + (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) + (num-serv-logs (length server-logs))) + (if (null? server-logs) + '() + (let loop ((hed (car server-logs)) + (tal (cdr server-logs)) + (res '())) + (let* ((mod-time (file-modification-time hed)) + (down-time (- (current-seconds) mod-time)) + (serv-dat (if (or (< num-serv-logs 10) + (< down-time day-seconds)) + (server:logf-get-start-info hed) + '())) ;; don't waste time processing server files not touched in the past day if there are more than ten servers to look at + (serv-rec (cons mod-time serv-dat)) + (fmatch (string-match fname-rx hed)) + (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) + (new-res (if (null? serv-dat) + res + (cons (append serv-rec (list pid)) res)))) + (if (null? tal) + (if (and limit + (> (length new-res) limit)) + new-res ;; (take new-res limit) <= need intelligent sorting before this will work + new-res) + (loop (car tal)(cdr tal) new-res))))))))) + +;; given a list of servers get a list of valid servers, i.e. at least +;; 10 seconds old, has started and is less than 1 hour old and is +;; active (i.e. mod-time < 10 seconds +;; +;; mod-time host port start-time pid +;; +;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off +;; and servers should stick around for about two hours or so. +;; +(define (server:get-best srvlst) + (let ((now (current-seconds))) + (sort + (filter (lambda (rec) + (let ((start-time (list-ref rec 3)) + (mod-time (list-ref rec 0))) + ;; (print "start-time: " start-time " mod-time: " mod-time) + (and start-time mod-time + (> (- now start-time) 0) ;; been running at least 0 seconds + (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds + (< (- now start-time) + (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) + 180) + (random 360))) ;; under one hour running time +/- 180 + ))) + srvlst) + (lambda (a b) + (< (list-ref a 3) + (list-ref b 3)))))) + +(define (server:get-first-best areapath) + (let ((srvrs (server:get-best (server:get-list areapath)))) + (if (and srvrs + (not (null? srvrs))) + (car srvrs) + #f))) + +(define (server:record->url servr) + (match-let (((mod-time host port start-time pid) + servr)) + (if (and host port) + (conc host ":" port) + #f))) + (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) - (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f))) - (if (or (not last-run-time) - (> (- (current-seconds) last-run-time) 30)) - (begin - (server:run areapath) - (hash-table-set! *server-kind-run* areapath (current-seconds)))))) + (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun + (call-num (car last-run-dat)) + (when-run (cadr last-run-dat)) + (run-delay (+ (case call-num + ((0) 0) + ((1) 20) + ((2) 300) + (else 600)) + (random 5)))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously + (if (> (- (current-seconds) when-run) run-delay) + (server:run areapath)) + (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))) + +(define (server:start-and-wait areapath #!key (timeout 60)) + (let ((give-up-time (+ (current-seconds) timeout))) + (let loop ((server-url (server:check-if-running areapath))) + (if (or server-url + (> (current-seconds) give-up-time)) + server-url + (let ((num-ok (length (server:get-best (server:get-list areapath))))) + (if (< num-ok 2) ;; if there are no decent candidates for servers then try starting a new one + (server:kind-run areapath)) + (thread-sleep! 5) + (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. -(define (server:attempting-start areapath) - (with-output-to-file - (conc areapath "/.starting-server") - (lambda () - (print (current-process-id) " on " (get-host-name))))) - -(define (server:complete-attempt areapath) - (delete-file* (conc areapath "/.starting-server"))) - -(define (server:start-attempted? areapath) - (let ((flagfile (conc areapath "/.starting-server"))) - (handle-exceptions - exn - #f ;; if things go wrong pretend we can't see the file - (cond - ((and (file-exists? flagfile) - (< (- (current-seconds) - (file-modification-time flagfile)) - 15)) ;; exists and less than 15 seconds old - (with-input-from-file flagfile (lambda () (read-line)))) - ((file-exists? flagfile) ;; it is stale. - (server:complete-attempt areapath) - #f) - (else #f))))) - -(define (server:read-dotserver areapath) - (let ((dotfile (conc areapath "/.server"))) - (handle-exceptions - exn - #f ;; if things go wrong pretend we can't see the file - (cond - ((not (file-exists? dotfile)) - #f) - ((not (file-read-access? dotfile)) - #f) - ((> (server:dotserver-age-seconds areapath) (+ 5 (server:get-timeout))) - (server:remove-dotserver-file areapath ".*") - #f) - (else - (let* ((line - (with-input-from-file - dotfile - (lambda () - (read-line)))) - (tokens (if (string? line) (string-split line ":") #f))) - (cond - ((eq? 4 (length tokens)) - tokens) - (else #f)))))))) - -(define (server:read-dotserver->url areapath) - (let ((dotserver-tokens (server:read-dotserver areapath))) - (if dotserver-tokens - (conc (list-ref dotserver-tokens 0) ":" (list-ref dotserver-tokens 1)) - #f))) - -;; write a .server file in *toppath* with hostport -;; return #t on success, #f otherwise -;; -(define (server:write-dotserver areapath host port pid transport) - (let ((lock-file (conc areapath "/.server.lock")) - (server-file (conc areapath "/.server"))) - (if (common:simple-file-lock lock-file) - (let ((res (handle-exceptions - exn - #f ;; failed for some reason, for the moment simply return #f - (with-output-to-file server-file - (lambda () - (print (conc host ":" port ":" pid ":" transport)))) - #t))) - (debug:print-info 0 *default-log-port* "server file " server-file " for " host ":" port " created pid="pid) - (common:simple-file-release-lock lock-file) - res) - #f))) - - -;; this will check that the .server file present matches the server calling this procedure. -;; if parameters match (this-pid and transport) the file will be touched and #t returned -;; otherwise #f will be returned. -(define (server:confirm-dotserver areapath this-iface this-port this-pid this-transport) - (let* ((tokens (server:read-dotserver areapath))) - (cond - ((not tokens) - (debug:print-info 0 *default-log-port* "INFO: .server file does not exist.") - #f) - ((not (eq? 4 (length tokens))) - (debug:print-info 0 *default-log-port* "INFO: .server file is corrupt. There are not 4 tokens as expeted; there are "(length tokens)".") - #f) - ((not (equal? this-iface (list-ref tokens 0))) - (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for iface, server has value >"(list-ref tokens 0)"< but this server's value is >"this-iface"<") - #f) - ((not (equal? (->string this-port) (list-ref tokens 1))) - (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for port, .server has value >"(list-ref tokens 1)"< but this server's value is >"(->string this-port)"<") - #f) - ((not (equal? (->string this-pid) (list-ref tokens 2))) - (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for pid, .server has value >"(list-ref tokens 2)"< but this server's value is >"(->string this-pid)"<") - #f) - ((not (equal? (->string this-transport) (->string (list-ref tokens 3)))) - (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for transport, .server has value >"(list-ref tokens 3)"< but this server's value is >"this-transport"<") - #f) - (else (server:touch-dotserver areapath) - #t)))) - -(define (server:touch-dotserver areapath) - (let ((server-file (conc areapath "/.server"))) - (change-file-times server-file (current-seconds) (current-seconds)))) - (define (server:dotserver-age-seconds areapath) (let ((server-file (conc areapath "/.server"))) (begin (handle-exceptions exn #f (- (current-seconds) (file-modification-time server-file)))))) -(define (server:remove-dotserver-file areapath hostport) - (let ((dotserver-url (server:read-dotserver->url areapath)) - (server-file (conc areapath "/.server")) - (lock-file (conc areapath "/.server.lock"))) - (if (and dotserver-url (string-match (conc ".*:" hostport "$") dotserver-url)) ;; port matches, good enough info to decide to remove the file - (if (common:simple-file-lock lock-file) - (begin - (handle-exceptions - exn - #f - (delete-file* server-file)) - (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " removed") - (common:simple-file-release-lock lock-file)) - (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - could not get lock.")) - (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - dotserver-url("dotserver-url") did not match hostport pattern ("hostport")")))) - ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) - (let* ((dotserver-url (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) - (if dotserver-url - (let* ((res (case *transport-type* - ((http)(server:ping-server dotserver-url)) - ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) - ))) - (if res - dotserver-url - (begin - (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver - #f))) - #f))) + (let* ((servers (server:get-best (server:get-list areapath)))) + (if (null? servers) + #f + (let loop ((hed (car servers)) + (tal (cdr servers))) + (let ((res (server:check-server hed))) + (if res + res + (if (null? tal) + #f + (loop (car tal)(cdr tal))))))))) + +;; ping the given server +;; +(define (server:check-server server-record) + (let* ((server-url (server:record->url server-record)) + (res (case *transport-type* + ((http)(server:ping server-url)) + ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) + ))) + (if res + server-url + #f))) + +(define (server:kill servr) + (match-let (((mod-time hostname port start-time pid) + servr)) + (tasks:kill-server hostname pid))) ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - (server:read-dotserver->url *toppath*) - (if (number? host-port-in) ;; we were handed a server-id - (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) - ;; (print "srec: " srec " host-port-in: " host-port-in) - (if srec - (conc (vector-ref srec 3) ":" (vector-ref srec 4)) - (conc "no such server-id " host-port-in))) - host-port-in)))) + #f ;; (server:check-if-running *toppath*) + ;; (if (number? host-port-in) ;; we were handed a server-id + ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) + ;; ;; (print "srec: " srec " host-port-in: " host-port-in) + ;; (if srec + ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4)) + ;; (conc "no such server-id " host-port-in))) + host-port-in))) ;; ) (let* ((host-port (if host:port (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f)) - #f)) - (toppath (launch:setup))) + #f))) +;; (toppath (launch:setup))) ;; (print "host-port=" host-port) (if (not host-port) (begin (if host-port-in (debug:print 0 *default-log-port* "ERROR: bad host:port")) @@ -367,15 +358,17 @@ (server-dat (http-transport:client-connect iface port)) (login-res (rmt:login-no-auto-client-setup server-dat))) (if (and (list? login-res) (car login-res)) (begin - (print "LOGIN_OK") - (if do-exit (exit 0))) + ;; (print "LOGIN_OK") + (if do-exit (exit 0)) + #t) (begin - (print "LOGIN_FAILED") - (if do-exit (exit 1))))))))) + ;; (print "LOGIN_FAILED") + (if do-exit (exit 1)) + #f))))))) ;; run ping in separate process, safest way in some cases ;; (define (server:ping-server ifaceport) (with-input-from-pipe Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -170,315 +170,24 @@ (define (tasks:hostinfo-get-pubport vec) (vector-ref vec 3)) (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) -(define (tasks:server-lock-slot mdb run-id) - (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") - (if (< (tasks:num-in-available-state mdb run-id) 4) - (begin - (tasks:server-set-available mdb run-id) - (thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. - (tasks:server-am-i-the-server? mdb run-id)) - #f)) - -;; register that this server may come online (first to register goes though with the process) -(define (tasks:server-set-available mdb run-id) - (sqlite3:execute - mdb - "INSERT INTO servers (pid,hostname,port,pubport,start_time, priority,state,mt_version,heartbeat, interface,transport,run_id) - VALUES(?, ?, ?, ?, strftime('%s','now'), ?, ?, ?,-1,?, ?, ?);" - (current-process-id) ;; pid - (get-host-name) ;; hostname - -1 ;; port - -1 ;; pubport - (random 1000) ;; priority (used a tiebreaker on get-available) - "available" ;; state - (common:version-signature) ;; mt_version - -1 ;; interface - ;; (conc (server:get-transport)) ;; transport - (conc *transport-type*) ;; transport - run-id - )) - -(define (tasks:num-in-available-state mdb run-id) - (let ((res 0)) - (sqlite3:for-each-row - (lambda (num-in-queue) - (set! res num-in-queue)) - mdb - "SELECT count(id) FROM servers WHERE run_id=? AND state = 'available' AND (strftime('%s','now') - start_time) < 30 ;" - run-id) - res)) - -(define (tasks:num-servers-non-zero-running mdb) - (let ((res 0)) - (sqlite3:for-each-row - (lambda (num-running) - (set! res num-running)) - mdb - "SELECT count(id) FROM servers WHERE run_id != 0 AND state = 'running';") - res)) - -(define (tasks:server-clean-out-old-records-for-run-id mdb run-id tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state in ('available','dbprep','shutting-down') AND (strftime('%s','now') - start_time) > 50 AND run_id=?;" - (conc "defunct" tag) run-id)) - -(define (tasks:server-force-clean-running-records-for-run-id mdb run-id tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=?;" - (conc "defunct" tag) run-id)) - -(define (tasks:server-force-clean-run-record mdb run-id iface port tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=? AND interface=? AND port=?;" - (conc "defunct" tag) run-id iface port)) - - -;; BB> adding missing func for --list-servers -(define (tasks:server-deregister mdb hostname #!key (pullport #f) (pid #f) (action #f)) ;;pullport pid: pid action: 'delete)) - (if (eq? action 'delete) - (sqlite3:execute mdb "DELETE FROM servers WHERE pid=? AND port=? AND hostname=?;" pid pullport hostname) - (sqlite3:execute mdb "UPDATE servers SET state='defunct', heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;" - hostname pid))) - -(define (tasks:server-delete-records-for-this-pid mdb tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;" - (conc "defunct" tag) (get-host-name) (current-process-id))) - -(define (tasks:server-delete-record mdb server-id tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;" - (conc "defunct" tag) server-id) - ;; use this opportuntity to clean out records over one month old or over 10 minutes old with port = -1 (i.e. a never used placeholder) - (sqlite3:execute mdb "DELETE FROM servers WHERE state not in ('running','shutting-down','dbprep') AND (strftime('%s','now') - start_time) > 2628000;") - (sqlite3:execute mdb "DELETE FROM servers WHERE state like 'defunct%' AND port=-1 AND (strftime('%s','now') - start_time) > 600;") - ) - -(define (tasks:server-set-state! mdb server-id state) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;" state server-id)) - -(define (tasks:server-set-interface-port mdb server-id interface port) - (sqlite3:execute mdb "UPDATE servers SET interface=?,port=?,heartbeat=strftime('%s','now') WHERE id=?;" interface port server-id)) - -;; Get random port not used in long time -;; -(define (tasks:server-get-next-port mdb) - (let* ((lownum 30000) - (highnum 64000) - (used-ports '()) - (get-rand-port (lambda () - (+ lownum (random (- highnum lownum))))) - (port-param (if (and (args:get-arg "-port") - (string->number (args:get-arg "-port"))) - (string->number (args:get-arg "-port")) - #f)) - ;; (config-port (if (and (config-lookup *configdat* "server" "port") - ;; (string->number (config-lookup *configdat* "server" "port"))) - ;; (string->number (config-lookup *configdat* "server" "port")) - ;; #f)) - ) - (sqlite3:for-each-row - (lambda (port) - (set! used-ports (cons port used-ports))) - mdb - "SELECT port FROM servers;") - (cond - ((and port-param res) (if (> res port-param) res port-param)) - (port-param port-param) - ;; ((and config-port res) (if (> res config-port) res config-port)) - ;; (config-port config-port) - (else - (let loop ((port (get-rand-port)) - (remtries 100)) - (if (member port used-ports) - (if (> remtries 0) - (loop (get-rand-port)(- remtries 1)) - (get-rand-port)) - port)))))) - -(define (tasks:server-am-i-the-server? mdb run-id) - (let* ((all (tasks:server-get-servers-vying-for-run-id mdb run-id)) - (first (if (null? all) - #f;; (begin (debug:print-error 0 *default-log-port* "no servers listed, should be at least one by now.") - ;; (sqlite3:finalize! mdb) - ;; (exit 1)) - (car (db:get-rows all))))) - (if first - (let* ((header (db:get-header all)) - (id (db:get-value-by-header first header "id")) - (hostname (db:get-value-by-header first header "hostname")) - (pid (db:get-value-by-header first header "pid")) - (priority (db:get-value-by-header first header "priority"))) - ;; (debug:print 0 *default-log-port* "INFO: am-i-the-server got record " first) - ;; for now a basic check. add tiebreaking by priority later - (if (and (equal? hostname (get-host-name)) - (equal? pid (current-process-id))) - id - #f)) - #f))) - -;; Use: (db:get-value-by-header (car (db:get-rows dat)) (db:get-header dat) "fieldname") -;; to extract info from the structure returned -;; -(define (tasks:server-get-servers-vying-for-run-id mdb run-id) - (let* ((header (list "id" "hostname" "pid" "interface" "port" "pubport" "state" "run_id" "priority" "start_time")) - (selstr (string-intersperse header ",")) - (res '())) - (sqlite3:for-each-row - (lambda (a . b) - (set! res (cons (apply vector a b) res))) - mdb - (conc "SELECT " selstr " FROM servers WHERE state in ('available','running','dbprep') ORDER BY start_time DESC;") - ) - (vector header res))) - -(define (tasks:get-server mdb run-id #!key (retries 10)) - (let ((res #f) - (best #f)) - (handle-exceptions - exn - (begin - (print-call-chain (current-error-port)) - (debug:print 0 *default-log-port* "WARNING: tasks:get-server db access error.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 *default-log-port* " for run " run-id) - (print-call-chain (current-error-port)) - (if (> retries 0) - (begin - (debug:print 0 *default-log-port* " trying call to tasks:get-server again in 10 seconds") - (thread-sleep! 10) - (tasks:get-server mdb run-id retries: (- retries 0))) - (debug:print 0 *default-log-port* "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\""))) - (sqlite3:for-each-row - (lambda (id interface port pubport transport pid hostname) - (set! res (vector id interface port pubport transport pid hostname))) - mdb - ;; removed: - ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ? - "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers - WHERE run_id=? AND state='running' - ORDER BY start_time DESC LIMIT 1;" run-id) ;; (common:version-signature) run-id) - res))) - -(define (tasks:server-running-or-starting? mdb run-id) - (let ((res #f)) - (sqlite3:for-each-row - (lambda (id) - (set! res id)) - mdb ;; NEEDS dbprep ADDED - "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) - res)) - -(define (tasks:server-running? mdb run-id) - (let ((res #f)) - (sqlite3:for-each-row - (lambda (id) - (set! res id)) - mdb ;; NEEDS dbprep ADDED - "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) - res)) - (define (tasks:need-server run-id) (equal? (configf:lookup *configdat* "server" "required") "yes")) -;; (maxqry (cdr (rmt:get-max-query-average run-id))) -;; (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) -;; (cond -;; (forced -;; (if (common:low-noise-print 60 run-id "server required is set") -;; (debug:print-info 0 *default-log-port* "Server required is set, starting server for run-id " run-id ".")) -;; #t) -;; ((> maxqry threshold) -;; (if (common:low-noise-print 60 run-id "Max query time execeeded") -;; (debug:print-info 0 *default-log-port* "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id ".")) -;; #t) -;; (else -;; #f)))) - -;; try to start a server and wait for it to be available -;; -(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) - ;; ensure a server is running for this run - (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)) - (delay-time 0)) - (if (and (not server-dat) - (< delay-time delay-max-tries)) - (begin - (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) - (debug:print 0 *default-log-port* "Try starting server for run-id " run-id)) - (thread-sleep! (/ (random 2000) 1000)) - (server:kind-run *toppath*) - (thread-sleep! (min delay-time 1)) - (if (not (or (server:start-attempted? *toppath*) - (server:read-dotserver *toppath*))) ;; no point in trying - (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)) - #f)) - #f))) - -(define (tasks:get-all-servers mdb) - (let ((res '())) - (sqlite3:for-each-row - (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) - ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 - (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res))) - mdb - "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id - FROM servers WHERE state NOT LIKE 'defunct%' ORDER BY start_time DESC;") - res)) - -(define (tasks:get-server-by-id mdb id) - (let ((res #f)) - (sqlite3:for-each-row - (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) - ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 - (set! res (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id))) - mdb - "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id - FROM servers WHERE id=?;" - id) - res)) - -(define (tasks:get-server-records mdb run-id) - (let ((res '())) - (sqlite3:for-each-row - (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) - ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 - (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res))) - mdb - "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id - FROM servers WHERE run_id=? AND state NOT LIKE 'defunct%' ORDER BY start_time DESC;" - run-id) - (reverse res))) - ;; no elegance here ... ;; (define (tasks:kill-server hostname pid #!key (kill-switch "")) - (server:remove-dotserver-file *toppath* ".*") (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname) (setenv "TARGETHOST" hostname) (setenv "TARGETHOST_LOGF" "server-kills.log") (system (conc "nbfake kill "kill-switch" "pid)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) -;; look up a server by run-id and send it a kill, also delete the record for that server -;; -(define (tasks:kill-server-run-id run-id #!key (tag "default")) - (let* ((tdbdat (tasks:open-db)) - (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) - (if sdat - (let ((hostname (vector-ref sdat 6)) - (pid (vector-ref sdat 5)) - (server-id (vector-ref sdat 0))) - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") - (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) - (server:remove-dotserver-file *toppath* ".*") - (tasks:kill-server hostname pid) - (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) - (debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill")) - ;; (sqlite3:finalize! tdb) - )) - ;;====================================================================== ;; M O N I T O R S ;;====================================================================== (define (tasks:remove-monitor-record mdb) ADDED utils/lock-stats.sh Index: utils/lock-stats.sh ================================================================== --- /dev/null +++ utils/lock-stats.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +while IFS=': ' read x x x x p x x i x; do + if ! [[ ${i}x == "x" ]];then + if ! $(echo $i|grep EOF >/dev/null);then + fname=$(find -L "/proc/$p/fd" -maxdepth 1 -inum "$i" -exec readlink {} \; -quit) + if $(echo $fname | grep megatest.db > /dev/null) || \ + $(echo $fname | egrep '.db/\d+.db' > /dev/null);then + echo $fname + fi + fi + fi +done < /proc/locks Index: utils/mk_wrapper ================================================================== --- utils/mk_wrapper +++ utils/mk_wrapper @@ -28,21 +28,21 @@ echo "#!/bin/bash" > $target if [[ $cmd =~ dboard ]]; then cat >> $target <<'EOF' -# disable if not running on homehost -if [[ -e .homehost ]]; then - homehostname=$( host `cat .homehost` | awk '{print $NF}' | sed 's/\.$//' ) - hostname=$( hostname -f ) - - if [[ ! ($homehostname == $hostname) ]]; then - echo "ERROR: this host ($hostname) is not the homehost ($homehostname) for this megatest run area. Cannot start dashboard." - echo " Please log into homehost before launching dashboard." - exit 1 - fi -fi +# # disable if not running on homehost +# if [[ -e .homehost ]]; then +# homehostname=$( host `cat .homehost` | awk '{print $NF}' | sed 's/\.$//' ) +# hostname=$( hostname -f ) +# +# if [[ ! ($homehostname == $hostname) ]]; then +# echo "ERROR: this host ($hostname) is not the homehost ($homehostname) for this megatest run area. Cannot start dashboard." +# echo " Please log into homehost before launching dashboard." +# exit 1 +# fi +# fi # check that $DISPLAY is set if [[ -z $DISPLAY ]]; then echo 'ERROR: $DISPLAY environment variable is not set; megatest dashboard requires X display address to be set in $DISPLAY.' exit 1