Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1275,11 +1275,11 @@ (define (cdb:flush-queue serverdat) (cdb:client-call serverdat 'flush #f *default-numtries*)) (define (cdb:kill-server serverdat) - (cdb:client-call serverdat 'killserver #f *default-numtries*)) + (cdb:client-call serverdat 'killserver #t *default-numtries*)) (define (cdb:roll-up-pass-fail-counts serverdat run-id test-name item-path status) (cdb:client-call serverdat 'immediate #f *default-numtries* open-run-close db:roll-up-pass-fail-counts #f run-id test-name item-path status)) (define (cdb:get-test-info serverdat run-id test-name item-path) @@ -1333,11 +1333,12 @@ login immediate flush sync set-verbosity - killserver)) + killserver + )) ;; not used, intended to indicate to run in calling process (define db:run-local-queries '()) ;; rollup-tests-pass-fail)) (define (db:process-cached-writes db) @@ -1468,11 +1469,11 @@ ;; otherwise if appropriate flush the queue (this is a read or complex query) (begin (cond ((member stmt-key db:special-queries) (let ((starttime (current-milliseconds))) - (debug:print-info 11 "Handling special statement " stmt-key) + (debug:print-info 9 "Handling special statement " stmt-key) (case stmt-key ((immediate) ;; This is a read or mixed read-write query, must clear the cache (case *transport-type* ((http) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -121,21 +121,24 @@ (if (< portnum 9000) (begin (print "WARNING: failed to start on portnum: " portnum ", trying next port") (thread-sleep! 0.1) ;; (open-run-close tasks:remove-server-records tasks:open-db) + (open-run-close tasks:server-delete tasks:open-db ipaddrstr portnum) (http-transport:try-start-server ipaddrstr (+ portnum 1))) (print "ERROR: Tried and tried but could not start the server"))) + ;; any error in following steps will result in a retry (set! *runremote* (list ipaddrstr portnum)) ;; (open-run-close tasks:remove-server-records tasks:open-db) (open-run-close tasks:server-register tasks:open-db (current-process-id) ipaddrstr portnum 0 'live 'http) (print "INFO: Trying to start server on " ipaddrstr ":" portnum) ;; This starts the spiffy server (start-server port: portnum) + (open-run-close tasks:server-delete tasks:open-db ipaddrstr portnum) (print "INFO: server has been stopped"))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== @@ -229,10 +232,22 @@ (let () ;; (queue-len (cdb:client-call server-info 'sync #t 1))) ;; (print "Server running, count is " count) (if (< count 1) ;; 3x3 = 9 secs aprox (loop (+ count 1))) + ;; Check that iface and port have not changed (can happen if server port collides) + (mutex-lock! *heartbeat-mutex*) + (set! sdat *runremote*) + (mutex-unlock! *heartbeat-mutex*) + + (if (not (equal? sdat (list iface port))) + (begin + (debug:print-info 1 "interface changed, refreshing iface and port info") + (set! iface (car sdat)) + (set! port (cadr sdat)) + (set! spid (tasks:server-get-server-id tdb #f iface port #f)))) + ;; NOTE: Get rid of this mechanism! It really is not needed... (tasks:server-update-heartbeat tdb spid) ;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access (mutex-lock! *heartbeat-mutex*) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.5414) +(define megatest-version 1.5415) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -33,29 +33,29 @@ (include "db_records.scm") (include "megatest-fossil-hash.scm") ;; (use trace dot-locking) ;; (trace -;; thread-sleep! +;; cdb:client-call +;; cdb:remote-run +;; cdb:test-set-status-state +;; change-directory +;; db:process-queue-item +;; db:test-get-logfile-info +;; db:teststep-set-status! +;; nice-path +;; obtain-dot-lock +;; open-run-close +;; read-config +;; runs:can-run-more-tests ;; sqlite3:execute ;; sqlite3:for-each-row -;; open-run-close -;; runs:can-run-more-tests -;; cdb:remote-run -;; nice-path -;; read-config -;; db:teststep-set-status! -;; tests:test-set-status! -;; cdb:test-set-status-state -;; cdb:client-call -;; tests:check-waiver-eligibility +;; tests:check-waiver-eligibility ;; tests:summarize-items -;; db:test-get-logfile-info -;; obtain-dot-lock -;; change-directory -;; cdb:remote-run -;; ) +;; tests:test-set-status! +;; thread-sleep! +;;) (define help (conc " Megatest, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version " @@ -130,10 +130,11 @@ -server -|hostname : start the server (reduces contention on megatest.db), use - to automatically figure out hostname -transport http|zmq : use http or zmq for transport (default is http) -daemonize : fork into background and disconnect from stdin/out -list-servers : list the servers + -stop-server id : stop server specified by id (see output of -list-servers) -repl : start a repl (useful for extending megatest) -load file.scm : load and run file.scm Spreadsheet generation -extract-ods fname.ods : extract an open document spreadsheet from the database @@ -153,11 +154,10 @@ Called as " (string-intersperse (argv) " ") " Version " megatest-version ", built from " megatest-fossil-hash )) ;; -gui : start a gui interface ;; -config fname : override the runconfig file with fname -;; -kill-server host:port|pid : kill server specified by host:port or pid ;; process args (define remargs (args:get-args (argv) (list "-runtests" ;; run a specific test @@ -193,11 +193,11 @@ ":tol" ":units" ;; misc "-server" "-transport" - "-kill-server" + "-stop-server" "-port" "-extract-ods" "-pathmod" "-env2file" "-setvars" @@ -325,25 +325,25 @@ (thread-sleep! 3)) ;; give the server a few seconds to start (debug:print 0 "INFO: Servers already running " servers) ))))) -(if (args:get-arg "-list-servers") - ;; (args:get-arg "-kill-server")) +(if (or (args:get-arg "-list-servers") + (args:get-arg "-stop-server")) (let ((tl (setup-for-run))) (if tl - (let ((servers (open-run-close tasks:get-all-servers tasks:open-db)) - (fmtstr "~5a~8a~8a~20a~20a~10a~10a~10a~10a~10a\n") - (servers-to-kill '())) + (let* ((servers (open-run-close tasks:get-all-servers tasks:open-db)) + (fmtstr "~5a~8a~8a~20a~20a~10a~10a~10a~10a~10a\n") + (servers-to-kill '()) + (killinfo (args:get-arg "-stop-server")) + (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) + (sid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f))) (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface" "OutPort" "InPort" "LastBeat" "State" "Transport") (format #t fmtstr "==" "=====" "===" "====" "=========" "=======" "======" "========" "=====" "=========") (for-each (lambda (server) - (let* (;; (killinfo (args:get-arg "-kill-server")) - ;; (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) - ;; (kpid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)) - (id (vector-ref server 0)) + (let* ((id (vector-ref server 0)) (pid (vector-ref server 1)) (hostname (vector-ref server 2)) (interface (vector-ref server 3)) (pullport (vector-ref server 4)) (pubport (vector-ref server 5)) @@ -361,13 +361,16 @@ (if (equal? state "dead") (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. (open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid action: 'delete)) (if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds (open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid))) - (format #t fmtstr id mt-ver pid hostname interface pullport pubport last-update - (if status "alive" "dead") transport))) + (if status "alive" "dead") transport) + (if (equal? id sid) + (begin + (debug:print-info 0 "Attempting to stop server with pid " pid) + (tasks:kill-server status hostname pullport pid transport))))) servers) (debug:print-info 1 "Done with listservers") (set! *didsomething* #t) (exit) ;; must do, would have to add checks to many/all calls below ) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -23,18 +23,10 @@ ;;====================================================================== (define (tasks:open-db) (let* ((dbpath (conc *toppath* "/monitor.db")) (exists (file-exists? dbpath)) - ;; ;; BUGGISHNESS: Remove this code in six months. Today is 11/13/2012 - ;; (if (< (file-change-time dbpath) 1352851396.0) - ;; (begin - ;; (debug:print 0 "NOTE: removing old db file " dbpath) - ;; (delete-file dbpath) - ;; #f) - ;; #t) - ;; #f)) (mdb (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath)) (handler (make-busy-timeout 36000))) (sqlite3:set-busy-handler! mdb handler) (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;")) (if (not exists) @@ -125,10 +117,14 @@ (else (sqlite3:execute mdb "UPDATE servers SET state='dead' WHERE hostname=? AND port=?;" hostname port))) (debug:print 0 "ERROR: tasks:server-deregister called with neither pid nor port specified")))) (define (tasks:server-deregister-self mdb hostname) (tasks:server-deregister mdb hostname pid: (current-process-id))) + +;; need a simple call for robustly removing records given host and port +(define (tasks:server-delete mdb hostname port) + (tasks:server-deregister mdb hostname port: port action: 'delete)) (define (tasks:server-get-server-id mdb hostname iface port pid) (debug:print-info 12 "tasks:server-get-server-id " mdb " " hostname " " iface " " port " " pid) (let ((res #f)) (sqlite3:for-each-row @@ -229,17 +225,17 @@ ;; (loop (car tal)(cdr tal)))))))))) (define (tasks:remove-server-records mdb) (sqlite3:execute mdb "DELETE FROM servers;")) -(define (tasks:mark-server hostname port pid state) +(define (tasks:mark-server hostname port pid state transport) (if port (open-run-close tasks:server-deregister tasks:open-db hostname port: port) (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid))) -(define (tasks:kill-server status hostname port pid) +(define (tasks:kill-server status hostname port pid transport) (debug:print-info 1 "Removing defunct server record for " hostname ":" port) (if port (open-run-close tasks:server-deregister tasks:open-db hostname port: port) (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid)) (if status ;; #t means alive @@ -250,14 +246,20 @@ (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n" " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 1 "Sending signal/term to " pid " on " hostname) (process-signal pid signal/term) (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill - (process-signal pid signal/kill)) ;; local machine, send sig term + ;;(process-signal pid signal/kill) + ) ;; local machine, send sig term (begin - (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide") - (cdb:kill-server zmq-socket)))) ;; remote machine, try telling server to commit suicide + (debug:print-info 1 "Stopping remote servers not yet supported.")))) + ;; (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide") + ;; (let ((serverdat (list hostname port))) + ;; (case (string->symbol transport) + ;; ((http)(http-transport:client-connect hostname port)) + ;; (else (debug:print "ERROR: remote stopping servers of type " transport " not supported yet"))) + ;; (cdb:kill-server serverdat))))) ;; remote machine, try telling server to commit suicide (begin (if status (if (equal? hostname (get-host-name)) (begin (debug:print-info 1 "Sending signal/term to " pid " on " hostname) Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -57,11 +57,11 @@ [server] # If the server can't be started on this port it will try the next port until # it succeeds -port 8099 +port 8080 # This server will keep running this number of hours after last access. # Three minutes is 0.05 hours timeout 0.05