Megatest

Check-in [99ca17a0cc]
Login
Overview
Comment:Fixed issue with server record not reflecting actual server when have port collisions.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | development | v1.5415
Files: files | file ages | folders
SHA1: 99ca17a0cc61ca660eaa51f6305bcf67bdd8f6cf
User & Date: mrwellan on 2013-04-22 16:38:28
Other Links: branch diff | manifest | tags
Context
2013-04-23
08:06
Released version v1.5415 check-in: a97c05c022 user: mrwellan tags: trunk, v1.5415
2013-04-22
23:38
Trying fork instead of system for launching server, added better guesser for ip address to bind to (should bind to all?) check-in: 1b71a45029 user: matt tags: development
19:47
Merged dev into runcontrol check-in: df98c96bb1 user: matt tags: runcontrol
16:38
Fixed issue with server record not reflecting actual server when have port collisions. check-in: 99ca17a0cc user: mrwellan tags: development, v1.5415
2013-04-17
15:28
Converted test steps to matrix, added browsing of log check-in: d187e07c8b user: mrwellan tags: development, v1.5414
Changes

Modified db.scm from [a540e1036f] to [cc06b656a5].

1273
1274
1275
1276
1277
1278
1279
1280

1281
1282
1283
1284
1285
1286
1287
1273
1274
1275
1276
1277
1278
1279

1280
1281
1282
1283
1284
1285
1286
1287







-
+







			(list item-path ""))))
    (cdb:client-call serverdat 'register-test #t *default-numtries* run-id test-name item-path)))

(define (cdb:flush-queue serverdat)
  (cdb:client-call serverdat 'flush #f *default-numtries*))

(define (cdb:kill-server serverdat)
  (cdb:client-call serverdat 'killserver #f *default-numtries*))
  (cdb:client-call serverdat 'killserver #t *default-numtries*))

(define (cdb:roll-up-pass-fail-counts serverdat run-id test-name item-path status)
  (cdb:client-call serverdat 'immediate #f *default-numtries* open-run-close db:roll-up-pass-fail-counts #f run-id test-name item-path status))

(define (cdb:get-test-info serverdat run-id test-name item-path)
  (cdb:client-call serverdat 'immediate #f *default-numtries* open-run-close db:get-test-info #f run-id test-name item-path))

1331
1332
1333
1334
1335
1336
1337
1338


1339
1340
1341
1342
1343
1344
1345
1331
1332
1333
1334
1335
1336
1337

1338
1339
1340
1341
1342
1343
1344
1345
1346







-
+
+







(define db:special-queries   '(rollup-tests-pass-fail
			       db:roll-up-pass-fail-counts
                               login
                               immediate
			       flush
			       sync
			       set-verbosity
			       killserver))
			       killserver
			       ))

;; not used, intended to indicate to run in calling process
(define db:run-local-queries '()) ;; rollup-tests-pass-fail))

(define (db:process-cached-writes db)
  (let ((queries    (make-hash-table))
	(data       #f))
1466
1467
1468
1469
1470
1471
1472
1473

1474
1475
1476
1477
1478
1479
1480
1467
1468
1469
1470
1471
1472
1473

1474
1475
1476
1477
1478
1479
1480
1481







-
+







	  (debug:print-info 7 "Received " response " from wrapped write")
	  (server:reply return-address qry-sig response response))
	;; otherwise if appropriate flush the queue (this is a read or complex query)
	(begin
	  (cond
	   ((member stmt-key db:special-queries)
	    (let ((starttime (current-milliseconds)))
	      (debug:print-info 11 "Handling special statement " stmt-key)
	      (debug:print-info 9 "Handling special statement " stmt-key)
	      (case stmt-key
		((immediate)
		 ;; This is a read or mixed read-write query, must clear the cache
		 (case *transport-type*
		   ((http)
		    (mutex-lock! *db:process-queue-mutex*)
		    (db:process-cached-writes db)

Modified http-transport.scm from [161d181983] to [c1f69a9798].

119
120
121
122
123
124
125

126
127

128
129
130
131
132
133
134
135
136

137
138
139
140
141
142
143
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146







+


+









+







   (begin
     (print-error-message exn)
     (if (< portnum 9000)
	 (begin 
	   (print "WARNING: failed to start on portnum: " portnum ", trying next port")
	   (thread-sleep! 0.1)
	   ;; (open-run-close tasks:remove-server-records tasks:open-db)
	   (open-run-close tasks:server-delete tasks:open-db ipaddrstr portnum)
	   (http-transport:try-start-server ipaddrstr (+ portnum 1)))
	 (print "ERROR: Tried and tried but could not start the server")))
   ;; any error in following steps will result in a retry
   (set! *runremote* (list ipaddrstr portnum))
   ;; (open-run-close tasks:remove-server-records tasks:open-db)
   (open-run-close tasks:server-register 
		   tasks:open-db 
		   (current-process-id)
		   ipaddrstr portnum 0 'live 'http)
   (print "INFO: Trying to start server on " ipaddrstr ":" portnum)
   ;; This starts the spiffy server
   (start-server port: portnum)
   (open-run-close tasks:server-delete tasks:open-db ipaddrstr portnum)
   (print "INFO: server has been stopped")))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================

;;======================================================================
227
228
229
230
231
232
233












234
235
236
237
238
239
240
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255







+
+
+
+
+
+
+
+
+
+
+
+







      (thread-sleep! 4) ;; no need to do this very often
      ;; NB// sync currently does NOT return queue-length
      (let () ;; (queue-len (cdb:client-call server-info 'sync #t 1)))
      ;; (print "Server running, count is " count)
        (if (< count 1) ;; 3x3 = 9 secs aprox
            (loop (+ count 1)))
        
	;; Check that iface and port have not changed (can happen if server port collides)
	(mutex-lock! *heartbeat-mutex*)
	(set! sdat *runremote*)
	(mutex-unlock! *heartbeat-mutex*)

	(if (not (equal? sdat (list iface port)))
	    (begin 
	      (debug:print-info 1 "interface changed, refreshing iface and port info")
	      (set! iface (car sdat))
	      (set! port  (cadr sdat))
	      (set! spid  (tasks:server-get-server-id tdb #f iface port #f))))

        ;; NOTE: Get rid of this mechanism! It really is not needed...
        (tasks:server-update-heartbeat tdb spid)
      
        ;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access
        (mutex-lock! *heartbeat-mutex*)
        (set! last-access *last-db-access*)
        (mutex-unlock! *heartbeat-mutex*)

Modified megatest-version.scm from [139eab9acd] to [24437845eb].

1
2
3
4
5
6

7
1
2
3
4
5

6
7





-
+

;; Always use two digit decimal
;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.5414)
(define megatest-version 1.5415)

Modified megatest.scm from [f7af8ee8db] to [2419ab740a].

31
32
33
34
35
36
37
38

39
40
41
42
43





44


45
46
47


48
49
50


51
52

53
54
55
56


57
58
59
60
61
62
63
31
32
33
34
35
36
37

38




39
40
41
42
43
44
45
46
47
48


49
50



51
52
53

54




55
56
57
58
59
60
61
62
63







-
+
-
-
-
-

+
+
+
+
+

+
+

-
-
+
+
-
-
-
+
+

-
+
-
-
-
-
+
+







(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "megatest-fossil-hash.scm")

;; (use trace dot-locking)
;; (trace
;;   thread-sleep!
;;  cdb:client-call
;;  sqlite3:execute
;;  sqlite3:for-each-row
;;  open-run-close
;;  runs:can-run-more-tests
;;  cdb:remote-run
;;  cdb:test-set-status-state
;;  change-directory
;;  db:process-queue-item
;;  db:test-get-logfile-info
;;  db:teststep-set-status!
;;  nice-path
;;  obtain-dot-lock
;;  open-run-close
;;  read-config
;; db:teststep-set-status!
;; tests:test-set-status!
;;  runs:can-run-more-tests
;;  sqlite3:execute
;; cdb:test-set-status-state
;; cdb:client-call
;; tests:check-waiver-eligibility
;;  sqlite3:for-each-row
;;  tests:check-waiver-eligibility
;;  tests:summarize-items
;;  db:test-get-logfile-info
;;  tests:test-set-status!
;;  obtain-dot-lock
;;  change-directory
;;  cdb:remote-run
;; )
;;  thread-sleep!
;;)
       

(define help (conc "
Megatest, documentation at http://www.kiatoa.com/fossils/megatest
  version " megatest-version "
  license GPL, Copyright Matt Welland 2006-2012

128
129
130
131
132
133
134

135
136
137
138
139
140
141
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142







+







  -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are
                                 overwritten by values set in config files.
  -server -|hostname      : start the server (reduces contention on megatest.db), use
                            - to automatically figure out hostname
  -transport http|zmq     : use http or zmq for transport (default is http) 
  -daemonize              : fork into background and disconnect from stdin/out
  -list-servers           : list the servers 
  -stop-server id         : stop server specified by id (see output of -list-servers)
  -repl                   : start a repl (useful for extending megatest)
  -load file.scm          : load and run file.scm

Spreadsheet generation
  -extract-ods fname.ods  : extract an open document spreadsheet from the database
  -pathmod path           : insert path, i.e. path/runame/itempath/logfile.html
                            will clear the field if no rundir/testname/itempath/logfile
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
152
153
154
155
156
157
158

159
160
161
162
163
164
165







-







megatest -test-files 'logs/*.log' -target ubuntu/n%/no% :runname w49% -testpatt test_mt%

Called as " (string-intersperse (argv) " ") "
Version " megatest-version ", built from " megatest-fossil-hash ))

;;  -gui                    : start a gui interface
;;  -config fname           : override the runconfig file with fname
;;  -kill-server host:port|pid : kill server specified by host:port or pid

;; process args
(define remargs (args:get-args 
		 (argv)
		 (list  "-runtests"  ;; run a specific test
			"-config"    ;; override the config file name
			"-execute"   ;; run the command encoded in the base64 parameter
191
192
193
194
195
196
197
198

199
200
201
202
203
204
205
191
192
193
194
195
196
197

198
199
200
201
202
203
204
205







-
+







			":value"
			":expected"
			":tol"
			":units"
			;; misc
			"-server"
			"-transport"
			"-kill-server"
			"-stop-server"
			"-port"
			"-extract-ods"
			"-pathmod"
			"-env2file"
			"-setvars"
			"-set-state-status"
			"-debug" ;; for *verbosity* > 2
323
324
325
326
327
328
329
330
331


332
333
334
335
336






337
338
339
340
341
342
343
344

345
346
347
348
349
350
351
323
324
325
326
327
328
329


330
331
332
333



334
335
336
337
338
339
340
341
342
343




344
345
346
347
348
349
350
351







-
-
+
+


-
-
-
+
+
+
+
+
+




-
-
-
-
+







		    ;; (server:launch (string->symbol (args:get-arg "-transport" "http"))))
		    (system (conc (car (argv)) " -server - -daemonize -transport " (args:get-arg "-transport" "http")))
		    (thread-sleep! 3)) ;; give the server a few seconds to start
		  (debug:print 0 "INFO: Servers already running " servers)
		  )))))
	

(if (args:get-arg "-list-servers")
	;; (args:get-arg "-kill-server"))
(if (or (args:get-arg "-list-servers")
	(args:get-arg "-stop-server"))
    (let ((tl (setup-for-run)))
      (if tl 
	  (let ((servers (open-run-close tasks:get-all-servers tasks:open-db))
		(fmtstr  "~5a~8a~8a~20a~20a~10a~10a~10a~10a~10a\n")
		(servers-to-kill '()))
	  (let* ((servers (open-run-close tasks:get-all-servers tasks:open-db))
		 (fmtstr  "~5a~8a~8a~20a~20a~10a~10a~10a~10a~10a\n")
		 (servers-to-kill '())
		 (killinfo   (args:get-arg "-stop-server"))
		 (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
		 (sid        (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)))
	    (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface" "OutPort" "InPort" "LastBeat" "State" "Transport")
	    (format #t fmtstr "==" "=====" "===" "====" "=========" "=======" "======" "========" "=====" "=========")
	    (for-each 
	     (lambda (server)
	       (let* (;; (killinfo   (args:get-arg "-kill-server"))
		      ;; (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
		      ;; (kpid       (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f))
		      (id         (vector-ref server 0))
	       (let* ((id         (vector-ref server 0))
		      (pid        (vector-ref server 1))
		      (hostname   (vector-ref server 2))
		      (interface  (vector-ref server 3))
		      (pullport   (vector-ref server 4))
		      (pubport    (vector-ref server 5))
		      (start-time (vector-ref server 6))
		      (priority   (vector-ref server 7))
359
360
361
362
363
364
365
366
367
368





369
370
371
372
373
374
375
359
360
361
362
363
364
365

366

367
368
369
370
371
372
373
374
375
376
377
378







-

-
+
+
+
+
+







		 ;; no need to login as status of #t indicates we are connecting to correct 
		 ;; server
		 (if (equal? state "dead")
		     (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day.
			 (open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid action: 'delete))
		     (if (> last-update 20)        ;; Mark as dead if not updated in last 20 seconds
			 (open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid)))

		 (format #t fmtstr id mt-ver pid hostname interface pullport pubport last-update
			 (if status "alive" "dead") transport)))
			 (if status "alive" "dead") transport)
		 (if (equal? id sid)
		     (begin
		       (debug:print-info 0 "Attempting to stop server with pid " pid)
		       (tasks:kill-server status hostname pullport pid transport)))))
	     servers)
	    (debug:print-info 1 "Done with listservers")
	    (set! *didsomething* #t)
	    (exit) ;; must do, would have to add checks to many/all calls below
	    )
	  (exit)))
    ;; if not list or kill then start a client (if appropriate)

Modified tasks.scm from [16cec1c8c7] to [e04939ec8c].

21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
21
22
23
24
25
26
27








28
29
30
31
32
33
34







-
-
-
-
-
-
-
-







;;======================================================================
;; Tasks db
;;======================================================================

(define (tasks:open-db)
  (let* ((dbpath  (conc *toppath* "/monitor.db"))
	 (exists  (file-exists? dbpath))
	 ;; 	      ;; BUGGISHNESS: Remove this code in six months. Today is 11/13/2012
	 ;;              (if (< (file-change-time dbpath) 1352851396.0)
	 ;;        	  (begin
	 ;;        	    (debug:print 0 "NOTE: removing old db file " dbpath)
	 ;;        	    (delete-file dbpath)
	 ;;        	    #f)
	 ;;        	  #t)
	 ;;              #f))
	 (mdb     (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath))
	 (handler (make-busy-timeout 36000)))
    (sqlite3:set-busy-handler! mdb handler)
    (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;"))
    (if (not exists)
	(begin
	  (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY,
123
124
125
126
127
128
129




130
131
132
133
134
135
136
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132







+
+
+
+







	  (case action
	    ((delete)(sqlite3:execute mdb "DELETE FROM servers WHERE  hostname=? AND port=?;" hostname port))
	    (else    (sqlite3:execute mdb "UPDATE servers SET state='dead' WHERE hostname=? AND port=?;" hostname port)))
	  (debug:print 0 "ERROR: tasks:server-deregister called with neither pid nor port specified"))))

(define (tasks:server-deregister-self mdb hostname)
  (tasks:server-deregister mdb hostname pid: (current-process-id)))

;; need a simple call for robustly removing records given host and port
(define (tasks:server-delete mdb hostname port)
  (tasks:server-deregister mdb hostname port: port action: 'delete))

(define (tasks:server-get-server-id mdb hostname iface port pid)
  (debug:print-info 12 "tasks:server-get-server-id " mdb " " hostname " " iface " " port " " pid)
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
227
228
229
230
231
232
233
234

235
236
237
238
239
240

241
242
243
244
245
246
247
248
249
250
251
252
253
254

255

256

257
258






259
260
261
262
263
264
265
223
224
225
226
227
228
229

230
231
232
233
234
235

236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251

252
253
254


255
256
257
258
259
260
261
262
263
264
265
266
267







-
+





-
+














+
-
+

+
-
-
+
+
+
+
+
+







;; 		  (if (null? tal)
;; 		      #f
;; 		      (loop (car tal)(cdr tal))))))))))

(define (tasks:remove-server-records mdb)
  (sqlite3:execute mdb "DELETE FROM servers;"))

(define (tasks:mark-server hostname port pid state)
(define (tasks:mark-server hostname port pid state transport)
  (if port
      (open-run-close tasks:server-deregister tasks:open-db hostname port: port)
      (open-run-close tasks:server-deregister tasks:open-db hostname pid:  pid)))


(define (tasks:kill-server status hostname port pid)
(define (tasks:kill-server status hostname port pid transport)
  (debug:print-info 1 "Removing defunct server record for " hostname ":" port)
  (if port
      (open-run-close tasks:server-deregister tasks:open-db hostname port: port)
      (open-run-close tasks:server-deregister tasks:open-db hostname pid:  pid))
  (if status ;; #t means alive
      (begin
	(if (equal? hostname (get-host-name))
	    (handle-exceptions
	     exn
	     (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n"
			       "  EXCEPTION: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 1 "Sending signal/term to " pid " on " hostname)
	     (process-signal pid signal/term)
	     (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill
	     ;;(process-signal pid signal/kill)
	     (process-signal pid signal/kill)) ;; local machine, send sig term
	     ) ;; local machine, send sig term
	    (begin
		(debug:print-info 1 "Stopping remote servers not yet supported."))))
	      (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide")
	      (cdb:kill-server zmq-socket))))    ;; remote machine, try telling server to commit suicide
	;;      (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide")
	;;      (let ((serverdat (list hostname port)))
	;;	(case (string->symbol transport)
	;;	  ((http)(http-transport:client-connect hostname port))
	;;	  (else  (debug:print "ERROR: remote stopping servers of type " transport " not supported yet")))
	;;	(cdb:kill-server serverdat)))))    ;; remote machine, try telling server to commit suicide
      (begin
	(if status 
	    (if (equal? hostname (get-host-name))
		(begin
		  (debug:print-info 1 "Sending signal/term to " pid " on " hostname)
		  (process-signal pid signal/term)  ;; local machine, send sig term
		  (thread-sleep! 5)                 ;; give it five seconds to die peacefully then do a brutal kill

Modified tests/fullrun/megatest.config from [c5fdf7800f] to [d25787fc32].

55
56
57
58
59
60
61
62

63
64
65
66
67
68
69
55
56
57
58
59
60
61

62
63
64
65
66
67
68
69







-
+







# XTERM   [system xterm]
# RUNDEAD [system exit 56]

[server]

# If the server can't be started on this port it will try the next port until
# it succeeds
port 8099
port 8080

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
timeout 0.05

## disks are:
## name host:/path/to/area