Megatest

Check-in [bbd7cb0f5a]
Login
Overview
Comment:Random port assignment, minimize re-use of ports
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: bbd7cb0f5a1d5d9963c1b00c2ea566370747e4b9
User & Date: matt on 2014-02-24 19:44:31
Other Links: branch diff | manifest | tags
Context
2014-02-24
22:15
Added debugging tags to server state changes. Cleaned up dashboard to display new server data. check-in: 904e5f7d6c user: matt tags: v1.60
19:44
Random port assignment, minimize re-use of ports check-in: bbd7cb0f5a user: matt tags: v1.60
16:46
Trying to get reliable server starts in the face of ports, hosts, kills etc. What fun check-in: cb995a99df user: mrwellan tags: v1.60
Changes

Modified docs/manual/howto.txt from [d3633d3bbf] to [b28c4b0da6].

55
56
57
58
59
60
61








The runscript method is a brute force way to run scripts where the
user is responsible for setting STATE and STATUS

-------------------
runscript main.csh
-------------------
















>
>
>
>
>
>
>
>
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
The runscript method is a brute force way to run scripts where the
user is responsible for setting STATE and STATUS

-------------------
runscript main.csh
-------------------

Debugging Server Problems
~~~~~~~~~~~~~~~~~~~~~~~~~

----------------
sudo lsof -i
sudo netstat -lptu
sudo netstat -tulpn
----------------

Modified http-transport.scm from [78b88f80d9] to [f8416b573d].

389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416

417
418
419
420
421
422
423
	    (debug:print-info 0 "Average non-cached time   "
			      (if (eq? *number-non-write-queries* 0)
				  "n/a (no queries)"
				  (/ *total-non-write-delay* 
				     *number-non-write-queries*))
			      " ms")
	    (debug:print-info 0 "Server shutdown complete. Exiting")
	    (tasks:server-delete-record! tdb server-id)
	    (exit))))))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (if (server:check-if-running run-id)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)))
    (if (not server-id)
	(begin
	  ;; since we didn't get the server lock we are going to clean up and bail out
	  (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db))

	(let* ((th2 (make-thread (lambda ()
				   (http-transport:run 
				    (if (args:get-arg "-server")
					(args:get-arg "-server")
					"-")
				    run-id
				    server-id)) "Server run"))







|



















|
>







389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
	    (debug:print-info 0 "Average non-cached time   "
			      (if (eq? *number-non-write-queries* 0)
				  "n/a (no queries)"
				  (/ *total-non-write-delay* 
				     *number-non-write-queries*))
			      " ms")
	    (debug:print-info 0 "Server shutdown complete. Exiting")
	    (tasks:server-delete-record tdb server-id)
	    (exit))))))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (if (server:check-if-running run-id)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)))
    (if (not server-id)
	(begin
	  ;; since we didn't get the server lock we are going to clean up and bail out
	  (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db)
	  )
	(let* ((th2 (make-thread (lambda ()
				   (http-transport:run 
				    (if (args:get-arg "-server")
					(args:get-arg "-server")
					"-")
				    run-id
				    server-id)) "Server run"))

Modified tasks.scm from [0a12e30788] to [8d2eda9003].

126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141








142
143
144
145
146
147
148
149
150
151
152
153


154

155



156
157
158
159
160
161
162
163

164
165
166
167
168
169
170
171
172
173




174


175
176
177
178
179
180
181
182
       (set! res num-in-queue))
     mdb
     "SELECT count(id) FROM servers WHERE run_id=? AND state = 'available';"
     run-id)
    res))

(define (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 30 AND run_id=?;" run-id))

(define (tasks:server-force-clean-running-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state = 'running' AND run_id=?;" run-id))

(define (tasks:server-force-clean-run-record mdb run-id iface port)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"
		   run-id iface port))









(define (tasks:server-set-state! mdb server-id state)
  (sqlite3:execute mdb "UPDATE servers SET state=? WHERE id=?;" state server-id))

(define (tasks:server-delete-record! mdb server-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE id=?;" server-id))

(define (tasks:server-delete-records-for-this-pid mdb)
  (sqlite3:execute mdb "DELETE FROM servers WHERE hostname=? AND pid=?;" (get-host-name) (current-process-id)))

(define (tasks:server-set-interface-port mdb server-id interface port)
  (sqlite3:execute mdb "UPDATE servers SET interface=?,port=? WHERE id=?;" interface port server-id))



(define (tasks:server-get-next-port mdb)

  (let ((res         #f)



	(port-param  (if (and (args:get-arg "-port")
			      (string->number (args:get-arg "-port")))
			 (string->number (args:get-arg "-port"))
			 #f))
	(config-port (if (and (config-lookup  *configdat* "server" "port")
			      (string->number (config-lookup  *configdat* "server" "port")))
			 (string->number (config-lookup  *configdat* "server" "port"))
			 #f)))

    (sqlite3:for-each-row
     (lambda (port)
       (set! res (+ port 1))) ;; set to next
     mdb
     "SELECT max(port) FROM servers;")
    (cond
     ((and port-param res)   (if (> res port-param) res port-param))
     (port-param             port-param)
     ((and config-port res)  (if (> res config-port) res config-port))
     (config-port            config-port)




     ((and res (> res 8080)) res)


     (else                   (+ 5000 (random 1001))))))

(define (tasks:server-am-i-the-server? mdb run-id)
  (let* ((all    (tasks:server-get-servers-vying-for-run-id mdb run-id))
	 (first  (if (null? all)
		     (begin (debug:print 0 "ERROR: no servers listed, should be at least one by now.") 
			    (sqlite3:finalize! mdb)
			    (exit 1))







|


|


|


>
>
>
>
>
>
>
>



<
<
<
<
<
<



>
>

>
|
>
>
>
|
|
|
|
|
|
|
|
>


|

|



|
|
>
>
>
>
|
>
>
|







126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152






153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
       (set! res num-in-queue))
     mdb
     "SELECT count(id) FROM servers WHERE run_id=? AND state = 'available';"
     run-id)
    res))

(define (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 300 AND run_id=?;" run-id))

(define (tasks:server-force-clean-running-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE state = 'running' AND run_id=?;" run-id))

(define (tasks:server-force-clean-run-record mdb run-id iface port)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"
		   run-id iface port))

(define (tasks:server-delete-records-for-this-pid mdb)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE hostname=? AND pid=?;" (get-host-name) (current-process-id)))

(define (tasks:server-delete-record mdb server-id)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE id=?;" server-id)
  ;; use this opportuntity to clean out records over one month old
  (sqlite3:execute mdb "DELETE FROM servers WHERE state not in ('running','shutting-down') AND (strftime('%s','now') - start_time) > 2628000;"))

(define (tasks:server-set-state! mdb server-id state)
  (sqlite3:execute mdb "UPDATE servers SET state=? WHERE id=?;" state server-id))







(define (tasks:server-set-interface-port mdb server-id interface port)
  (sqlite3:execute mdb "UPDATE servers SET interface=?,port=? WHERE id=?;" interface port server-id))

;; Get random port not used in long time
;;
(define (tasks:server-get-next-port mdb)
  (let* ((lownum        30000)
	(highnum        64000)
	(used-ports     '())
	(get-rand-port  (lambda ()
			  (+ lownum (random (- highnum lownum)))))
	(port-param     (if (and (args:get-arg "-port")
				 (string->number (args:get-arg "-port")))
			    (string->number (args:get-arg "-port"))
			    #f))
	;; (config-port    (if (and (config-lookup  *configdat* "server" "port")
	;; 			 (string->number (config-lookup  *configdat* "server" "port")))
	;; 		    (string->number (config-lookup  *configdat* "server" "port"))
	;; 		    #f))
	)
    (sqlite3:for-each-row
     (lambda (port)
       (set! used-ports (cons port used-ports)))
     mdb
     "SELECT port FROM servers;")
    (cond
     ((and port-param res)   (if (> res port-param) res port-param))
     (port-param             port-param)
     ;; ((and config-port res)  (if (> res config-port) res config-port))
     ;; (config-port            config-port)
     (else
      (let loop ((port     (get-rand-port))
		 (remtries 100))
	(if (member port used-ports)
	    (if (> remtries 0)
		(loop (get-rand-port)(- remtries 1))
		(get-rand-port))
	    port))))))

(define (tasks:server-am-i-the-server? mdb run-id)
  (let* ((all    (tasks:server-get-servers-vying-for-run-id mdb run-id))
	 (first  (if (null? all)
		     (begin (debug:print 0 "ERROR: no servers listed, should be at least one by now.") 
			    (sqlite3:finalize! mdb)
			    (exit 1))