Megatest

Check-in [b13bf213d4]
Login
Overview
Comment:Improved auto server reconnect in client.scm but it is still flakey under extreme load
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: b13bf213d405d1446f3ff22faac32c69e02499c0
User & Date: matt on 2014-02-18 21:16:23
Other Links: branch diff | manifest | tags
Context
2014-02-19
23:30
Partial fix for issues with $MT_MEGATEST -test-files lookithome.log -target $MT_TARGET :runname $MT_RUNNAME -testpatt ez_fail check-in: 845c375e3f user: matt tags: v1.60
2014-02-18
21:16
Improved auto server reconnect in client.scm but it is still flakey under extreme load check-in: b13bf213d4 user: matt tags: v1.60
19:41
Clean up, removed old references to *runremote* check-in: 7590c8479a user: matt tags: v1.60
Changes

Modified client.scm from [67235b3676] to [4e17077a64].

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70


71
72
73
74
75
76
77
78



79
80
81
82
83
84
85
86


87
88
89
90
91
92
93
94
95
96



97
98
99
100
101
102
103
104
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 10))
  (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries)
  (if (<= remaining-tries 0)
      (begin
	(debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	(exit 1))
      (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f))))
	(if server-dat
	    (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result!
							    (tasks:hostinfo-get-interface server-dat)
							    (tasks:hostinfo-get-port      server-dat))))
	      (if start-res ;; sucessful login?
		  start-res


		  (begin    ;; login failed
		    (hash-table-delete! *runremote* run-id)
		    (open-run-close tasks:server-force-clean-run-record
				    tasks:open-db
				    run-id 
				    (tasks:hostinfo-get-interface server-dat)
				    (tasks:hostinfo-get-port      server-dat))
		    (thread-sleep! 5)



		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))
	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
	      (if server-dat
		  (let ((start-res (http-transport:client-connect run-id
								  (tasks:hostinfo-get-interface server-dat)
								  (tasks:hostinfo-get-port      server-dat))))
		    (if start-res
			start-res


			(begin    ;; login failed
			  (hash-table-delete! *runremote* run-id)
			  (open-run-close tasks:server-force-clean-run-record
					  tasks:open-db
					  run-id 
					  (tasks:hostinfo-get-interface server-dat)
					  (tasks:hostinfo-get-port      server-dat))
			  (thread-sleep! 2)
			  (server:try-running run-id)
			  (thread-sleep! 5) ;; give server a little time to start up



			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		  (begin    ;; no server registered
		    (thread-sleep! 2)
		    (server:try-running run-id)
		    (thread-sleep! 5) ;; give server a little time to start up
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 







|








|
|


>
>
|
|
|
|
|
|
|
|
>
>
>
|







>
>
|
|
|
|
|
|
|
|
|
|
>
>
>
|







51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 20) (failed-connects 0))
  (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries)
  (if (<= remaining-tries 0)
      (begin
	(debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	(exit 1))
      (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f))))
	(if server-dat
	    (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result!
							    (car  server-dat)
							    (cadr server-dat))))
	      (if start-res ;; sucessful login?
		  start-res
		  (if (and (< remaining-tries 10)
			   (odd? remaining-tries))
		      (begin    ;; login failed
			(hash-table-delete! *runremote* run-id)
			(open-run-close tasks:server-force-clean-run-record
					tasks:open-db
					run-id 
					(car  server-dat)
					(cadr server-dat))
			(thread-sleep! 5)
			(client:setup run-id remaining-tries: (- remaining-tries 1)))
		      (begin
			(thread-sleep! 5)
			(client:setup run-id remaining-tries: (- remaining-tries 1))))))
	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
	      (if server-dat
		  (let ((start-res (http-transport:client-connect run-id
								  (tasks:hostinfo-get-interface server-dat)
								  (tasks:hostinfo-get-port      server-dat))))
		    (if start-res
			start-res
			(if (and (< remaining-tries 10)
				 (odd? remaining-tries))
			    (begin    ;; login failed
			      (hash-table-delete! *runremote* run-id)
			      (open-run-close tasks:server-force-clean-run-record
					      tasks:open-db
					      run-id 
					      (tasks:hostinfo-get-interface server-dat)
					      (tasks:hostinfo-get-port      server-dat))
			      (thread-sleep! 2)
			      (server:try-running run-id)
			      (thread-sleep! 5) ;; give server a little time to start up
			      (client:setup run-id remaining-tries: (- remaining-tries 1)))
			    (begin
			      (thread-sleep! 5)
			      (client:setup run-id remaining-tries: (- remaining-tries 1))))))
		  (begin    ;; no server registered
		    (thread-sleep! 2)
		    (server:try-running run-id)
		    (thread-sleep! 5) ;; give server a little time to start up
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 

Modified server.scm from [8eb4730569] to [3ef5cd2281].

105
106
107
108
109
110
111
112

113
114
115
116
117
118
119
(define (server:reply return-addr query-sig success/fail result)
  (db:obj->string (vector success/fail query-sig result)))

;; > file 2>&1 
(define (server:try-running run-id)
  (let* ((rand-name (random 100))
	 (cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")
		     " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &")))

    (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (system cmdln)
    (pop-directory)))

(define (server:check-if-running run-id)
  (let loop ((server (open-run-close tasks:get-server tasks:open-db run-id))







|
>







105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
(define (server:reply return-addr query-sig success/fail result)
  (db:obj->string (vector success/fail query-sig result)))

;; > file 2>&1 
(define (server:try-running run-id)
  (let* ((rand-name (random 100))
	 (cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")
		    ;; " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &")))
		      " -server - -run-id " run-id " > " *toppath* "/db/" run-id ".log 2>&1 &")))
    (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (system cmdln)
    (pop-directory)))

(define (server:check-if-running run-id)
  (let loop ((server (open-run-close tasks:get-server tasks:open-db run-id))