Megatest

Check-in [b8f2578046]
Login
Overview
Comment:Hacky solution for no-sync db
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80-tcp-inmem
Files: files | file ages | folders
SHA1: b8f25780465dffc1926711f64eedb98330227989
User & Date: matt on 2023-02-20 20:05:08
Other Links: branch diff | manifest | tags
Context
2023-02-21
11:39
server start smooth, but initial data load to inmem broken. check-in: 677b6ef8e8 user: matt tags: v1.80-tcp-inmem
2023-02-20
20:05
Hacky solution for no-sync db check-in: b8f2578046 user: matt tags: v1.80-tcp-inmem
14:17
wip. better but now there are run-away issues in ext-tests: check-in: 94e5d1fb43 user: matt tags: v1.80-tcp-inmem
Changes

Modified Makefile from [c01f62811d] to [7b082d9b05].

34
35
36
37
38
39
40
41

42
43
44

45
46
47
48
49
50
51
34
35
36
37
38
39
40

41
42
43
44
45
46
47
48
49
50
51
52







-
+



+







            tcp-transportmod.scm

all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut tcmt

transport-mode.scm : transport-mode.scm.template
	cp transport-mode.scm.template transport-mode.scm

dashboard-mode.scm : transport-mode.scm.template
dashboard-transport-mode.scm : transport-mode.scm.template
	cp transport-mode.scm.template dashboard-transport-mode.scm

megatest.scm : transport-mode.scm
dashboard.scm : dashboard-transport-mode.scm

# dbmod.import.o is just a hack here
mofiles/dbfile.o     : mofiles/debugprint.o mofiles/commonmod.o dbmod.import.o
db.o : dbmod.import.o
mofiles/debugprint.o : mofiles/mtargs.o

# ftail.scm rmtmod.scm commonmod.scm removed

Modified api.scm from [351c29f44d] to [2d5deac3b3].

153
154
155
156
157
158
159
160
161
162
163
164
165






166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232




























































233
234



235
236
237
238
239
240
241

242
243


244
245
246
247
248
249
250

251
252
253
254
255
256
257
258
259
260
261
262
263

264
265
266
267
268
269
270
153
154
155
156
157
158
159






160
161
162
163
164
165
166
167
168
169
170
171
172




























































173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245


246
247
248
249
250
251
252
253

254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275







-
-
-
-
-
-
+
+
+
+
+
+







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


+
+
+







+
-
-
+
+






-
+













+







;; These are called by the server on recipt of /api calls
;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
;;
;;    - returns #( flag result )
;;
(define (api:execute-requests dbstruct dat)
  (db:open-no-sync-db) ;; sets *no-sync-db*
;;   (handle-exceptions
;;    exn
;;    (let ((call-chain (get-call-chain)))
;;      (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
;;      (print-call-chain (current-error-port))
;;      (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
  ;;   (handle-exceptions
  ;;    exn
  ;;    (let ((call-chain (get-call-chain)))
  ;;      (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
  ;;      (print-call-chain (current-error-port))
  ;;      (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
  ;;      (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
  (if (> *api-process-request-count* 200)
      (begin
	(if (common:low-noise-print 30 "too many threads")
	    (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay."))
	(thread-sleep! 0.5) ;; take a nap
	))
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    #;((> *api-process-request-count* 200) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
     (set! *server-overloaded* #t)
     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in
				   (string->symbol cmd-in)))
            (params            (vector-ref dat 1))
	    (run-id            (if (null? params)
				   0
				   (car params)))
	    (write-mutex       (if (hash-table-exists? *db-write-mutexes* run-id)
				   (hash-table-ref *db-write-mutexes* run-id)
				   (let* ((newmutex (make-mutex)))
				     (hash-table-set! *db-write-mutexes* run-id newmutex)
				     newmutex)))
            (start-t           (current-milliseconds))
            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
            (readonly-command  (member cmd api:read-only-queries))
            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))))
       (if (not readonly-command)
	   (mutex-lock! write-mutex))
       (let* ((tmppath    (dbr:dbstruct-tmppath  dbstruct))
	      (clean-run-id (cond
			     ((number? run-id)   run-id)
			     ((equal? run-id #f) "main")
			     (else               "other")))
	      (crumbfile  (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params)))
	      (res    
               (if writecmd-in-readonly-mode
                   (conc "attempt to run write command "cmd" on a read-only database")
		   (api:dispatch-request dbstruct cmd run-id params))))
	 (delete-file* crumbfile)
	 (if (not readonly-command)
	     (mutex-unlock! write-mutex))
	 
	 ;; save all stats
	 (let ((delta-t (- (current-milliseconds)
			   start-t))
	       (modified-cmd (if (eq? cmd 'general-call)
				 (string->symbol (conc "general-call-" (car params)))
				 cmd)))
	   (hash-table-set! *db-api-call-time* modified-cmd
			    (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '()))))
	 (if writecmd-in-readonly-mode
             (begin
               #;(common:telemetry-log (conc "api-out:"(->string cmd))
               payload: `((params . ,params)
               (ok-res . #t)))
	       (vector #f res))
             (begin
               #;(common:telemetry-log (conc "api-out:"(->string cmd))
               payload: `((params . ,params)
               (ok-res . #f)))
               (vector #t res))))))))
  (cond
   ((not (vector? dat))                    ;; it is an error to not receive a vector
    (vector #f (vector #f "remote must be called with a vector")))
   #;((> *api-process-request-count* 200) ;; 20)
   (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
   (set! *server-overloaded* #t)
   (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
   (else  
    (let* ((cmd-in            (vector-ref dat 0))
           (cmd               (if (symbol? cmd-in)
				  cmd-in
				  (string->symbol cmd-in)))
           (params            (vector-ref dat 1))
	   (run-id            (if (null? params)
				  0
				  (car params)))
	   (write-mutex       (if (hash-table-exists? *db-write-mutexes* run-id)
				  (hash-table-ref *db-write-mutexes* run-id)
				  (let* ((newmutex (make-mutex)))
				    (hash-table-set! *db-write-mutexes* run-id newmutex)
				    newmutex)))
           (start-t           (current-milliseconds))
           (readonly-mode     (dbr:dbstruct-read-only dbstruct))
           (readonly-command  (member cmd api:read-only-queries))
           (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))))
      (if (not readonly-command)
	  (mutex-lock! write-mutex))
      (let* ((tmppath    (dbr:dbstruct-tmppath  dbstruct))
	     (clean-run-id (cond
			    ((number? run-id)   run-id)
			    ((equal? run-id #f) "main")
			    (else               "other")))
	     (crumbfile  (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params)))
	     (res    
              (if writecmd-in-readonly-mode
                  (conc "attempt to run write command "cmd" on a read-only database")
		  (api:dispatch-request dbstruct cmd run-id params))))
	(delete-file* crumbfile)
	(if (not readonly-command)
	    (mutex-unlock! write-mutex))
	
	;; save all stats
	(let ((delta-t (- (current-milliseconds)
			  start-t))
	      (modified-cmd (if (eq? cmd 'general-call)
				(string->symbol (conc "general-call-" (car params)))
				cmd)))
	  (hash-table-set! *db-api-call-time* modified-cmd
			   (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '()))))
	(if writecmd-in-readonly-mode
            (begin
              #;(common:telemetry-log (conc "api-out:"(->string cmd))
              payload: `((params . ,params)
              (ok-res . #t)))
	      (vector #f res))
            (begin
              #;(common:telemetry-log (conc "api-out:"(->string cmd))
              payload: `((params . ,params)
              (ok-res . #f)))
              (vector #t res))))))))

;; indat is (cmd run-id params meta)
;;
;; WARNING: Do not print anything in this function as it reads/writes to current in/out port
;;
(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params)
  (lambda ()
    (let* ((indat (deserialize)))
      (set! *api-process-request-count* (+ *api-process-request-count* 1))
      (match indat
	((cmd run-id params meta)
	 (let* ((status  (cond
			  ;; turn off busy throttling while trying to get things stable
			  ((> *api-process-request-count* 50) 'busy)
			  ((> *api-process-request-count* 25) 'loaded)
			  ;; ((> *api-process-request-count* 50) 'busy)
			  ;; ((> *api-process-request-count* 25) 'loaded)
			  (else 'ok)))
		(errmsg  (case status
			   ((busy)   (conc "Server overloaded, "*api-process-request-count*" threads in flight"))
			   ((loaded) (conc "Server loaded, "*api-process-request-count*" threads in flight"))
			   (else     #f)))
		(result  (case status
			   ((busy) #f)
			   ((busy loaded) #f)
			   (else
			    (case cmd
			      ((ping) (tt:mk-signature *toppath*))
			      (else
			       (api:dispatch-request dbstruct cmd run-id params))))))
		(payload (list status errmsg result '())))
	   (set! *api-process-request-count* (- *api-process-request-count* 1))
	   (serialize payload)))
	(else
	 (assert #f "FATAL: failed to deserialize indat "indat))))))
       

(define (api:dispatch-request dbstruct cmd run-id params)
  (db:open-no-sync-db)
  (case cmd
    ;;===============================================
    ;; READ/WRITE QUERIES
    ;;===============================================

    ((get-keys-write)                        (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl
    

Modified db.scm from [c29acb0315] to [8b2649f90e].

1470
1471
1472
1473
1474
1475
1476







1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488













1489
1490
1491

1492
1493
1494
1495

1496
1497
1498
1499
1500
1501
1502
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483












1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498

1499
1500
1501
1502

1503
1504
1505
1506
1507
1508
1509
1510







+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+


-
+



-
+







	      (lambda (dbdat db)
		(sqlite3:execute  (db:get-cache-stmth dbdat db "DELETE FROM metadat WHERE var=?;") var))))

;;======================================================================
;; no-sync.db - small bits of data to be shared between servers
;;======================================================================

(define (db:get-dbsync-path)
  (case (rmt:transport-mode)
    ((http)(common:get-db-tmp-area))
    ((tcp) (conc *toppath*"/.megatest"))
    ((nfs) (conc *toppath*"/.megatest"))
    (else "/tmp/dunno-this-gonna-exist")))

(define (db:no-sync-db db-in)
  (if db-in
      db-in
      (if *no-sync-db*
	  *no-sync-db*
	  (begin
	    (mutex-lock! *db-access-mutex*)
	    (let ((dbpath (common:get-db-tmp-area))
		  (db     (dbfile:open-no-sync-db dbpath)))
	      (set! *no-sync-db* db)
	      (mutex-unlock! *db-access-mutex*)
	      db)))))
 ;; (define (db:no-sync-db db-in)
 ;;   (if db-in
 ;;       db-in
 ;;       (if *no-sync-db*
 ;; 	  *no-sync-db*
 ;; 	  (begin
 ;; 	    (mutex-lock! *db-access-mutex*)
 ;; 	    (let ((dbpath (db:get-dbsync-path))
 ;; 		  (db     (dbfile:open-no-sync-db dbpath)))
 ;; 	      (assert (sqlite3:database? db) "FATAL: db:no-sync-db failed to open a database")
 ;; 	      (set! *no-sync-db* db)
 ;; 	      (mutex-unlock! *db-access-mutex*)
 ;; 	      db)))))

(define (with-no-sync-db proc)
  (let* ((db  (db:no-sync-db *no-sync-db*)))
  (let* ((db  (db:open-no-sync-db)))
    (proc db)))

(define (db:open-no-sync-db)
  (dbfile:open-no-sync-db (db:dbfile-path)))
  (dbfile:open-no-sync-db (db:get-dbsync-path)))

(define (db:no-sync-close-db db stmt-cache)
  (db:safely-close-sqlite3-db db stmt-cache))


;; use a global for some primitive caching, it is just silly to
;; re-read the db over and over again for the keys since they never

Modified dbfile.scm from [2bab3e7208] to [1ed5efa652].

392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412


413
414
415






416
417
418
419
420
421
422
392
393
394
395
396
397
398

399
400
401
402
403
404
405
406
407
408
409
410
411
412
413



414
415
416
417
418
419
420
421
422
423
424
425
426







-













+
+
-
-
-
+
+
+
+
+
+







  (let* ((backupfname (conc fname"-"(current-process-id)".bak"))
	 (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;")
		    "cp "backupfname" "fname)))
    (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n"
		      "  "cmd)
    (system cmd)))


(define (dbfile:open-no-sync-db dbpath)
  (if *no-sync-db*
      *no-sync-db*
      (begin
	(if (not (file-exists? dbpath))
	    (create-directory dbpath #t))
	(let* ((dbname    (conc dbpath "/no-sync.db"))
	       (db-exists (file-exists? dbname))
	       (init-proc (lambda (db)
			    (if (not db-exists)
				(begin
				  (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));"))
				)))
	       (on-tmp      (equal? (car (string-split dbpath "/")) "tmp"))
	       (db        (if on-tmp
	       (db        (dbfile:cautious-open-database dbname init-proc 0 "WAL"))) ;; (sqlite3:open-database dbname)))
	  ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
	  ;; (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) ;; done in cautious-open-database
			      (dbfile:cautious-open-database dbname init-proc 0 "WAL")
			      (sqlite3:open-database dbname))))
	  (if on-tmp	      ;; done in cautious-open-database
	      (begin
		(sqlite3:execute db "PRAGMA synchronous = 0;")
		(sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))))
	  (set! *no-sync-db* db)
	  db))))

(define (db:no-sync-set db var val)
  (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val))

(define (db:no-sync-del! db var)

Modified rmt.scm from [2482b80340] to [73327553e5].

102
103
104
105
106
107
108


109
110





111
112
113
114
115
116
117
102
103
104
105
106
107
108
109
110


111
112
113
114
115
116
117
118
119
120
121
122







+
+
-
-
+
+
+
+
+







   ((> attemptnum 20) (thread-sleep! 1)))

  ;; I'm turning this off, it may make sense to move it
  ;; into http-transport-handler
  (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
      (begin
	(debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.")
	(case (rmt:transport-mode)
	  ((http)
	(server:run *toppath*)
	(thread-sleep! 3)))
	   (server:run *toppath*)
	   (thread-sleep! 3))
	  (else
	   (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server
	   ))))
  
  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas

Modified server.scm from [5eef4f0a57] to [8a167481c8].

467
468
469
470
471
472
473

474
475
476
477
478
479
480
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481







+







  ;; find oldest alive
  ;;   1. sort by age ascending and ping until good
  ;; find alive rand from youngest
  ;;   1. sort by age descending
  ;;   2. take five
  ;;   3. check alive, discard if not and repeat
  ;; first we clean up old server files
  (assert (eq? (rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode))
  (server:clean-up-old areapath)
  (let* ((since-last (- (current-seconds) server-last-start))
        (server-start-delay 10))     
    (if ( < (- (current-seconds) server-last-start) 10 )
      (begin
        (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
        (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds")
566
567
568
569
570
571
572

573


574
575
576
577
578
579
580
567
568
569
570
571
572
573
574

575
576
577
578
579
580
581
582
583







+
-
+
+







		 (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile)
		 (delete-file sfile))))))
     sfiles)))

;; would like to eventually get rid of this
;;
(define (common:on-homehost?)
  (if (eq? (rmt:transport-mode) 'http)
  (server:choose-server *toppath* 'home?))
      (server:choose-server *toppath* 'home?)
      #t)) ;; there is no homehost for tcp and nfs is always on home so #t should work

;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old

Modified tcp-transportmod.scm from [b2bb58190e] to [a92ff9f544].

96
97
98
99
100
101
102

103
104
105
106
107
108
109
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110







+







  (thread       #f)
  (host-port    #f)
  (cmd-thread   #f)
  (ro-mode      #f)
  (ro-mode-checked #f)
  (last-access  (current-seconds))
  (servinf-file #f)
  (last-serv-start 0)
  )

(define (tt:make-remote areapath)
  (make-tt areapath: areapath))

;; do all the busy work of finding and setting up conn for
;; connecting to a server
132
133
134
135
136
137
138



139
140
141



142
143
144
145


146
147
148
149
150
151
152
133
134
135
136
137
138
139
140
141
142



143
144
145
146
147
148

149
150
151
152
153
154
155
156
157







+
+
+
-
-
-
+
+
+



-
+
+







			   server-id: server-id
			   server-start: start-time
			   pid: pid)))
	       (hash-table-set! (tt-conns ttdat) dbfname conn)
	       ;; verify we can talk to this server
	       (if (tt:ping host port server-id)
		   conn
		   (let* ((curr-secs (current-seconds)))
		     ;; rm the (last server) would go here
		     (if (> (- curr-secs (tt-last-serv-start ttdat)) 30)
		   (begin
		     ;; rm the (last server) would go here
		     (server-start-proc)
			 (begin
			   (tt-last-serv-start-set! ttdat curr-secs)
			   (server-start-proc))) ;; don't try and start server unless 30 sec has gone by since last attempt
		     (thread-sleep! 1)
		     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))
	    (else
	     (debug:print-info 0 *default-log-port* "Number of records did not match expected. Bad server info?")
	     (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
	     (tt-last-serv-start-set! ttdat (current-seconds))
	     (server-start-proc)
	     (thread-sleep! 1)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
    
(define (tt:ping host port server-id)
  (let*  ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id
    ;;
185
186
187
188
189
190
191
192

193
194
195
196
197
198
199
190
191
192
193
194
195
196

197
198
199
200
201
202
203
204







-
+







		(debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a second.")
		(thread-sleep! 1)
		(tt:handler  ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else
	     (if (not res)
		 (begin ;; server likely died
		 (begin ;; let* ((srvfile (tt-conn-servinf-file ))) ;; server likely died
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (debug:print 0 *default-log-port* "INFO: connection to server broken, reconnecting.")
		   (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))
		 (assert #f "FATAL: tt:handler received bad data "res)))))
	(begin
	  (thread-sleep! 1) ;; give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))