Megatest

Check-in [58bd90c5bc]
Login
Overview
Comment:fixed obscure bug when db is slightly malformed due to ^C. Tweak server gating, it is still not quite right...
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: 58bd90c5bc09ba63e6bda955e36fdc79027b6347
User & Date: mrwellan on 2020-08-26 22:56:54
Other Links: branch diff | manifest | tags
Context
2020-10-04
22:21
Trial 3. Backout server throttle code. Closed-Leaf check-in: 293027ea36 user: matt tags: v1.65-cleanup-try-3
2020-08-26
23:04
Tweaked server gate - still not right :( ==/FAIL/orion/== check-in: 7e26fb2f0c user: mrwellan tags: v1.65-cleanup
22:56
fixed obscure bug when db is slightly malformed due to ^C. Tweak server gating, it is still not quite right... check-in: 58bd90c5bc user: mrwellan tags: v1.65-cleanup
21:14
Added instrumentation for server start throttle. ==/3.18/0.6/PASS/1201/orion/== check-in: 56b3986bbb user: mrwellan tags: v1.65-cleanup
Changes

Modified db.scm from [f437d94aa0] to [2f649dc1fb].

457
458
459
460
461
462
463
464

465
466
467
468
469
470
471
457
458
459
460
461
462
463

464
465
466
467
468
469
470
471







-
+







	  exn
	(begin
	  (print "Attempt to safely close sqlite3 db failed. Trying again. exn=" exn)
	  (thread-sleep! 3)
	  (sqlite3:interrupt! db)
	  (db:safely-close-sqlite3-db db stmtcache try-num: (- try-num 1)))
	(if (sqlite3:database? db)
	    (let* ((stmts (hash-table-ref/default stmt-cache db #f)))
	    (let* ((stmts (and stmt-cache (hash-table-ref/default stmt-cache db #f))))
	      (if stmts (map sqlite3:finalize! (hash-table-values stmts)))
	      (sqlite3:finalize! db)
	      #t)
	    #f))))

;; close all opened run-id dbs
(define (db:close-all dbstruct)
824
825
826
827
828
829
830
831
832
833
834
835
836







837
838
839
840
841
842
843
824
825
826
827
828
829
830






831
832
833
834
835
836
837
838
839
840
841
842
843
844







-
-
-
-
-
-
+
+
+
+
+
+
+







              (thread-sleep! delay-handicap)
              (debug:print-info 0 *default-log-port* "synthetic sync delay of "delay-handicap" seconds completed")
              )
            
	    ;; first pass implementation, just insert all changed rows
	    (for-each 
	     (lambda (targdb)
	       (let* ((db     (db:dbdat-get-db targdb))
                      (drp-trigger (if (member "last_update" field-names)
                                      (db:drop-trigger db tablename) 
                                       #f))
                       (is-trigger-dropped (if (member "last_update" field-names)
                                              (db:is-trigger-dropped db tablename) #f)) 
	       (let* ((db                 (db:dbdat-get-db targdb))
                      (drp-trigger        (if (member "last_update" field-names)
					      (db:drop-trigger db tablename) 
					      #f))
                      (is-trigger-dropped (if (member "last_update" field-names)
                                              (db:is-trigger-dropped db tablename)
					      #f)) 
		      (stmth  (sqlite3:prepare db full-ins)))
		 ;; (db:delay-if-busy targdb) ;; NO WAITING
                 (if (member "last_update" field-names)
                     (debug:print-info 3 *default-log-port* "is-trigger-dropped: " is-trigger-dropped)) 
		 (for-each
		  (lambda (fromdat-lst)
		    (sqlite3:with-transaction
1222
1223
1224
1225
1226
1227
1228
1229

1230
1231
1232
1233
1234
1235

1236
1237
1238
1239
1240
1241

1242
1243
1244
1245
1246
1247

1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266

1267
1268
1269

1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281











1282
1283
1284
1285
1286




1287
1288
1289
1290
1291
1292
1293
1294
1295








1296
1297
1298
1299
1300
1301
1302
1223
1224
1225
1226
1227
1228
1229

1230
1231
1232
1233
1234
1235

1236
1237
1238
1239
1240
1241

1242
1243
1244
1245
1246
1247

1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266

1267
1268
1269

1270
1271
1272










1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285



1286
1287
1288
1289
1290
1291







1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306







-
+





-
+





-
+





-
+


















-
+


-
+


-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+


-
-
-
+
+
+
+


-
-
-
-
-
-
-
+
+
+
+
+
+
+
+







(define db:trigger-list 
     (list (list "update_runs_trigger"  "CREATE TRIGGER IF NOT EXISTS update_runs_trigger AFTER UPDATE ON runs
                             FOR EACH ROW
                               BEGIN 
                                 UPDATE runs SET last_update=(strftime('%s','now'))
                                   WHERE id=old.id;
                               END;" ) 
       (list "update_run_stats_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats
	   (list "update_run_stats_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats
                             FOR EACH ROW
                               BEGIN 
                                 UPDATE run_stats SET last_update=(strftime('%s','now'))
                                   WHERE id=old.id;
                               END;" )
       (list "update_tests_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests
	   (list "update_tests_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests
                             FOR EACH ROW
                               BEGIN 
                                 UPDATE tests SET last_update=(strftime('%s','now'))
                                   WHERE id=old.id;
                               END;" )
       (list "update_teststeps_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps
	   (list "update_teststeps_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps
                             FOR EACH ROW
                               BEGIN 
                                 UPDATE test_steps SET last_update=(strftime('%s','now'))
                                   WHERE id=old.id;
                               END;" )
       (list "update_test_data_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data
	   (list "update_test_data_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data
                             FOR EACH ROW
                               BEGIN 
                                 UPDATE test_data SET last_update=(strftime('%s','now'))
                                   WHERE id=old.id;
                               END;" )))

(define (db:create-all-triggers dbstruct)
(db:with-db
   dbstruct #f #f
   (lambda (db)
(db:create-triggers db))))

(define (db:create-triggers db)
    (for-each (lambda (key)
              (sqlite3:execute db (cadr key)))
          db:trigger-list))

(define (db:drop-all-triggers dbstruct)
(db:with-db
  (db:with-db
   dbstruct #f #f
   (lambda (db)
(db:drop-triggers db))))
     (db:drop-triggers db))))

(define (db:is-trigger-dropped db tbl-name)
     (let* ((trigger-name (if (equal? tbl-name "test_steps")
				"update_teststeps_trigger" 
                                (conc "update_" tbl-name "_trigger"))))
            (sqlite3:for-each-row
		   (lambda (name)
                     ;(print name)
		     (set! res (vector name)))
		   db 
		   "select name  from sqlite_master where type = 'trigger' ;" 
		   )))
  (let* ((trigger-name (if (equal? tbl-name "test_steps")
			   "update_teststeps_trigger" 
                           (conc "update_" tbl-name "_trigger")))
	 (res          #f))
    (sqlite3:for-each-row
     (lambda (name)
       (if (equal? name trigger-name)
	   (set! res #t)))
     db 
     "SELECT name FROM sqlite_master WHERE type = 'trigger' ;" 
     )))

(define (db:drop-triggers db)
  (for-each (lambda (key) 
              (sqlite3:execute db (conc "drop trigger " (car key))))
          db:trigger-list))
  (for-each
   (lambda (key) 
     (sqlite3:execute db (conc "drop trigger if exists " (car key))))
   db:trigger-list))

(define  (db:drop-trigger db tbl-name)
      (let* ((trigger-name (if (equal? tbl-name "test_steps")
				"update_teststeps_trigger" 
                                (conc "update_" tbl-name "_trigger"))))
       (for-each (lambda (key) 
             (if (equal? (car key) trigger-name)
             (sqlite3:execute db (conc "drop trigger " trigger-name))))
      db:trigger-list)))
  (let* ((trigger-name (if (equal? tbl-name "test_steps")
			   "update_teststeps_trigger" 
                           (conc "update_" tbl-name "_trigger"))))
    (for-each
     (lambda (key) 
       (if (equal? (car key) trigger-name)
           (sqlite3:execute db (conc "drop trigger if exists " trigger-name))))
     db:trigger-list)))

(define  (db:create-trigger db tbl-name)
      (let* ((trigger-name (if (equal? tbl-name "test_steps")
                              "update_teststeps_trigger" 
                              (conc "update_" tbl-name "_trigger"))))
       (for-each (lambda (key) 
             (if (equal? (car key) trigger-name)
2178
2179
2180
2181
2182
2183
2184
2185
2186


2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203




2204
2205
2206
2207
2208
2209
2210
2182
2183
2184
2185
2186
2187
2188


2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204



2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215







-
-
+
+














-
-
-
+
+
+
+







			  (string->number res)
			  #f)))
          (if newres
              newres
              res))
        res)))

(define (db:no-sync-close-db db stmtcache)
  (db:safely-close-sqlite3-db db stmtcache))
(define (db:no-sync-close-db db stmt-cache)
  (db:safely-close-sqlite3-db db stmt-cache))

;; transaction protected lock aquisition
;; either:
;;    fails    returns  (#f . lock-creation-time)
;;    succeeds (returns (#t . lock-creation-time)
;; use (db:no-sync-del! db keyname) to release the lock
;;
(define (db:no-sync-get-lock db-in keyname)
  (let ((db (db:no-sync-db db-in)))
    (sqlite3:with-transaction
     db
     (lambda ()
       (handle-exceptions
	   exn
	   (let ((lock-time (current-seconds)))
	     (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time)
	     `(#t . ,lock-time))
	 (let ((lock-time (current-seconds)))
	   (debug:print-info 2 *default-log-port* "db:no-sync-get-lock keyname=" keyname ", lock-time=" lock-time ", exn=" exn)
	   (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time)
	   `(#t . ,lock-time))
	 `(#f . ,(sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname)))))))



;; use a global for some primitive caching, it is just silly to
;; re-read the db over and over again for the keys since they never
;; change
2236
2237
2238
2239
2240
2241
2242
2243


2244
2245
2246
2247
2248
2249
2250
2241
2242
2243
2244
2245
2246
2247

2248
2249
2250
2251
2252
2253
2254
2255
2256







-
+
+







      (let loop ((hed (car header))
                 (tal (cdr header))
                 (n   0))
        (if (equal? hed field)
            (handle-exceptions
             exn
             (begin
               (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row=" row " header=" header " field=" field ", exn=" exn)
               (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row="
			    row " header=" header " field=" field ", exn=" exn)
               #f)
             (vector-ref row n))
	    (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1)))))))

;; Accessors for the header/data structure
;; get rows and header from 
(define (db:get-header vec)(vector-ref vec 0))
4477
4478
4479
4480
4481
4482
4483
4484

4485
4486
4487
4488
4489
4490
4491
4483
4484
4485
4486
4487
4488
4489

4490
4491
4492
4493
4494
4495
4496
4497







-
+







      (if dbdat
	  (let* ((dbpath (db:dbdat-get-path dbdat))
		 (db     (db:dbdat-get-db   dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline
		 (dbfj   (conc dbpath "-journal")))
	    (if (handle-exceptions
		 exn
		 (begin
		   (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj)
		   (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj ", exn=" exn)
		   (thread-sleep! 1)
		   (db:delay-if-busy count (- count 1))) 
		 (common:file-exists? dbfj))
		(case count
		  ((6)
		   (thread-sleep! 0.2)
		   (db:delay-if-busy count: 5))

Modified server.scm from [1606531507] to [0ee248d5ed].

322
323
324
325
326
327
328
329
330
331





332
333
334


335
336
337
338
339

340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361

362

363
364
365
366
367
368
369
322
323
324
325
326
327
328



329
330
331
332
333
334


335
336


337
338

339

340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359

360
361
362
363
364
365
366
367
368
369







-
-
-
+
+
+
+
+

-
-
+
+
-
-


-
+
-




















-
+

+








;; wait for server=start-last to be three seconds old
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last")))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (reftime  (+ 3 (random 5)))
	       (delta    (- (current-seconds) fmodtime)))
	  (if (> delta reftime) ;; good enough
	       (reftime  (+ 2 (random 3)))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))
	  (if all-go
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
	      (begin
		(debug:print-info 0 *default-log-port* "Ready to start server, last start: "
				  fmodtime ", delta: " delta ", reftime: " reftime)
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(system (conc "touch " start-flag))) ;; lazy but safe
	      (begin
		(thread-sleep! 5)
		(server:wait-for-server-start-last-flag areapath))))
	(system (conc "touch " start-flag)))))
	#;(system (conc "touch " start-flag)))))
	      

;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     (run-delay    (+ (case call-num
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(begin
		(let* ((start-flag (conc areapath "/logs/server-start-last")))
		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
		  (system (conc "touch " start-flag)) ;; lazy but safe
		  (server:run areapath)
		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))

;; this one seems to be the general entry point
;;