Megatest

Check-in [faeb319c76]
Login
Overview
Comment:Protected accesses to megatest.db and monitor.db with journal file busy control
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: faeb319c76f2ae738bb0981a450e27c8fe9ba503
User & Date: matt on 2014-11-10 22:31:21
Other Links: branch diff | manifest | tags
Context
2014-11-11
09:21
reduce noise on output check-in: 678ea430aa user: mrwellan tags: v1.60
2014-11-10
22:31
Protected accesses to megatest.db and monitor.db with journal file busy control check-in: faeb319c76 user: matt tags: v1.60
2014-11-07
14:09
Protect the transaction in sync-db with exception handler check-in: d46174b7d1 user: mrwellan tags: v1.60
Changes

Modified db.scm from [6ed0b093c7] to [ce7df076f7].

252
253
254
255
256
257
258


259
260
261
262




263
264
265
266
267
268
269
270
271
272
273


274
275
276
277




278
279
280
281
282
283
284
252
253
254
255
256
257
258
259
260




261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277




278
279
280
281
282
283
284
285
286
287
288







+
+
-
-
-
-
+
+
+
+











+
+
-
-
-
-
+
+
+
+







    (if (eq? run-id 0)
	;; runid equal to 0 is main.db
	(if maindb
	    (if (or (not (number? mtime))
		    (not (number? stime))
		    (> mtime stime)
		    force-sync)
		(begin
		  (db:delay-if-busy)
		(let ((num-synced (db:sync-tables (db:sync-main-list maindb) maindb olddb)))
		  (dbr:dbstruct-set-stime! dbstruct (current-milliseconds))
		  num-synced)
		0)
		  (let ((num-synced (db:sync-tables (db:sync-main-list maindb) maindb olddb)))
		    (dbr:dbstruct-set-stime! dbstruct (current-milliseconds))
		    num-synced)
		  0))
	    (begin
	      ;; this can occur when using local access (i.e. not in a server)
	      ;; need a flag to turn it off.
	      ;;
	      (debug:print 3 "WARNING: call to sync main.db to megatest.db but main not initialized")
	      0))
	;; any other runid is a run
	(if (or (not (number? mtime))
		(not (number? stime))
		(> mtime stime)
		force-sync)
	    (begin
	      (db:delay-if-busy)
	    (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb)))
	      (dbr:dbstruct-set-stime! dbstruct (current-milliseconds))
	      num-synced)
	    0))))
	      (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb)))
		(dbr:dbstruct-set-stime! dbstruct (current-milliseconds))
		num-synced)
	      0)))))

;; close all opened run-id dbs
(define (db:close-all dbstruct)
  ;; finalize main.db
  (db:sync-touched dbstruct 0 force-sync: #t)
  ;;(common:db-block-further-queries)
  ;; (mutex-lock! *db-sync-mutex*) ;; with this perhaps it isn't necessary to use the block-further-queries mechanism?
509
510
511
512
513
514
515


516

517
518
519
520
521
522
523
524
525
526
527
528
529
530


531

532
533
534
535


536

537
538
539
540
541
542
543
544

545
546
547
548
549
550
551
552
553
554
555

556
557
558
559
560
561
562
513
514
515
516
517
518
519
520
521

522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538

539
540
541
542
543
544
545

546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574







+
+
-
+














+
+
-
+




+
+
-
+








+











+







;;
(define (db:multi-db-sync run-ids . options)
  (let* ((toppath  (launch:setup-for-run))
	 (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f))
	 (mtdb     (if toppath (db:open-megatest-db)))
	 (run-ids  (if run-ids 
		       run-ids
		       (if toppath (begin
				     (db:delay-if-busy)
		       (if toppath (db:get-all-run-ids mtdb))))
				     (db:get-all-run-ids mtdb)))))
	 (mdb     (tasks:open-db))
	 (servers (tasks:get-all-servers mdb)))
    
    ;; kill servers
    (if (member 'killservers options)
	(for-each
	 (lambda (server)
	   (tasks:server-delete-record mdb (vector-ref server 0) "dbmigration")
	   (tasks:kill-server (vector-ref server 2)(vector-ref server 1)))
	 servers))

    ;; clear out junk records
    ;;
    (if (member 'dejunk options)
	(begin
	  (db:delay-if-busy)
	(db:clean-up mtdb))
	  (db:clean-up mtdb)))

    ;; adjust test-ids to fit into proper range
    ;;
    (if (member 'adj-testids options)
	(begin
	  (db:delay-if-busy)
	(db:prep-megatest.db-for-migration mtdb))
	  (db:prep-megatest.db-for-migration mtdb)))

    ;; sync runs, test_meta etc.
    ;;
    (if (member 'old2new options)
	(begin
	  (db:sync-tables (db:sync-main-list mtdb) mtdb (db:get-db dbstruct #f))
	  (for-each 
	   (lambda (run-id)
	     (db:delay-if-busy)
	     (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))
		   (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)))
	       (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db")
	       (db:replace-test-records dbstruct run-id testrecs)
	       (sqlite3:finalize! (dbr:dbstruct-get-rundb dbstruct))))
	   run-ids)))

    ;; now ensure all newdb data are synced to megatest.db
    (if (member 'new2old options)
	(for-each
	 (lambda (run-id)
	   (db:delay-if-busy)
	   (let ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)))
	     (if (eq? run-id 0)
		 (db:sync-tables (db:sync-main-list dbstruct)(db:get-db fromdb run-id) mtdb)
		 (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb))))
	 run-ids))
    
    (db:close-all dbstruct)

Modified http-transport.scm from [6319f7b685] to [c83e2578f6].

153
154
155
156
157
158
159

160
161
162
163

164
165
166
167
168
169
170
171
172
173
174
175
176
177

178
179
180
181
182
183
184
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187







+




+














+








	     ;; get_next_port goes here
	     (http-transport:try-start-server run-id
					      ipaddrstr
					      (portlogger:open-run-close portlogger:find-port)
					      server-id))
	   (begin
	     (tasks:wait-on-busy-monitor.db)
	     (tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server")
	     (print "ERROR: Tried and tried but could not start the server"))))
     ;; any error in following steps will result in a retry
     (set! *server-info* (list ipaddrstr portnum))
     (tasks:wait-on-busy-monitor.db)
     (tasks:server-set-interface-port 
		     (tasks:get-db)
		     server-id 
		     ipaddrstr portnum)
     (debug:print 0 "INFO: Trying to start server on " ipaddrstr ":" portnum)
     ;; This starts the spiffy server
     ;; NEED WAY TO SET IP TO #f TO BIND ALL
     ;; (start-server bind-address: ipaddrstr port: portnum)
     (if config-hostname ;; this is a hint to bind directly
	 (start-server port: portnum bind-address: (if (equal? config-hostname "-")
						       ipaddrstr
						       config-hostname))
	 (start-server port: portnum))
     ;;  (portlogger:open-run-close portlogger:set-port portnum "released")
     (tasks:wait-on-busy-monitor.db)
     (tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server")
     (debug:print 1 "INFO: server has been stopped"))))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================

254
255
256
257
258
259
260

261
262
263
264
265
266
267
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271







+







	    exn
	    (debug:print 0 "WARNING: closing connections failed. Server at " fullurl " almost certainly dead")
	    (close-all-connections!))
	   (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries)
	   (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1)))
	 (begin
	   (mutex-unlock! *http-mutex*)
	   (tasks:wait-on-busy-monitor.db)
	   (tasks:kill-server-run-id run-id)
	   #f))
     (begin
       (debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n")
       ;; set up the http-client here
       (max-retry-attempts 1)
       ;; consider all requests indempotent
367
368
369
370
371
372
373

374
375
376
377
378
379
380
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385







+







			      sdat
                              (begin
				(debug:print-info 0 "Still waiting, last-sdat=" last-sdat)
                                (sleep 4)
				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
				    (let ((tdb  (tasks:open-db)))
				      (debug:print 0 "ERROR: transport appears to have died, exiting server " server-id " for run " run-id)
				      (tasks:wait-on-busy-monitor.db)
				      (tasks:server-delete-record tdb server-id "failed to start, never received server alive signature")
				      (sqlite3:finalize! tdb)
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
         (iface       (car server-info))
402
403
404
405
406
407
408

409
410
411
412
413
414
415
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421







+







	(debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time)

      ;;
      ;; set_running after our first pass through and start the db
      ;;
      (if (eq? server-state 'available)
	  (begin
	    (tasks:wait-on-busy-monitor.db)
	    (tasks:server-set-state! tdb server-id "dbprep")
	    (thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access
	    (set! *inmemdb*  (db:setup run-id))
	    (tasks:server-set-state! tdb server-id "running")))

      (if (and (<= rem-time 4)
	       (> rem-time 0))
461
462
463
464
465
466
467

468
469
470
471
472
473
474
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481







+







	    ;;     (tasks:server-set-state! tdb server-id "running"))
	    ;;
	    (loop 0 server-state))
	  (begin
	    (debug:print-info 0 "Starting to shutdown the server.")
	    ;; need to delete only *my* server entry (future use)
	    (set! *time-to-exit* #t)
	    (tasks:wait-on-busy-monitor.db) ;; wait here in addition to just before the shutting-down
	    (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
	    ;;
	    ;; start_shutdown
	    ;;
	    (tasks:server-set-state! tdb server-id "shutting-down")
	    (portlogger:open-run-close portlogger:set-port port "released")
	    (thread-sleep! 5)
484
485
486
487
488
489
490

491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510

511
512
513
514
515
516

517
518
519
520
521

522
523
524
525
526
527
528
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539







+




















+






+





+







	    (debug:print-info 0 "Average non-cached time   "
			      (if (eq? *number-non-write-queries* 0)
				  "n/a (no queries)"
				  (/ *total-non-write-delay* 
				     *number-non-write-queries*))
			      " ms")
	    (debug:print-info 0 "Server shutdown complete. Exiting")
	    (tasks:wait-on-busy-monitor.db)
	    (tasks:server-delete-record tdb server-id " http-transport:keep-running")
	    (exit))))))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (begin
	(daemon:ize)
	(if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	    (begin
	      (current-error-port *alt-log-file*)
	      (current-output-port *alt-log-file*)))))
  (if (server:check-if-running run-id)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (tasks:wait-on-busy-monitor.db)
  (let loop ((server-id (tasks:server-lock-slot (tasks:get-db) run-id))
	     (remtries  4))
    (if (not server-id)
	(if (> remtries 0)
	    (begin
	      (thread-sleep! 2)
	      (tasks:wait-on-busy-monitor.db)
	      (loop (tasks:server-lock-slot (tasks:get-db) run-id)
		    (- remtries 1)))
	    (begin
	      ;; since we didn't get the server lock we are going to clean up and bail out
	      (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	      (tasks:wait-on-busy-monitor.db)
	      (tasks:server-delete-records-for-this-pid (tasks:get-db) " http-transport:launch")
	      ))
	(let* ((th2 (make-thread (lambda ()
				   (debug:print-info 0 "Server run thread started")
				   (http-transport:run 
				    (if (args:get-arg "-server")
					(args:get-arg "-server")

Modified launch.scm from [56d8cf0de9] to [490135a0e7].

498
499
500
501
502
503
504
505

506
507
508
509

510
511
512
513

514
515
516
517
518
519
520
498
499
500
501
502
503
504

505
506
507
508

509
510
511
512

513
514
515
516
517
518
519
520







-
+



-
+



-
+







	 (bestsize 0))
    (if disks 
	(for-each 
	 (lambda (disk-num)
	   (let* ((dirpath    (cadr (assoc disk-num disks)))
		  (freespc    (cond
			       ((not (directory? dirpath))
				(if (common:low-noise-print 20 "disks" disk-num)
				(if (common:low-noise-print 50 "disks not a dir " disk-num)
				    (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not a directory - ignoring it."))
				-1)
			       ((not (file-write-access? dirpath))
				(if (common:low-noise-print 20 "disks" disk-num)
				(if (common:low-noise-print 50 "disks not writeable " disk-num)
				    (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not writeable - ignoring it."))
				-1)
			       ((not (eq? (string-ref dirpath 0) #\/))
				(if (common:low-noise-print 20 "disks" disk-num)
				(if (common:low-noise-print 50 "disks not a proper path " disk-num)
				    (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not a fully qualified path - ignoring it."))
				-1)
			       (else
				(get-df dirpath)))))
	     (if (> freespc bestsize)
		 (begin
		   (set! best     dirpath)

Modified rmt.scm from [f39ba1bbf7] to [ec918e30be].

95
96
97
98
99
100
101

102
103
104
105




106
107
108
109
110
111
112
95
96
97
98
99
100
101
102




103
104
105
106
107
108
109
110
111
112
113







+
-
-
-
-
+
+
+
+







		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
		(rmt:send-receive cmd run-id params))))
	(let ((max-avg-qry (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
	  (debug:print-info 4 "no server and read-only query, bypassing normal channel")
	  ;; (if (rmt:write-frequency-over-limit? cmd run-id)(server:kind-run run-id))
	  (let ((curr-max (rmt:get-max-query-average run-id)))
	    (if (> (cdr curr-max) max-avg-qry)
		(if (common:low-noise-print 10 "start server due to max average query too long")
		(begin
		  (debug:print-info 0 "Max average query, " (inexact->exact (round (cdr curr-max))) "ms (" (car curr-max) ") exceeds " max-avg-qry ", try starting server ...")
		  (server:kind-run run-id))
		(debug:print-info 3 "Max average query, " (inexact->exact (round (cdr curr-max))) "ms (" (car curr-max) ") below " max-avg-qry ", not starting server...")))
		      (begin
			(debug:print-info 0 "Max average query, " (inexact->exact (round (cdr curr-max))) "ms (" (car curr-max) ") exceeds " max-avg-qry ", try starting server ...")
			(server:kind-run run-id)
			(debug:print-info 3 "Max average query, " (inexact->exact (round (cdr curr-max))) "ms (" (car curr-max) ") below " max-avg-qry ", not starting server...")))))
	  (rmt:open-qry-close-locally cmd run-id params)))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin

Modified runs.scm from [27619ede09] to [5861640706].

158
159
160
161
162
163
164
165
166
167



168
169
170
171
172
173
174
158
159
160
161
162
163
164



165
166
167
168
169
170
171
172
173
174







-
-
-
+
+
+







    (if (> (- currtime lasttime) waitval)
	(begin
	  (hash-table-set! *runs:denoise* key currtime)
	  #t)
	#f)))

(define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)
  (thread-sleep! (cond
		  ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while
		  (else 0)))
  ;;(thread-sleep! (cond
  ;;      	  ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while
  ;;      	  (else 0)))
  (let* ((num-running             (rmt:get-count-tests-running run-id))
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
    (if (> (+ num-running num-running-in-jobgroup) 0)
1115
1116
1117
1118
1119
1120
1121
1122

1123
1124
1125
1126
1127
1128
1129
1115
1116
1117
1118
1119
1120
1121

1122
1123
1124
1125
1126
1127
1128
1129







-
+







	    (if (> (current-seconds)(+ last-time-incomplete 900))
		(begin
		  (debug:print-info 0 "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name))
		  (set! last-time-incomplete (current-seconds))
		  (rmt:find-and-mark-incomplete run-id #f)))
	    (if (not (eq? num-running prev-num-running))
		(debug:print-info 0 "run-wait specified, waiting on " num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))
	    (thread-sleep! 15)
	    (thread-sleep! 5)
	    ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running))))
	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running))))
    ;; LET* ((test-record
    ;; we get here on "drop through". All done!
    (debug:print-info 1 "All tests launched")))

(define (runs:calc-fails prereqs-not-met)

Modified tasks.scm from [b57f2cb57e] to [059408bffa].

49
50
51
52
53
54
55



56
57
58
59
60
61
62
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65







+
+
+







(define (tasks:get-task-db-path)
  (if *task-db*
      (vector-ref *task-db* 1)
      (let* ((linktree     (configf:lookup *configdat* "setup" "linktree"))
	     (dbpath       (conc linktree "/.db/monitor.db")))
	dbpath)))

(define (tasks:wait-on-busy-monitor.db)
  (tasks:wait-on-journal (tasks:get-task-db-path) 30))

;; If file exists AND
;;    file readable
;;         ==> open it
;; If file exists AND
;;    file NOT readable
;;         ==> open in-mem version
;; If file NOT exists