Megatest

Check-in [336e9917b1]
Login
Overview
Comment:locking of main.db nearly complete
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.6584-ck5
Files: files | file ages | folders
SHA1: 336e9917b16585d028b89c0c01debfddf43ae6eb
User & Date: matt on 2021-04-29 09:17:37
Other Links: branch diff | manifest | tags
Context
2021-04-29
21:58
wip check-in: f3260cf6bc user: matt tags: v1.6584-ck5
09:17
locking of main.db nearly complete check-in: 336e9917b1 user: matt tags: v1.6584-ck5
2021-04-28
23:27
wip check-in: a758074358 user: matt tags: v1.6584-ck5
Changes

Modified dbmod.scm from [9c5f03fda2] to [2abeb8436f].

226
227
228
229
230
231
232
233
234



235
236
237
238
239
240
241
242
243
244
245


246

247
248
249
250
251
252
253
254







255
256
257
258
259

260
261


262
263
264
265






266
267
268
269
270
271
272
226
227
228
229
230
231
232


233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248

249








250
251
252
253
254
255
256
257
258
259
260

261
262
263
264
265
266



267
268
269
270
271
272
273
274
275
276
277
278
279







-
-
+
+
+











+
+
-
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+




-
+


+
+

-
-
-
+
+
+
+
+
+







;; Make the dbstruct, call for main db at least once
;; sync disk db to inmem
;;
;; called in http-transport and replicated in rmt.scm for *local* access. 
;;
(define (db:setup run-id)
  (assert *toppath* "FATAL: db:setup called before toppath is available.")
  (let* ((dbstruct (make-dbr:dbstruct)))
    (db:get-dbdat dbstruct *toppath* run-id)
  (let* ((dbstruct (make-dbr:dbstruct))
	 (db-file  (db:run-id->path *toppath* run-id)))
    (db:get-dbdat dbstruct *toppath* db-file)
    (set! *dbstruct-db* dbstruct)
    dbstruct))

;;======================================================================
;; setting/getting a lock on the db for only one server per db
;;
;;  NOTE:
;;       These operate directly on the disk file, NOT on the inmemory db
;;       The lockname is the filename (can have many to one, run-id to fname 
;;======================================================================

;; called before db is open?
;;
(define (db:get-iam-server-lock dbstruct apath run-id)
(define (db:get-iam-server-lock dbh dbfname)
  (let* ((dbh     (db:get-ddb apath dbstruct run-id))
	 (dbfname (db:run-id->path run-id)))
    (sqlite3:with-transaction
     dbh
     (lambda ()
       (let* ((locked (db:get-locker dbh dbfname)))
	 (if (not locked)
	     (db:take-lock dbh dbfname)))))))
  (sqlite3:with-transaction
   dbh
   (lambda ()
     (let* ((locked (db:get-locker dbh dbfname)))
       (if (not locked)
	   (db:take-lock dbh dbfname)
	   #f)))))
	     
;; (exn sqlite3) 
(define (db:get-locker dbh dbfname)
  (condition-case
   (sqlite3:first-row dbh "SELECT owner_id,owner_host,event_time FROM locks WHERE lockname=%;" dbfname)
   (sqlite3:first-row dbh "SELECT owner_pid,owner_host,event_time FROM locks WHERE lockname=%;" dbfname)
   (exn (sqlite3) #f)))

;; should never fail because it is run in a transaction with a test for the lock
;;
(define (db:take-lock dbh dbfname)
  (condition-case
   (sqlite3:first-row dbh "INSERT INTO locks lockname,owner_id,owner_host VALUES (?,?,?);" dbfname (current-process-id) (get-host-name))
   (exn (sqlite3) #f)))
  ;; (condition-case
  ;;  (begin
     (sqlite3:execute dbh "INSERT INTO locks (lockname,owner_pid,owner_host) VALUES (?,?,?);" dbfname (current-process-id) (get-host-name))
   ;;   #t)
     ;; (exn (sqlite3) #f)))
     #t)

(define (db:release-lock dbh dbfname)
  (sqlite3:execute dbh "DELETE FROM locks WHERE lockname=?;" dbfname))

;;======================================================================
;; SQLITE3 HELPERS
;;======================================================================

Modified http-transportmod.scm from [a5cc6ea588] to [51f05a712a].

94
95
96
97
98
99
100
101

102
103
104
105
106
107
108
94
95
96
97
98
99
100

101
102
103
104
105
106
107
108







-
+







;; 
;; (use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing)
;; 
;; Configurations for server
(tcp-buffer-size 2048)
(max-connections 2048) 

(defstruct sdat
(defstruct servdat
  host
  port
  uuid)

(define (http-transport:make-server-url hostport)
  (if (not hostport)
      #f
224
225
226
227
228
229
230
231

232
233
234
235
236
237
238
224
225
226
227
228
229
230

231
232
233
234
235
236
237
238







-
+







		
		;; get_next_port goes here
		(http-transport:try-start-server ipaddrstr
						 (portlogger:open-run-close portlogger:find-port)))
	      (begin
		(print "ERROR: Tried and tried but could not start the server"))))
      ;; any error in following steps will result in a retry
      (set! *server-info* (make-sdat host: ipaddrstr port: portnum))
      (set! *server-info* (make-servdat host: ipaddrstr port: portnum))
      (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum)
      ;; This starts the spiffy server
      ;; NEED WAY TO SET IP TO #f TO BIND ALL
      ;; (start-server bind-address: ipaddrstr port: portnum)
      (if config-hostname ;; this is a hint to bind directly
	  (start-server port: portnum bind-address: (if (equal? config-hostname "-")
							ipaddrstr
445
446
447
448
449
450
451









452
453
454
455
456
457
458
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467







+
+
+
+
+
+
+
+
+







	 (api-req      (make-request method: 'POST uri: api-uri))
	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds) server-id)))
    server-dat))

;;======================================================================
;; NEW SERVER METHOD
;;======================================================================

;; only use for main.db - need to re-write some of this :(
;;
(define (get-lock-db sdat dbfile)
  (let* ((dbh (db:open-run-db dbfile db:initialize-db))
	 (res (db:get-iam-server-lock dbh dbfile)))
    (sqlite3:finalize! dbh)
    res))


(define *srvpktspec*
  `((server (host    . h)
	    (port    . p)
	    (servkey . k)
	    (pid     . i)
	    (ipaddr  . a)
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
478
479
480
481
482
483
484





485
486
487
488
489
490
491







-
-
-
-
-







		   pkts-dir
		   pkt-dat
		   pktspec: pkt-spec
		   ptype: 'server)))
    (debug:print 0 *default-log-port* "Server on "host":"port" registered in pkt "uuid)
    uuid))

;; ya, fake it for now
;;
(define (register-server-in-db db-file)
  #t)

(define (get-pkts-dir #!optional (apath #f))
  (let* ((effective-toppath (or *toppath* apath)))
    (assert effective-toppath
	    "ERROR: get-pkts-dir called without *toppath* set. Exiting.")
    (let* ((pdir (conc effective-toppath "/.meta/srvpkts")))
      (if (file-exists? pdir)
	  pdir
610
611
612
613
614
615
616
617

618
619
620
621
622


623
624
625
626
627
628
629
630
631

632
633

634






635

636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655


656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675

676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698




699
700
701
702
703
704



705
706
707
708
709
710
711
712
713
714
715
716
717
718
719



720
721
722
723
724
725
726
614
615
616
617
618
619
620

621
622
623
624


625
626
627
628
629
630
631
632
633
634

635


636
637
638
639
640
641
642
643

644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662


663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683

684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703




704
705
706
707
708
709
710



711
712
713
714
715
716
717
718
719
720
721
722
723
724
725



726
727
728
729
730
731
732
733
734
735







-
+



-
-
+
+








-
+
-
-
+

+
+
+
+
+
+
-
+


















-
-
+
+



















-
+



















-
-
-
-
+
+
+
+



-
-
-
+
+
+












-
-
-
+
+
+







				;; create a server pkt in *toppath*/.meta/srvpkts

				;; TODO:
				;;   1. change sdat to stuct
				;;   2. add uuid to struct
				;;   3. update uuid in sdat here
				;;
				(sdat-uuid-set! sdat
				(servdat-uuid-set! sdat
						(register-server
						 pkts-dir *srvpktspec*
						 (get-host-name)
						 (sdat-port sdat) server-key
						 (sdat-host sdat) db-file))
						 (servdat-port sdat) server-key
						 (servdat-host sdat) db-file))

				;; now read pkts and see if we are a contender
				(let* ((all-pkts     (get-all-server-pkts pkts-dir *srvpktspec*))
				       (viables      (get-viable-servers all-pkts db-file))
				       (best-srv     (get-best-candidate viables db-file))
				       (best-srv-key (if best-srv (alist-ref 'servkey best-srv) #f)))
				  (debug:print 0 *default-log-port* "best-srv-key: "best-srv-key", server-key: "server-key)
				  ;; am I the best-srv, compare server-keys to know
				  (if (and (equal? best-srv-key server-key)
				  (if (equal? best-srv-key server-key)
					   (register-server-in-db db-file))
				      (if (db:get-iam-server-lock *dbstruct-db* *toppath* run-id)
				      (if (get-lock-db sdat db-file) ;; (db:get-iam-server-lock *dbstruct-db* *toppath* run-id)
					  (debug:print 0 *default-log-port* "I'm the server!")
					  (begin
					    (debug:print 0 *default-log-port* "I'm not the server, exiting.")
					    (bdat-time-to-exit-set! *bdat* #t)
					    (exit)))
				      (begin
					(debug:print 0 *default-log-port* "Keys do not match "best-srv-key", "server-key", exiting.")
					  (bdat-time-to-exit-set! *bdat* #t))) ;; nope, we are not needed, exit when can do
					(bdat-time-to-exit-set! *bdat* #t)))
				  sdat))
                              (begin
				(debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
                                (sleep 4)
				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
				    (begin
				      (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
                                      #;(common:save-pkt `((action . died)
                                                         (T      . server)
                                                         (pid    . ,(current-process-id))
                                                         (ipaddr . ,(car sdat))
                                                         (port   . ,(cadr sdat))
                                                         (msg    . "Transport died?"))
						       *configdat* #t)
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
	 (iface       (car server-info))
         (port        (cadr server-info))
	 (iface       (servdat-host server-info))
         (port        (servdat-port server-info))
         (last-access 0)
	 (server-timeout (server:expiration-timeout))
	 (server-log-file (args:get-arg "-log"))) ;; always set when we are a server

    (let loop ((count         0)
	       (server-state 'available)
	       (bad-sync-count 0)
	       (start-time     (current-milliseconds)))
      ;; Use this opportunity to sync the tmp db to megatest.db
      (if (not *dbstruct-db* )
	  (let ((watchdog (bdat-watchdog *bdat*)))
	    (debug:print 0 *default-log-port* "SERVER: dbprep")
	    
	    (db:setup dbname) ;; sets *dbstruct-db* as side effect

	    (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine.
	    (if watchdog
		(if (not (member (thread-state watchdog) '(ready running blocked sleeping dead)))
		    (begin
		      (debug:print-info 0 "Starting watchdog thread (in state "(thread-state watchdog)")")
		      (debug:print-info 0 *default-log-port* "Starting watchdog thread (in state "(thread-state watchdog)")")
		      (thread-start! watchdog)))
		(debug:print 0 *default-log-port* "ERROR: *watchdog* not setup, cannot start it."))))
      
      ;; when things go wrong we don't want to be doing the various queries too often
      ;; so we strive to run this stuff only every four seconds or so.
      (let* ((sync-time (- (current-milliseconds) start-time))
	    (rem-time  (quotient (- 4000 sync-time) 1000)))
	(if (and (<= rem-time 4)
		 (>  rem-time 0))
	    (thread-sleep! rem-time)))
      
      (if (< count 1) ;; 3x3 = 9 secs aprox
	  (loop (+ count 1) 'running bad-sync-count (current-milliseconds)))
      
      ;; Check that iface and port have not changed (can happen if server port collides)
      (mutex-lock! *heartbeat-mutex*)
      (set! sdat *server-info*)
      (mutex-unlock! *heartbeat-mutex*)
      
      (if (or (not (equal? (sdat-host sdat) iface))
	      (not (equal? (sdat-port sdat) port)))
	  (let ((new-iface (car sdat))
		(new-port  (cadr sdat)))
      (if (or (not (equal? (servdat-host sdat) iface))
	      (not (equal? (servdat-port sdat) port)))
	  (let ((new-iface (servdat-host sdat))
		(new-port  (servdat-port sdat)))
	    (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info")
	    (set! iface new-iface)
	    (set! port  new-port)
             (if (not *server-id*)
              (set! *server-id* (server:mk-signature)))
            (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))
	    (if (not *server-id*)
		(set! *server-id* (server:mk-signature)))
            ;; (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))
	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
	    (flush-output *default-log-port*)))
      
      ;; Transfer *db-last-access* to last-access to use in checking that we are still alive
      (mutex-lock! *heartbeat-mutex*)
      (set! last-access *db-last-access*)
      (mutex-unlock! *heartbeat-mutex*)
      
      (if (common:low-noise-print 120 (conc "server running on " iface ":" port))
	  (begin
             (if (not *server-id*)
              (set! *server-id* (server:mk-signature)))
            (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))   
	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
	    (flush-output *default-log-port*)))
	     ;; (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))   
	     (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
	     (flush-output *default-log-port*)))
      (if (common:low-noise-print 60 "dbstats")
	  (begin
	    (debug:print 0 *default-log-port* "Server stats:")
	    (db:print-current-query-stats)))
      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))
	(cond
         ((and *server-run*

Modified launchmod.scm from [6c6ba3f325] to [ba5b81a174].

461
462
463
464
465
466
467
468

469
470
471
472
473
474
475
461
462
463
464
465
466
467

468
469
470
471
472
473
474
475







-
+







	  (launch:setup) ;; should be properly in the run area home now
          
	  (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path
	  (let ((sighand (lambda (signum)
			   ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
			   (if (eq? signum signal/stop)
			       (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting."))
			   (set! (bdat-time-to-exit *bdat*) #t)
			   (bdat-time-to-exit-set! *bdat* #t)
			   (print "Received signal " signum ", cleaning up before exit (set this test to COMPLETED/ABORT) . Please wait...")
			   (let ((th1 (make-thread (lambda ()
                                                     (print "set test to COMPLETED/ABORT begin.")
						     (rmt:test-set-state-status run-id test-id "COMPLETED" "ABORT" "received kill signal")
                                                     (print "set test to COMPLETED/ABORT complete.")
						     (print "Killed by signal " signum ". Exiting")
						     (exit 1))))

Modified rmtmod.scm from [0f70a2dcff] to [9c11844c69].

1765
1766
1767
1768
1769
1770
1771






1772
1773
1774
1775
1776
1777
1778
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784







+
+
+
+
+
+







			 (bdat-time-to-exit-set! *bdat* #t)
			 #t))))
    (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
    (if (and no-hurry (debug:debug-mode 18))
	(rmt:print-db-stats))
    (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
                              (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
			      (if *server-info*
				  (let ((pkt-file (conc (get-pkts-dir *toppath*)
							"/" (servdat-uuid *server-info*)
							".pkt")))
				    (debug:print-info 0 *default-log-port* "removing pkt "pkt-file)
				    (delete-file* pkt-file)))
			      (if (bdat-task-db *bdat*)    
				  (let ((db (cdr (bdat-task-db *bdat*))))
				    (if (sqlite3:database? db)
					(begin
					  (sqlite3:interrupt! db)
					  (sqlite3:finalize! db #t)
					  ;; (vector-set! (bdat-task-db *bdat*) 0 #f)