Megatest

Changes On Branch 076441b7a672d6dd
Login

Changes In Branch inmem-per-run-db-per-run-server Through [076441b7a6] Excluding Merge-Ins

This is equivalent to a diff from 6ce4b508b6 to 076441b7a6

2013-12-09
08:54
fixed typo check-in: f6c7f13b18 user: mrwellan tags: inmem-per-run-db
2013-12-01
23:06
Coarse grained migration of accessing server via run-id index into *runremote* check-in: 1aa5496577 user: matt tags: inmem-per-run-db-per-run-server
21:28
Conversion to inmem-per-run-db-per-run-server check-in: 076441b7a6 user: matt tags: inmem-per-run-db-per-run-server
2013-11-30
22:00
More fixes check-in: 6ce4b508b6 user: matt tags: inmem-per-run-db
21:57
More fixes check-in: 96ae57fada user: matt tags: inmem-per-run-db

Modified common.scm from [a28c1ed00f] to [0ebd6dd938].

57
58
59
60
61
62
63

64
65
66
67
68
69
70
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71







+







(define *server-info*       #f)
(define *time-to-exit*      #f)
(define *received-response* #f)
(define *default-numtries*  10)
(define *server-run*        #t)
(define *db-write-access*   #t)
(define *inmemdb*           #f)
(define *run-id*            #f)

(define *target*            (make-hash-table)) ;; cache the target here; target is keyval1/keyval2/.../keyvalN
(define *keys*              (make-hash-table)) ;; cache the keys here
(define *keyvals*           (make-hash-table))
(define *toptest-paths*     (make-hash-table)) ;; cache toptest path settings here
(define *test-paths*        (make-hash-table)) ;; cache test-id to test run paths here
(define *test-ids*          (make-hash-table)) ;; cache run-id, testname, and item-path => test-id

Modified dashboard-tests.scm from [78c574598f] to [0f4d58ff5b].

138
139
140
141
142
143
144
145

146
147
148
149
150
151
152
138
139
140
141
142
143
144

145
146
147
148
149
150
151
152







-
+







;;======================================================================
;; Run info panel
;;======================================================================
(define (run-info-panel db keydat testdat runname)
  (let* ((run-id     (db:test-get-run_id testdat))
	 (rundat     (db:get-run-info db run-id))
	 (header     (db:get-header rundat))
	 (event_time (db:get-value-by-header (db:get-row rundat)
	 (event_time (db:get-value-by-header (db:get-rows rundat)
					     (db:get-header rundat)
					     "event_time")))
    (iup:frame 
     #:title "Megatest Run Info" ; #:expand "YES"
     (iup:hbox ; #:expand "YES"
      (apply iup:vbox ; #:expand "YES"
	     (append (map (lambda (keyval)
460
461
462
463
464
465
466
467

468
469
470
471
472
473
474
460
461
462
463
464
465
466

467
468
469
470
471
472
473
474







-
+







    (if (not testdat)
	(begin
	  (debug:print 2 "ERROR: No test data found for test " test-id ", exiting")
	  (exit 1))
	(let* (;; (run-id        (if testdat (db:test-get-run_id testdat) #f))
	       (keydat        (if testdat (db:get-key-val-pairs dbstruct run-id) #f))
	       (rundat        (if testdat (db:get-run-info dbstruct run-id) #f))
	       (runname       (if testdat (db:get-value-by-header (db:get-row rundat)
	       (runname       (if testdat (db:get-value-by-header (db:get-rows rundat)
								  (db:get-header rundat)
								  "runname") #f))
	       (tdb           (tdb:open-test-db-by-test-id-local dbstruct run-id test-id))
	       ;; These next two are intentional bad values to ensure errors if they should not
	       ;; get filled in properly.
	       (logfile       "/this/dir/better/not/exist")
	       (rundir        logfile)

Modified db.scm from [2818f0f5b2] to [46374c3bd7].

168
169
170
171
172
173
174
175

176
177
178
179
180
181
182
168
169
170
171
172
173
174

175
176
177
178
179
180
181
182







-
+







	  (if (not dbexists)
	      (db:initialize-main-db db))
	  (dbr:dbstruct-set-main! dbstruct db)
	  db))))

;; Make the dbstruct, setup up auxillary db's and call for main db at least once
;;
(define (db:setup #!key (local #f))
(define (db:setup run-id #!key (local #f))
  (let ((dbstruct (make-dbr:dbstruct path: *toppath* local: local)))
    (db:get-db dbstruct #f) ;; force one call to main
    ;; (if (not sdb:qry)
    ;;     (begin
    ;;       (set! sdb:qry (make-sdb:qry (conc *toppath* "/db/strings.db"))) ;; we open the normalization helpers here
    ;;       (sdb:qry 'setup #f)
    ;;       ;; Initialize with some known needed strings, NOTE: set this up to only execute on first db initialization
770
771
772
773
774
775
776
777

778
779
780
781
782
783
784
785
786
787





788
789
790
791
792
793
794
770
771
772
773
774
775
776

777
778

779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798







-
+

-








+
+
+
+
+







		       (lambda (key)
			 (set! res (cons key res)))
		       (db:get-db dbstruct #f)
		       "SELECT fieldname FROM keys ORDER BY id DESC;")))
	(set! *db-keys* res)
	res)))

;; 
;; look up values in a header/data structure
(define (db:get-value-by-header row header field)
  (debug:print-info 4 "db:get-value-by-header row: " row " header: " header " field: " field)
  (if (null? header) #f
      (let loop ((hed (car header))
		 (tal (cdr header))
		 (n   0))
	(if (equal? hed field)
	    (vector-ref row n)
	    (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1)))))))

;; Accessors for the header/data structure
;; get rows and header from 
(define (db:get-header vec)(vector-ref vec 0))
(define (db:get-rows   vec)(vector-ref vec 1))

;;======================================================================
;;  R U N S
;;======================================================================

(define (db:get-run-name-from-id dbstruct run-id)
  (let ((res #f))
    (sqlite3:for-each-row
973
974
975
976
977
978
979
980

981
982
983
984
985
986
987
977
978
979
980
981
982
983

984
985
986
987
988
989
990
991







-
+







     runs-info)
    res))

;; db:get-runs-by-patt
;; get runs by list of criteria
;; register a test run with the db
;;
;; Use: (db-get-value-by-header (db:get-header runinfo)(db:get-row runinfo))
;; Use: (db:get-value-by-header (db:get-header runinfo)(db:get-rows runinfo))
;;  to extract info from the structure returned
;;
(define (db:get-runs-by-patt dbstruct keys runnamepatt targpatt offset limit) ;; test-name)
  (let* ((tmp      (runs:get-std-run-fields keys '("id" "runname" "state" "status" "owner" "event_time")))
	 (keystr   (car tmp))
	 (header   (cadr tmp))
	 (res     '())
1011
1012
1013
1014
1015
1016
1017
1018

1019
1020
1021
1022
1023
1024
1025
1015
1016
1017
1018
1019
1020
1021

1022
1023
1024
1025
1026
1027
1028
1029







-
+







		   (lambda (a . r)
		     (set! res (cons (list->vector (cons a r)) res)))
		   (db:get-db dbstruct #f)
		   qry-str
		   runnamepatt)))
    (vector header res)))

;; use (get-value-by-header (db:get-header runinfo)(db:get-row runinfo))
;; use (get-value-by-header (db:get-header runinfo)(db:get-rows runinfo))
(define (db:get-run-info dbstruct run-id)
  ;;(if (hash-table-ref/default *run-info-cache* run-id #f)
  ;;    (hash-table-ref *run-info-cache* run-id)
  (let* ((res       (vector #f #f #f #f))
	 (keys      (db:get-keys dbstruct))
	 (remfields (list "id" "runname" "state" "status" "owner" "event_time"))
	 (header    (append keys remfields))

Modified db_records.scm from [832e173195] to [8182037580].

107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
107
108
109
110
111
112
113




114
115
116
117
118
119
120







-
-
-
-







(define-inline (db:test-set-diskfree! vec val)(vector-set! vec 8 val))
(define-inline (db:test-set-testname! vec val)(vector-set! vec 2 val))
(define-inline (db:test-set-state!    vec val)(vector-set! vec 3 val))
(define-inline (db:test-set-status!   vec val)(vector-set! vec 4 val))
(define-inline (db:test-set-run_duration! vec val)(vector-set! vec 12 val))
(define-inline (db:test-set-final_logf! vec val)(vector-set! vec 13 val))

;; get rows and header from 
(define-inline (db:get-header vec)(vector-ref vec 0))
(define-inline (db:get-rows   vec)(vector-ref vec 1))

;; make-vector-record "" db mintest id run_id testname state status event_time item_path
;;
(define (make-db:mintest)(make-vector 7))
(define-inline (db:mintest-get-id           vec)    (vector-ref  vec 0))
(define-inline (db:mintest-get-run_id       vec)    (vector-ref  vec 1))
(define-inline (db:mintest-get-testname     vec)    (vector-ref  vec 2))
(define-inline (db:mintest-get-state        vec)    (vector-ref  vec 3))
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
203
204
205
206
207
208
209



210
211
212
213
214
215
216







-
-
-







(define-inline (tdb:steps-table-get-runtime    vec)    (vector-ref  vec 4))
(define-inline (tdb:step-stable-set-stepname!  vec val)(vector-set! vec 0 val))
(define-inline (tdb:step-stable-set-start!     vec val)(vector-set! vec 1 val))
(define-inline (tdb:step-stable-set-end!       vec val)(vector-set! vec 2 val))
(define-inline (tdb:step-stable-set-status!    vec val)(vector-set! vec 3 val))
(define-inline (tdb:step-stable-set-runtime!   vec val)(vector-set! vec 4 val))

;; use this one for db-get-run-info
(define-inline (db:get-row    vec)(vector-ref vec 1))

;; The data structure for handing off requests via wire
(define (make-cdb:packet)(make-vector 6))
(define-inline (cdb:packet-get-client-sig   vec)    (vector-ref  vec 0))
(define-inline (cdb:packet-get-qtype        vec)    (vector-ref  vec 1))
(define-inline (cdb:packet-get-immediate    vec)    (vector-ref  vec 2))
(define-inline (cdb:packet-get-query-sig    vec)    (vector-ref  vec 3))
(define-inline (cdb:packet-get-params       vec)    (vector-ref  vec 4))

Modified http-transport.scm from [ac12c8febe] to [2e4f53bbc7].

56
57
58
59
60
61
62
63

64
65
66
67
68
69
70
56
57
58
59
60
61
62

63
64
65
66
67
68
69
70







-
+







	   (set! res adr)))
     (vector->list (hostinfo-addresses (hostname->hostinfo hostname))))
    (string-intersperse 
     (map number->string
	  (u8vector->list
	   (if res res (hostname->ip hostname)))) ".")))

(define (http-transport:run hostn)
(define (http-transport:run hostn run-id server-id)
  (debug:print 2 "Attempting to start the server ...")
  (if (not *toppath*)
      (if (not (setup-for-run))
	  (begin
	    (debug:print 0 "ERROR: cannot find megatest.config, cannot start server, exiting")
	    (exit))))
  (let* (;; (iface           (if (string=? "-" hostn)
139
140
141
142
143
144
145
146

147
148
149
150

151
152
153
154
155
156
157
158
159
160
161

162
163
164
165
166

167
168
169


170
171
172
173
174
175
176
139
140
141
142
143
144
145

146
147
148
149

150
151
152
153
154
155
156
157
158



159
160
161
162


163
164


165
166
167
168
169
170
171
172
173







-
+



-
+








-
-
-
+



-
-
+

-
-
+
+







				   (send-response body: "hey there!\n"
						  headers: '((content-type text/plain))))
				  ((equal? (uri-path (request-uri (current-request))) 
					   '(/ "hey"))
				   (send-response body: "hey there!\n"
						  headers: '((content-type text/plain))))
				  (else (continue))))))))
    (http-transport:try-start-server ipaddrstr start-port)))
    (http-transport:try-start-server ipaddrstr start-port server-id)))

;; This is recursively run by http-transport:run until sucessful
;;
(define (http-transport:try-start-server ipaddrstr portnum)
(define (http-transport:try-start-server ipaddrstr portnum server-id)
  (handle-exceptions
   exn
   (begin
     (print-error-message exn)
     (if (< portnum 9000)
	 (begin 
	   (debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port")
	   (thread-sleep! 0.1)
	   ;; (open-run-close tasks:remove-server-records tasks:open-db)
	   (open-run-close tasks:server-delete tasks:open-db ipaddrstr portnum)
	   (http-transport:try-start-server ipaddrstr (+ portnum 1)))
	   (http-transport:try-start-server ipaddrstr (+ portnum 1) server-id))
	 (print "ERROR: Tried and tried but could not start the server")))
   ;; any error in following steps will result in a retry
   (set! *runremote* (list ipaddrstr portnum))
   ;; (open-run-close tasks:remove-server-records tasks:open-db)
   (open-run-close tasks:server-register 
   (open-run-close tasks:server-set-interface-port 
		   tasks:open-db 
		   (current-process-id)
		   ipaddrstr portnum 0 'startup 'http)
		   server-id 
		   ipaddrstr portnum)
   (debug:print 1 "INFO: Trying to start server on " ipaddrstr ":" portnum)
   ;; This starts the spiffy server
   ;; NEED WAY TO SET IP TO #f TO BIND ALL
   (start-server bind-address: ipaddrstr port: portnum)
   (open-run-close tasks:server-delete tasks:open-db ipaddrstr portnum)
   (debug:print 1 "INFO: server has been stopped")))

395
396
397
398
399
400
401
402

403
404
405
406



407
408
409
410
411



412
413
414


415

416
417
418
419
420
421
422
423
424
425
426
427
428

429
430
431
432
433
434
435
392
393
394
395
396
397
398

399
400
401
402

403
404
405
406
407
408
409

410
411
412
413
414
415
416
417

418
419
420
421
422


423
424
425
426
427
428

429
430
431
432
433
434
435
436







-
+



-
+
+
+




-
+
+
+



+
+
-
+




-
-






-
+







;; 	  (set! *transport-type* 'fs)
;; 	  #f))))


;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running)
(define (http-transport:keep-running server-id)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (let* ((server-info (let loop ()
  (let* ((server-info (let loop ((start-time (current-seconds))
				 (changed    #t)
				 (last-sdat  "not this"))
                        (let ((sdat #f))
                          (mutex-lock! *heartbeat-mutex*)
                          (set! sdat *runremote*)
                          (mutex-unlock! *heartbeat-mutex*)
                          (if sdat
                          (if (and sdat
				   (not changed)
				   (> (- (current-seconds) start-time) 2))
			      sdat
                              (begin
                                (sleep 4)
                                (loop start-time
				      (equal? sdat last-sdat)
                                (loop))))))
				      sdat))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdb         (tasks:open-db))
	 (spid        ;;(open-run-close tasks:server-get-server-id tasks:open-db #f iface port #f))
	   (tasks:server-get-server-id tdb #f iface port #f))
	 (server-timeout (let ((tmo (config-lookup  *configdat* "server" "timeout")))
			   (if (and (string? tmo)
				    (string->number tmo))
			       (* 60 60 (string->number tmo))
			       ;; default to three days
			       (* 3 24 60 60)))))
    (debug:print-info 2 "server-timeout: " server-timeout ", server pid: " spid " on " iface ":" port)
    (tasks:server-set-state! tdb server-id "running")
    (let loop ((count 0))
      ;; Use this opportunity to sync the inmemdb to db
      (let ((start-time (current-milliseconds))
	    (sync-time  #f)
	    (rem-time   #f))
	(if *inmemdb* (db:sync-touched *inmemdb* force-sync: #t))
	(set! sync-time  (- (current-milliseconds) start-time))
447
448
449
450
451
452
453
454

455
456
457
458

459
460
461
462
463

464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485


486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501

502
503
504
505


506
507
508
509
510
511
512
513
514

515
516
517
518




519
520
521
522
523


524
525




526
527

528
529
530
531
532
533
534
448
449
450
451
452
453
454

455
456
457
458

459

460
461
462

463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483


484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505

506
507
508
509
510
511
512
513
514
515

516




517
518
519
520
521
522
523
524
525
526
527


528
529
530
531
532

533
534
535
536
537
538
539
540







-
+



-
+
-



-
+




















-
-
+
+
















+



-
+
+








-
+
-
-
-
-
+
+
+
+





+
+
-
-
+
+
+
+

-
+







      
      ;; Check that iface and port have not changed (can happen if server port collides)
      (mutex-lock! *heartbeat-mutex*)
      (set! sdat *runremote*)
      (mutex-unlock! *heartbeat-mutex*)
      
      (if (or (not (equal? sdat (list iface port)))
	      (not spid))
	      (not server-id))
	  (begin 
	    (debug:print-info 0 "interface changed, refreshing iface and port info")
	    (set! iface (car sdat))
	    (set! port  (cadr sdat))
	    (set! port  (cadr sdat))))
	    (set! spid  (tasks:server-get-server-id tdb #f iface port #f))))
      
      ;; NOTE: Get rid of this mechanism! It really is not needed...
      ;; (open-run-close tasks:server-update-heartbeat tasks:open-db spid)
      (tasks:server-update-heartbeat tdb spid)
      (tasks:server-update-heartbeat tdb server-id)
      
      ;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access

      ;; Transfer *last-db-access* to last-access to use in checking that we are still alive
      (mutex-lock! *heartbeat-mutex*)
      (set! last-access *last-db-access*)
      (mutex-unlock! *heartbeat-mutex*)

      ;; (debug:print 11 "last-access=" last-access ", server-timeout=" server-timeout)
      (if (and *server-run*
	       (> (+ last-access server-timeout)
		  (current-seconds)))
	  (begin
	    (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	    (loop 0))
	  (begin
	    (debug:print-info 0 "Starting to shutdown the server.")
	    ;; need to delete only *my* server entry (future use)
	    (set! *time-to-exit* #t)
	    (if *inmemdb* (db:sync-touched *inmemdb* force-sync: #t))
	    (open-run-close tasks:server-deregister-self tasks:open-db (get-host-name))
	    (thread-sleep! 1)
	    ( tasks:server-set-state! tdb server-id "shutting-down")
	    (thread-sleep! 5)
	    (debug:print-info 0 "Max cached queries was    " *max-cache-size*)
	    (debug:print-info 0 "Number of cached writes   " *number-of-writes*)
	    (debug:print-info 0 "Average cached write time "
			      (if (eq? *number-of-writes* 0)
				  "n/a (no writes)"
				  (/ *writes-total-delay*
				     *number-of-writes*))
			      " ms")
	    (debug:print-info 0 "Number non-cached queries "  *number-non-write-queries*)
	    (debug:print-info 0 "Average non-cached time   "
			      (if (eq? *number-non-write-queries* 0)
				  "n/a (no queries)"
				  (/ *total-non-write-delay* 
				     *number-non-write-queries*))
			      " ms")
	    (debug:print-info 0 "Server shutdown complete. Exiting")
	    (tasks:server-delete-record! tdb server-id)
	    (exit))))))

;; all routes though here end in exit ...
(define (http-transport:launch)
(define (http-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (not *toppath*)
      (if (not (setup-for-run))
	  (begin
	    (debug:print 0 "ERROR: cannot find megatest.config, exiting")
	    (exit))))
  (debug:print-info 2 "Starting the standalone server")
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (let ((hostinfo (open-run-close tasks:get-best-server tasks:open-db)))
  (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)))
    (debug:print 11 "http-transport:launch hostinfo=" hostinfo)
    ;; #(1 "143.182.207.24" 5736 -1 "http" 22771 "hostname")
    (if hostinfo
	(debug:print-info 2 "NOT starting new server, one is already running on " (vector-ref hostinfo 1) ":" (vector-ref hostinfo 2))
    (if (not server-id)
	(begin
	  (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db))
	(if *toppath* 
	    (let* ((th2 (make-thread (lambda ()
				       (http-transport:run 
					(if (args:get-arg "-server")
					    (args:get-arg "-server")
					    "-")
					run-id
					    "-"))) "Server run"))
		   (th3 (make-thread http-transport:keep-running "Keep running")))
					server-id)) "Server run"))
		   (th3 (make-thread (lambda ()
				       (http-transport:keep-running server-id))
				     "Keep running")))
	      ;; Database connection
	      (set! *inmemdb*  (db:setup))
	      (set! *inmemdb*  (db:setup run-id))
	      (thread-start! th2)
	      (thread-start! th3)
	      (set! *didsomething* #t)
	      (thread-join! th2))
	    (debug:print 0 "ERROR: Failed to setup for megatest")))
    ;; (sdb:qry 'finalize)
    (exit)))

Modified launch.scm from [7c2ca47271] to [01eba87552].

473
474
475
476
477
478
479
480

481
482
483
484
485
486
487
473
474
475
476
477
478
479

480
481
482
483
484
485
486
487







-
+







;; 
;; All log file links should be stored relative to the top of link path
;;  
;; <target> - <testname> [ - <itempath> ] 
;;
(define (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat)
  (let* ((item-path (item-list->path itemdat))
	 (runname  (db:get-value-by-header (db:get-row run-info)
	 (runname  (db:get-value-by-header (db:get-rows run-info)
					   (db:get-header run-info)
					   "runname"))
	 ;; convert back to db: from rdb: - this is always run at server end
	 (target   (string-intersperse (map cadr keyvals) "/"))

	 (not-iterated  (equal? "" item-path))

Modified megatest.scm from [d71e637572] to [8e86c2252f].

28
29
30
31
32
33
34

35
36
37
38
39
40
41
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42







+







(declare (uses daemon))
(declare (uses db))
;; (declare (uses sdb))
;; (declare (uses filedb))
(declare (uses tdb))
(declare (uses mt))
(declare (uses api))
(declare (uses tasks)) ;; only used for debugging.

(define *db* #f) ;; this is only for the repl, do not use in general!!!!

(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "run_records.scm")
199
200
201
202
203
204
205

206
207
208
209
210
211
212
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214







+







			"-set-state-status"
			"-debug" ;; for *verbosity* > 2
			"-gen-megatest-test"
			"-override-timeout"
			"-test-files"  ;; -test-paths is for listing all
			"-load"        ;; load and exectute a scheme file
			"-dumpmode"
			"-run-id"
			) 
		 (list  "-h"
			"-version"
		        "-force"
		        "-xterm"
		        "-showkeys"
		        "-show-keys"
341
342
343
344
345
346
347
348
349
350







351
352
353
354
355
356
357
343
344
345
346
347
348
349



350
351
352
353
354
355
356
357
358
359
360
361
362
363







-
-
-
+
+
+
+
+
+
+








(if (args:get-arg "-server")

    ;; Server? Start up here.
    ;;
    (let ((tl        (setup-for-run))
	  (transport (or (configf:lookup *configdat* "setup" "transport")
			 (args:get-arg "-transport" "http"))))
      (debug:print 2 "Launching server using transport " transport)
      (server:launch (string->symbol transport)))
			 (args:get-arg "-transport" "http")))
	  (run-id    (and (args:get-arg "-run-id")
			  (string->number (args:get-arg "-run-id")))))
      (debug:print 2 "Launching server using transport " transport " for run-id=" run-id)
      (if run-id
	  (server:launch (string->symbol transport) run-id)
	  (debug:print 0 "ERROR: server requires run-id be specified with -run-id")))

    ;; Not a server? This section will decide how to communicate
    ;;
    ;;  Setup client for all expect listed here
    (if (null? (lset-intersection 
		     equal?
		     (hash-table-keys args:arg-hash)

Modified mt.scm from [78f8ad9fdf] to [029729b148].

35
36
37
38
39
40
41
42

43
44
45
46
47
48
49
35
36
37
38
39
40
41

42
43
44
45
46
47
48
49







-
+







;;  R U N S
;;======================================================================

;; runs:get-runs-by-patt
;; get runs by list of criteria
;; register a test run with the db
;;
;; Use: (db-get-value-by-header (db:get-header runinfo)(db:get-row runinfo))
;; Use: (db-get-value-by-header (db:get-header runinfo)(db:get-rows runinfo))
;;  to extract info from the structure returned
;;
(define (mt:get-runs-by-patt keys runnamepatt targpatt)
  (let loop ((runsdat  (rmt:get-runs-by-patt keys runnamepatt targpatt 0 500))
	     (res      '())
	     (offset   0)
	     (limit    500))

Modified server.scm from [ddc244d255] to [be1ec84c84].

40
41
42
43
44
45
46
47

48
49
50
51
52
53
54
55
56
57
58


59
60
61
62
63
64
65
40
41
42
43
44
45
46

47
48
49
50
51
52
53
54
55
56


57
58
59
60
61
62
63
64
65







-
+









-
-
+
+







;; S E R V E R
;;======================================================================

;; Call this to start the actual server
;;

;; all routes though here end in exit ...
(define (server:launch transport)
(define (server:launch transport run-id)
  (if (not *toppath*)
      (if (not (setup-for-run))
	  (begin
	    (debug:print 0 "ERROR: cannot find megatest.config, exiting")
	    (exit))))
  (debug:print-info 2 "Starting server using " transport " transport")
  (set! *transport-type* transport)
  (case transport
    ;; ((fs)   (exit)) ;; there is no "fs" server transport
    ((fs http) (http-transport:launch))
    ((zmq)     (zmq-transport:launch))
    ((fs http) (http-transport:launch run-id))
    ((zmq)     (zmq-transport:launch run-id))
    (else
     (debug:print "WARNING: unrecognised transport " transport)
     (exit))))

;;======================================================================
;; Q U E U E   M A N A G E M E N T
;;======================================================================

Modified tasks.scm from [c9fadd13bc] to [318318de38].

62
63
64
65
66
67
68
69
70


71
72
73
74
75
76
77
62
63
64
65
66
67
68


69
70
71
72
73
74
75
76
77







-
-
+
+







                                  pubport INTEGER,
                                  start_time TIMESTAMP,
                                  priority INTEGER,
                                  state TEXT,
                                  mt_version TEXT,
                                  heartbeat TIMESTAMP,
                                  transport TEXT,
                                  run_id INTEGER,
                               CONSTRAINT servers_constraint UNIQUE (pid,hostname,port));")
                                  run_id INTEGER);")
;;                               CONSTRAINT servers_constraint UNIQUE (pid,hostname,port));")
	  (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS clients (id INTEGER PRIMARY KEY,
                                  server_id INTEGER,
                                  pid INTEGER,
                                  hostname TEXT,
                                  cmdline TEXT,
                                  login_time TIMESTAMP,
                                  logout_time TIMESTAMP DEFAULT -1,
89
90
91
92
93
94
95
96
97
98










99
100
101
102
103
104
105
106











107
108
109
110

111
112
113
114
115
116
117
118
119
120






121
122
123


124
125
126
127





128
129
130


131
132
133
134
135
136























137
138



139
140
141


142
143
144

145
146
147
148

149
150
151
152

153
154
155
156
157
158
159
89
90
91
92
93
94
95



96
97
98
99
100
101
102
103
104
105
106
107






108
109
110
111
112
113
114
115
116
117
118




119


120
121






122
123
124
125
126
127



128
129




130
131
132
133
134
135


136
137
138





139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161


162
163
164
165


166
167
168


169




170




171
172
173
174
175
176
177
178







-
-
-
+
+
+
+
+
+
+
+
+
+


-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
-
-


-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
+
+
-
-
-
-
+
+
+
+
+

-
-
+
+

-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+

-
-
+
+

-
-
+
-
-
-
-
+
-
-
-
-
+







(define (tasks:hostinfo-get-interface   vec)    (vector-ref  vec 1))
(define (tasks:hostinfo-get-port        vec)    (vector-ref  vec 2))
(define (tasks:hostinfo-get-pubport     vec)    (vector-ref  vec 3))
(define (tasks:hostinfo-get-transport   vec)    (vector-ref  vec 4))
(define (tasks:hostinfo-get-pid         vec)    (vector-ref  vec 5))
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

;; state: 'live, 'shutting-down, 'dead
(define (tasks:server-register mdb pid interface port priority state transport #!key (pubport -1))
  (debug:print-info 11 "tasks:server-register " pid " " interface " " port " " priority " " state)
(define (tasks:server-lock-slot mdb run-id)
  (let ((res '())
	(best #f))
    (tasks:server-clean-out-old-records-for-run-id mdb run-id)
    (tasks:server-set-available mdb run-id)
    (thread-sleep! 2) ;; Try removing this. It may not be needed.
    (tasks:server-am-i-the-server? mdb run-id)))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
   "INSERT OR REPLACE INTO servers (pid,hostname,port,pubport,start_time,priority,state,mt_version,heartbeat,interface,transport)
                             VALUES(?,  ?,       ?,   ?,  strftime('%s','now'), ?, ?, ?, strftime('%s','now'),?,?);"
   pid (get-host-name) port pubport priority (conc state) 
   (common:version-signature)
   interface 
   (conc transport))
   "INSERT INTO servers (pid,hostname,port,pubport,start_time,      priority,state,mt_version,heartbeat,   interface,transport,run_id)
                   VALUES(?, ?,       ?,   ?, strftime('%s','now'), ?,       ?,    ?, strftime('%s','now'),?,        ?,        ?);"
   (current-process-id)       ;; pid
   (get-host-name)            ;; hostname
   -1                         ;; port
   -1                         ;; pubport
   (random 1000)              ;; priority (used a tiebreaker on get-available)
   "available"                ;; state
   (common:version-signature) ;; mt_version
   -1                         ;; interface
   -1                         ;; transport
  (vector 
   (tasks:server-get-server-id mdb (get-host-name) interface port pid)
   interface
   port
   run-id
   pubport
   transport
   ))

;; NB// two servers with same pid on different hosts will be removed from the list if pid: is used!
(define (tasks:server-deregister mdb hostname #!key (port #f)(pid #f)(action 'delete))
  (debug:print-info 11 "server-deregister " hostname ", port " port ", pid " pid)
  (if *db-write-access*
      (if pid
	  (case action
(define (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state='available' AND (strftime('%s','now') - start_time) > 30 AND run_id=?;" run-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state='running'   AND (strftime('%s','now') - heartbeat)  > 10 AND run_id=?;" run-id))
  

(define (tasks:server-set-state! mdb server-id state)
	    ((delete)(sqlite3:execute mdb "DELETE FROM servers WHERE pid=?;" pid))
	    (else    (sqlite3:execute mdb "UPDATE servers SET state='dead' WHERE pid=?;" pid)))
	  (if port
  (sqlite3:execute mdb "UPDATE servers SET state=? WHERE id=?;" state server-id))

	      (case action
	    ((delete)(sqlite3:execute mdb "DELETE FROM servers WHERE (interface=? or hostname=?) AND port=?;" hostname hostname port))
	    (else    (sqlite3:execute mdb "UPDATE servers SET state='dead' WHERE (interface=? or hostname=?) AND port=?;" hostname hostname port)))
	      (debug:print 0 "ERROR: tasks:server-deregister called with neither pid nor port specified")))))
(define (tasks:server-delete-record! mdb server-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE id=?;" server-id))

(define (tasks:server-delete-records-for-this-pid mdb)
  (sqlite3:execute mdb "DELETE FROM servers WHERE hostname=? AND pid=?;" (get-host-name) (current-process-id)))

(define (tasks:server-deregister-self mdb hostname)
  (tasks:server-deregister mdb hostname pid: (current-process-id)))
(define (tasks:server-set-interface-port mdb server-id interface port)
  (sqlite3:execute mdb "UPDATE servers SET interface=?,port=? WHERE id=?;" interface port server-id))

;; need a simple call for robustly removing records given host and port
(define (tasks:server-delete mdb hostname port)
  (tasks:server-deregister mdb hostname port: port action: 'delete))

(define (tasks:server-get-server-id mdb hostname iface port pid)
(define (tasks:server-am-i-the-server? mdb run-id)
  (let* ((all    (tasks:server-get-servers-vying-for-run-id mdb run-id))
	 (first  (if (null? all)
		     (begin (debug:print 0 "ERROR: no servers listed, should be at least one by now.") 
			    (sqlite3:finalize! mdb)
			    (exit 1))
		     (car (db:get-rows all))))
	 (header   (db:get-header all))
	 (id       (db:get-value-by-header first header "id"))
	 (hostname (db:get-value-by-header first header "hostname"))
	 (pid      (db:get-value-by-header first header "pid"))
	 (priority (db:get-value-by-header first header "priority")))
    (debug:print 0 "INFO: am-i-the-server got record " first)
    ;; for now a basic check. add tiebreaking by priority later
    (if (and (equal? hostname (get-host-name))
	     (equal? pid      (current-process-id)))
	id
	#f)))
	     
;; Use: (db:get-value-by-header (car (db:get-rows dat)) (db:get-header dat) "fieldname")
;;  to extract info from the structure returned
;;
(define (tasks:server-get-servers-vying-for-run-id mdb run-id)
  (debug:print-info 12 "tasks:server-get-server-id " mdb " " hostname " " iface " " port " " pid)
  (let ((res #f))
   (let* ((header (list "id" "hostname" "pid" "interface" "port" "pubport" "state" "run_id" "priority" "start_time"))
	  (selstr (string-intersperse header ","))
	  (res    '()))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     (lambda (a . b)
       (set! res (cons (apply vector a b) res)))
     mdb
     (cond
      ((and hostname  pid)  "SELECT id FROM servers WHERE hostname=?  AND pid=?;")
     (conc "SELECT " selstr " FROM servers WHERE run_id=? ORDER BY start_time DESC;")
      ((and iface     port) "SELECT id FROM servers WHERE interface=? AND port=?;")
      ((and hostname  port) "SELECT id FROM servers WHERE hostname=?  AND port=?;")
      (else
       (begin
     run-id)
	 (debug:print 0 "ERROR: tasks:server-get-server-id needs (hostname and pid) OR (iface and port) OR (hostname and port)")
	 "SELECT id FROM servers WHERE pid=-999;")))
     (if hostname hostname iface)(if pid pid port))
    res))
    (vector header res)))

(define (tasks:server-update-heartbeat mdb server-id)
  (debug:print-info 1 "Heart beat update of server id=" server-id)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: probable timeout on monitor.db access")
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205



206
207
208
209
210
211
212

213
214
215
216
217
218
219
220


221
222
223
224
225
226
227
228

229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
188
189
190
191
192
193
194






























195
196
197




198
199

200


201

202
203


204
205








206



























207
208
209
210
211
212
213







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
-
-
-
-


-
+
-
-

-


-
-
+
+
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







	 (heartbeat-delta 99e9))
    (sqlite3:for-each-row
     (lambda (delta)
       (set! heartbeat-delta delta))
     mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id)
    (< heartbeat-delta 10)))

(define (tasks:client-register mdb pid hostname cmdline)
  (sqlite3:execute
   mdb
   "INSERT OR REPLACE INTO clients (server_id,pid,hostname,cmdline,login_time) VALUES(?,?,?,?,strftime('%s','now'));")
  (tasks:server-get-server-id mdb hostname #f #f pid)
  pid hostname cmdline)

(define (tasks:client-logout mdb pid hostname cmdline)
  (sqlite3:execute
   mdb
   "UPDATE clients SET logout_time=strftime('%s','now') WHERE pid=? AND hostname=? AND cmdline=?;"
   pid hostname cmdline))

(define (tasks:get-logged-in-clients mdb server-id)
  (let ((res '()))
    (sqlite3:for-each-row 
     (lambda (id server-id pid hostname cmdline login-time logout-time)
       (set! res (cons (vector id server-id pid hostname cmdline login-time lougout-time) res)))
     mdb
     "SELECT id,server_id,pid,hostname,cmdline,login_time,logout_time FROM clients WHERE server_id=?;"
     server-id)))

(define (tasks:have-clients? mdb server-id)
  (null? (tasks:get-logged-in-clients mdb server-id)))

;; ping each server in the db and return first found that responds. 
;; remove any others. will not necessarily remove all!
(define (tasks:get-best-server mdb)
  (let ((res '())
	(best #f)
(define (tasks:get-server mdb run-id)
  (let ((res  #f)
	(best #f))
	(transport (if (and *transport-type*
			    (not (eq? *transport-type* 'fs)))
		       (conc *transport-type*)
		       "%")))
    (sqlite3:for-each-row
     (lambda (id interface port pubport transport pid hostname)
       (set! res (cons (vector id interface port pubport transport pid hostname) res))
       (set! res (vector id interface port pubport transport pid hostname)))
       ;;(debug:print-info 2 "Found existing server " hostname ":" port " registered in db"))
       )
     mdb
     
     "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers
          WHERE strftime('%s','now')-heartbeat < 10 
          AND mt_version=? AND transport LIKE ? 
          ORDER BY start_time DESC LIMIT 1;" (common:version-signature) transport)
          AND mt_version=? AND run_id=? AND state='running'
          ORDER BY start_time DESC LIMIT 1;" (common:version-signature) run-id)
    ;; for now we are keeping only one server registered in the db, return #f or first server found
    (if (null? res) #f (car res))))

;; BUG: This logic is probably needed unless methodology changes completely...
;;
;;     (if (null? res) #f
;; 	(let loop ((hed (car res))
;; 		   (tal (cdr res)))
    res))
;; 	  ;; (print "hed=" hed ", tal=" tal)
;; 	  (let* ((host     (list-ref hed 0))
;; 		 (iface    (list-ref hed 1))
;; 		 (port     (list-ref hed 2))
;; 		 (pid      (list-ref hed 4))
;; 		 (alive    (open-run-close tasks:server-alive? tasks:open-db #f hostname: host port: port)))
;; 	    (if alive
;; 		(begin
;; 		  (debug:print-info 2 "Found an existing, alive, server " host ", " port ".")
;; 		  (list host iface port))
;; 		(begin
;; 		  (debug:print-info 1 "Marking " host ":" port " as dead in server registry.")
;; 		  (if port
;; 		      (open-run-close tasks:server-deregister tasks:open-db host port: port)
;; 		      (open-run-close tasks:server-deregister tasks:open-db host pid:  pid))
;; 		  (if (null? tal)
;; 		      #f
;; 		      (loop (car tal)(cdr tal))))))))))

(define (tasks:remove-server-records mdb)
  (sqlite3:execute mdb "DELETE FROM servers;"))

(define (tasks:mark-server hostname port pid state transport)
  (if port
      (open-run-close tasks:server-deregister tasks:open-db hostname port: port)
      (open-run-close tasks:server-deregister tasks:open-db hostname pid:  pid)))


(define (tasks:kill-server status hostname port pid transport)
  (debug:print-info 1 "Removing defunct server record for " hostname ":" port)
  (if port
      (open-run-close tasks:server-deregister tasks:open-db hostname port: port)
      (open-run-close tasks:server-deregister tasks:open-db hostname pid:  pid))
  (if status ;; #t means alive
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
236
237
238
239
240
241
242












243
244
245
246
247
248
249







-
-
-
-
-
-
-
-
-
-
-
-







		(begin
		  (debug:print-info 1 "Sending signal/term to " pid " on " hostname)
		  (process-signal pid signal/term)  ;; local machine, send sig term
		  (thread-sleep! 5)                 ;; give it five seconds to die peacefully then do a brutal kill
		  (process-signal pid signal/kill)) 
		(debug:print 0 "WARNING: Can't kill frozen server on remote host " hostname))))))



(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport)
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport) res)))
     mdb
     "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport FROM servers ORDER BY start_time DESC;")
    res))
       

;;======================================================================
;; Tasks and Task monitors
;;======================================================================


;;======================================================================
;; Tasks

Modified tests/Makefile from [7520ced67a] to [f03136dea1].

1
2
3
4
5
6
7
8
9
10
11
12
13










14
15
16
17
18
19
20
21
22
23
24
25
26
27

28
29
30
31
32
33
34
1
2
3










4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

27
28
29
30
31
32
33
34



-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+













-
+







#
# run some tests

BINPATH=$(shell readlink -m $(PWD)/../bin)
MEGATEST=$(BINPATH)/megatest
DASHBOARD=$(BINPATH)/dashboard
PATH := $(BINPATH):$(PATH)
RUNNAME := $(shell date +w%V.%u.%H.%M)
IPADDR := "-"
# Set SERVER to "-server -"
SERVER  = 
DEBUG   = 1
LOGGING = 
BINPATH   = $(shell readlink -m $(PWD)/../bin)
MEGATEST  = $(BINPATH)/megatest
DASHBOARD = $(BINPATH)/dashboard
PATH     := $(BINPATH):$(PATH)
RUNNAME  := $(shell date +w%V.%u.%H.%M)
IPADDR   := "-"
RUNID    := 1
SERVER    = 
DEBUG     = 1
LOGGING   = 

OS  = $(shell grep ID /etc/*-release|cut -d= -f2)
FS  = $(shell df -T .|tail -1|awk '{print $$2}')
VER = $(shell fsl info|grep checkout|awk '{print $$2}'|cut -c 1-5)

# The NEWTARGET causes some tests to fail. Do not use until this is fixed.
NEWTARGET  = "$(OS)/$(FS)/$(VER)"
TARGET     = "ubuntu/nfs/none"

all : test1 test2 test3 test4 test5 test6 test7 test8 test9

server :
	cd ..;make;make install
	cd fullrun;../../bin/megatest -server - -debug 22
	cd fullrun;../../bin/megatest -server - -debug 22 -run-id $(RUNID)

stopserver :
	cd ..;make && make install
	cd fullrun;$(MEGATEST) -stop-server 0

repl :
	cd ..;make && make install

Added tests/watch-monitor.sh version [408ccfb929].









1
2
3
4
5
6
7
8
+
+
+
+
+
+
+
+
#!/bin/bash

sqlite3 fullrun/db/monitor.db << EOF
.header on
.mode column
select * from servers;
.q
EOF