Megatest

Changes On Branch v1.80
Login

Changes In Branch v1.80 Excluding Merge-Ins

This is equivalent to a diff from 60b31fb56a to 17856ae5a8

2024-06-05
18:38
merged fork Leaf check-in: 17856ae5a8 user: mmgraham tags: v1.80
18:36
Restored homehost functions to 1.80. Set the homehost when starting dashboard or megatest -run. Abort if an attempt is made to start a server on a non-homehost. check-in: 3019408957 user: mmgraham tags: v1.80
2024-05-15
00:57
Added CSCOPTS entry used for debugging. check-in: 22593531c9 user: matt tags: v1.80
2023-10-20
05:12
Merged fork

This node ran run-core-tests.sh through kill-rerun - a pretty good result. Dashboard comes up quickly also. check-in: e607892c7d user: mrwellan tags: v1.80

04:57
Merged fork check-in: 53900a0d02 user: mrwellan tags: v1.80-start-all
2023-10-19
18:55
changed a debug msg to level2, increased delay from 0.5 to 2 secs Leaf check-in: 60b31fb56a user: mmgraham tags: v1.80-processes
16:09
moved make-tmpdir-name into commonmod check-in: 1624c400a9 user: mmgraham tags: v1.80-processes

Modified Makefile from [8cdb8c3755] to [42f7351e93].

22
23
24
25
26
27
28

29
30
31
32
33
34
35

recent-commits.csv : .fslckout
	fossil timeline -n 350 -t ci -F "%h,%a,%b,%t,\"%c\"" > recent-commits.csv


SHELL=/bin/bash
PREFIX=$(PWD)

CSCOPTS=
INSTALL=install
SRCFILES = common.scm items.scm launch.scm ods.scm runconfig.scm	\
           server.scm configf.scm db.scm keys.scm		\
           process.scm runs.scm tasks.scm tests.scm genexample.scm	\
           tdb.scm mt.scm	\
           ezsteps.scm rmt.scm api.scm		\







>







22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

recent-commits.csv : .fslckout
	fossil timeline -n 350 -t ci -F "%h,%a,%b,%t,\"%c\"" > recent-commits.csv


SHELL=/bin/bash
PREFIX=$(PWD)
# CSCOPTS=-lfa2 -specialize -inline-global
CSCOPTS=
INSTALL=install
SRCFILES = common.scm items.scm launch.scm ods.scm runconfig.scm	\
           server.scm configf.scm db.scm keys.scm		\
           process.scm runs.scm tasks.scm tests.scm genexample.scm	\
           tdb.scm mt.scm	\
           ezsteps.scm rmt.scm api.scm		\

Modified api.scm from [5fa313076b] to [800ec32af5].

236
237
238
239
240
241
242
243












244

245
246













247
248



























249
250
251
252
253
254
255
  (set! *api-threads* (filter (lambda (thdat)
				(not (member (thread-state (car thdat)) '(terminated dead))))
			      *api-threads*)))

(define (api:get-count-threads-alive)
  (length *api-threads*))
  













;; indat is (cmd run-id params meta)

;;
;; WARNING: Do not print anything in the lambda of this function as it













;;          reads/writes to current in/out port
;;



























(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params)
  (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
  (if (not *server-signature*)
      (set! *server-signature* (tt:mk-signature *toppath*)))
  (lambda (indat)
    (api:register-thread (current-thread))
    (let* (;; (indat      (deserialize))







|
>
>
>
>
>
>
>
>
>
>
>
>
|
>

|
>
>
>
>
>
>
>
>
>
>
>
>
>
|

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
  (set! *api-threads* (filter (lambda (thdat)
				(not (member (thread-state (car thdat)) '(terminated dead))))
			      *api-threads*)))

(define (api:get-count-threads-alive)
  (length *api-threads*))
  
;; ==========================================================================================================================================
;; api:tcp-dispatch-request-make-handler is a complex TCP request handler that manages server load, dispatches requests, and ensures that the 
;; server's state is consistent with the incoming requests. It is designed to be used in a concurrent environment where multiple requests are 
;; being handled simultaneously.
;;
;; It processes incoming requests and dispatches them accordingly. 
;; The function takes a dbstruct argument, which is a structure representing the database.
;;
;; 1. The function asserts that global variable `*toppath*` is set. 
;;
;; 2. It checks if `*server-signature*` is not set, and if so, it sets it using the `tt:mk-signature` function with `*toppath*` as an argument. 
;;    The `*server-signature*` is used to identify the server instance.
;;
;; 3. The function returns a lambda function that takes `indat` as an argument. indat is (cmd run-id params meta) This lambda is the actual 
;;    request handler that will be called with the incoming data.
;;
;; 4. Inside the lambda, the current thread is registered with `api:register-thread`.
;;
;; 5. Several local variables are initialized:
;;    - `newcount`: A counter for the number of requests being processed.
;;    - `numthreads`: The number of alive threads handling requests.
;;    - `delay-wait`: A calculated delay based on the number of requests.
;;
;; 6. A `normal-proc` lambda is defined to handle the incoming command (`cmd`), `run-id`, and `params`. It uses a `case` statement to handle 
;;    different commands. If the command is "ping", it returns the server signature. Otherwise, it dispatches the request using 
;;    `api:dispatch-request`.
;;
;; 7. The function updates the `*api-process-request-count*` and `*db-last-access*` global variables.
;;
;; 8. It checks if the number of requests (`newcount`) does not match the number of threads (`numthreads`) and performs cleanup and debugging 
;;    if necessary.
;;
;; 9. The `match` expression is used to destructure `indat` into its components (`cmd`, `run-id`, `params`, `meta`).
;;
;; 10. Several local variables are set based on the destructured data and the current server state:
;;     - `db-ok`: Checks if the database file name matches the expected one for the given `run-id`.
;;     - `ttdat`: Retrieves server information.
;;     - `server-state`: Gets the current state of the server.
;;     - `status`: Determines the server's load status based on `newcount`.
;;     - `errmsg`: Generates an error message based on the server's status.
;;     - `result`: Processes the command based on the server's status.
;;
;; 11. The `meta` variable is updated with additional information based on the command.
;;
;; 12. The `payload` is constructed, which includes the status, error message, result, and meta information.
;;
;; 13. The `*api-process-request-count*` is decremented, as the request has been processed.
;;
;; 14. The current thread is unregistered with `api:unregister-thread`.
;;
;; 15. Finally, the `payload` is returned, which would be the response to the incoming request.
;;
;; Nothing should be printed within the lambda because it interacts with the current input/output ports, which could interfere with the 
;; request/response flow.
;;
;; The `else` clause at the end of the `match` expression asserts a fatal error if `indat` cannot be deserialized, indicating that the incoming 
;; data is not in the expected format.
;;

(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params)
  (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
  (if (not *server-signature*)
      (set! *server-signature* (tt:mk-signature *toppath*)))
  (lambda (indat)
    (api:register-thread (current-thread))
    (let* (;; (indat      (deserialize))
369
370
371
372
373
374
375

376

377
378
379
380
381
382
383
    ((dec-var)                      (apply db:dec-var dbstruct params))
    ((del-var)                      (apply db:del-var dbstruct params))
    ((add-var)                      (apply db:add-var dbstruct params))

    ((insert-run)                   (apply db:insert-run dbstruct params))

    ;; STEPS

    ((teststep-set-status!)         (apply db:teststep-set-status! dbstruct params))

    ((delete-steps-for-test!)       (apply db:delete-steps-for-test! dbstruct params))
    
    ;; TEST DATA
    ((test-data-rollup)             (apply db:test-data-rollup dbstruct params))
    ((csv->test-data)               (apply db:csv->test-data dbstruct params))

    ;; MISC







>
|
>







422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
    ((dec-var)                      (apply db:dec-var dbstruct params))
    ((del-var)                      (apply db:del-var dbstruct params))
    ((add-var)                      (apply db:add-var dbstruct params))

    ((insert-run)                   (apply db:insert-run dbstruct params))

    ;; STEPS
    ((teststep-set-status!)
     ;; (apply db:teststep-set-status! dbstruct params))
     (db:add-cached-write dbstruct db:teststep-set-status! run-id params))
    ((delete-steps-for-test!)       (apply db:delete-steps-for-test! dbstruct params))
    
    ;; TEST DATA
    ((test-data-rollup)             (apply db:test-data-rollup dbstruct params))
    ((csv->test-data)               (apply db:csv->test-data dbstruct params))

    ;; MISC

Modified common.scm from [0854266963] to [49835e4bfe].

41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
	rmtmod
	(prefix mtargs args:))

(include "common_records.scm")

(define (remove-files filespec)
  (let ((files (glob filespec)))
    (for-each delete-file files)))

(define (stop-the-train)
  (thread-start! (make-thread (lambda ()
				(let loop ()
				  (if (and *toppath*
					   (file-exists? (conc *toppath*"/stop-the-train")))
				      (let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately")))







|







41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
	rmtmod
	(prefix mtargs args:))

(include "common_records.scm")

(define (remove-files filespec)
  (let ((files (glob filespec)))
    (for-each delete-file* files)))

(define (stop-the-train)
  (thread-start! (make-thread (lambda ()
				(let loop ()
				  (if (and *toppath*
					   (file-exists? (conc *toppath*"/stop-the-train")))
				      (let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately")))
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
;; 
(defstruct remote

  ;; transport to be used
  ;; http              - use http-transport
  ;; http-read-cached  - use http-transport for writes but in-mem cached for reads
  (rmode            'http)
  (hh-dat            (let ((res (or (server:choose-server *toppath* 'homehost)
				    (cons #f #f))))
		       (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res))
		       res))
  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
  (server-id         #f)
  (server-info       #f) ;; (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (connect-time      (current-seconds)) ;; when we first connected
  (last-access       (current-seconds)) ;; last time we talked to server
  ;; (conndat           #f) ;; iface port api-uri api-url api-req seconds server-id







|
<
<
<







300
301
302
303
304
305
306
307



308
309
310
311
312
313
314
;; 
(defstruct remote

  ;; transport to be used
  ;; http              - use http-transport
  ;; http-read-cached  - use http-transport for writes but in-mem cached for reads
  (rmode            'http)
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )



  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
  (server-id         #f)
  (server-info       #f) ;; (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (connect-time      (current-seconds)) ;; when we first connected
  (last-access       (current-seconds)) ;; last time we talked to server
  ;; (conndat           #f) ;; iface port api-uri api-url api-req seconds server-id
1290
1291
1292
1293
1294
1295
1296































































1297
1298
1299
1300
1301
1302
1303
      #f))

;;======================================================================
;; logic for getting homehost. Returns (host . at-home)
;; IF *toppath* is not set, wait up to five seconds trying every two seconds
;; (this is to accomodate the watchdog)
;;
































































;;======================================================================
;; do we honor the caches of the config files?
;;
(define (common:use-cache?)
  (let ((res #t)) ;; priority by order of evaluation
    (if *configdat* ;; sillyness here. can't use setup/use-cache to know if we can use the cached files!







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
      #f))

;;======================================================================
;; logic for getting homehost. Returns (host . at-home)
;; IF *toppath* is not set, wait up to five seconds trying every two seconds
;; (this is to accomodate the watchdog)
;;
(define (common:get-homehost #!key (trynum 5))
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)
   ((not *toppath*)
    (mutex-unlock! *homehost-mutex*)
    (launch:setup) ;; safely mutexed now
    (if (> trynum 0)
	(begin
	  (thread-sleep! 2)
	  (common:get-homehost trynum: (- trynum 1)))
	#f))
   (else
    (let* ((currhost (get-host-name))
	   (bestadrs (server:get-best-guess-address currhost))
	   ;; first look in config, then look in file .homehost, create it if not found
	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
			 (handle-exceptions
			     exn
			     (if (> trynum 0)
				 (let ((delay-time (* (- 5 trynum) 5)))
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying "
						delay-time " seconds and trying again, message: "  ((condition-property-accessor 'exn 'message) exn)
						", exn=" exn)
				   (thread-sleep! delay-time)
				   (common:get-homehost trynum: (- trynum 1)))
				 (begin
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)
						"] Failed to read .homehost file after trying five times. Giving up and exiting, message: "
						((condition-property-accessor 'exn 'message) exn))
				   (exit 1)))
			   (let ((hhf (conc *toppath* "/.homehost")))
			     (if (common:file-exists? hhf)
				 (with-input-from-file hhf read-line)
				 (if (file-write-access? *toppath*)
				     (begin
                                       (debug:print 0 *default-log-port* "No .homehost file found. Setting it to the current machine")
				       (with-output-to-file hhf
					 (lambda ()
					   (print bestadrs)))
				       (begin
					 (mutex-unlock! *homehost-mutex*)
					 (car (common:get-homehost))))
				     #f))))))
	   (at-home  (or (equal? homehost currhost)
			 (equal? homehost bestadrs))))
      (set! *home-host* (cons homehost at-home))
      (mutex-unlock! *homehost-mutex*)
      *home-host*))))

;;======================================================================
;; am I on the homehost?
;;
(define (common:on-homehost?)
  (let ((hh (common:get-homehost)))
    (if hh
	(cdr hh)
	#f)))

;;======================================================================
;; do we honor the caches of the config files?
;;
(define (common:use-cache?)
  (let ((res #t)) ;; priority by order of evaluation
    (if *configdat* ;; sillyness here. can't use setup/use-cache to know if we can use the cached files!

Modified dashboard.scm from [92015a98e3] to [e2e9d0eb23].

70
71
72
73
74
75
76

77
78
79
80
81
82
83
(include "megatest-fossil-hash.scm")
(include "vg_records.scm")

;; set some parameters here - these need to be put in something that can be loaded from other
;; executables such as dashboard and mtutil
;;
(include "dashboard-transport-mode.scm")

(dbfile:db-init-proc db:initialize-main-db)
(set! rmtmod:send-receive rmt:send-receive)

(define help (conc 
	      "Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version 
              " license GPL, Copyright (C) Matt Welland 2012-2017








>







70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
(include "megatest-fossil-hash.scm")
(include "vg_records.scm")

;; set some parameters here - these need to be put in something that can be loaded from other
;; executables such as dashboard and mtutil
;;
(include "dashboard-transport-mode.scm")

(dbfile:db-init-proc db:initialize-main-db)
(set! rmtmod:send-receive rmt:send-receive)

(define help (conc 
	      "Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version 
              " license GPL, Copyright (C) Matt Welland 2012-2017

116
117
118
119
120
121
122
123
124

125
126
127
128
129
130
131
132
			)
		 args:arg-hash
		 0))

(if (args:get-arg "-mode")
    (let* ((mode (string->symbol (args:get-arg "-mode"))))
      (rmt:transport-mode mode))
    (rmt:transport-mode 'tcp))


(if (args:get-arg "-test") ;; need to use tcp for test control panel
    (rmt:transport-mode 'tcp))

;; RA => Might require revert for filters 
;; create a watch dog to move changes from lt/.db/*.db to megatest.db
;;
;;;(if (file-write-access? (conc *toppath* "/megatest.db"))
;;(debug:print-info 13 *default-log-port* "Before common:watchdog spawn")







<
|
>
|







117
118
119
120
121
122
123

124
125
126
127
128
129
130
131
132
133
			)
		 args:arg-hash
		 0))

(if (args:get-arg "-mode")
    (let* ((mode (string->symbol (args:get-arg "-mode"))))
      (rmt:transport-mode mode))

)

(if (args:get-arg "-test") ;; need to use tcp for test control panel -- Why? - Martin
    (rmt:transport-mode 'tcp))

;; RA => Might require revert for filters 
;; create a watch dog to move changes from lt/.db/*.db to megatest.db
;;
;;;(if (file-write-access? (conc *toppath* "/megatest.db"))
;;(debug:print-info 13 *default-log-port* "Before common:watchdog spawn")
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
   exn
   (begin
     (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: "
		  ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn)
     (current-seconds)) ;; something went wrong - just print an error and return current-seconds
   (common:max (map (lambda (filen)
		      (file-modification-time filen))
		    (glob (conc dbdir "/*.db*"))))))

(define (dashboard:monitor-changed? commondat tabdat)
  (let* ((run-update-time (current-seconds))
	 (monitor-db-path (dboard:tabdat-monitor-db-path tabdat))
	 (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path))
			      (file-modification-time monitor-db-path)
			      -1)))







|







3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
   exn
   (begin
     (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: "
		  ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn)
     (current-seconds)) ;; something went wrong - just print an error and return current-seconds
   (common:max (map (lambda (filen)
		      (file-modification-time filen))
		    (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db")))))))

(define (dashboard:monitor-changed? commondat tabdat)
  (let* ((run-update-time (current-seconds))
	 (monitor-db-path (dboard:tabdat-monitor-db-path tabdat))
	 (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path))
			      (file-modification-time monitor-db-path)
			      -1)))
3826
3827
3828
3829
3830
3831
3832

3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849



3850
3851
3852
3853
3854
3855
3856
  ;; (print "Starting dashboard main")
    
  (let* ((mtdb-path (conc *toppath* "/.mtdb/main.db"))
         (target (args:get-arg "-target"))
         (commondat       (dboard:commondat-make)))
    (if target
        (begin

          (args:remove-arg-from-ht "-target")
          (dboard:commondat-target-set! commondat target)
        )
    )

    (if (not (launch:setup))
        (begin
          (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") 
          (exit 1)
        )
    )

    #;(if (not (common:on-homehost?))
    (begin
      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost))
      (debug:print 0 *default-log-port* "It will be slower.")
      ))





    (if (and (common:file-exists? mtdb-path)
	     (file-write-access? mtdb-path))
	(if (not (args:get-arg "-skip-version-check"))
            (common:exit-on-version-changed)))








>
|











|

|

<
>
>
>







3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850

3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
  ;; (print "Starting dashboard main")
    
  (let* ((mtdb-path (conc *toppath* "/.mtdb/main.db"))
         (target (args:get-arg "-target"))
         (commondat       (dboard:commondat-make)))
    (if target
        (begin
          (hash-table-delete! args:arg-hash "-target")  ;; workaround for the following commented out function
          ;; (args:remove-arg-from-ht "-target")   This function is in mtargs/mtargs.scm, but it's in an egg that is not in the current build of chicken 4.
          (dboard:commondat-target-set! commondat target)
        )
    )

    (if (not (launch:setup))
        (begin
          (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") 
          (exit 1)
        )
    )

    (if (not (common:on-homehost?))
    (begin
      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (car (common:get-homehost)))
      (debug:print 0 *default-log-port* "It will be slower.")

      )
      (debug:print 0 *default-log-port* "Dashboard started on the homehost: " (car (common:get-homehost)))
    )


    (if (and (common:file-exists? mtdb-path)
	     (file-write-access? mtdb-path))
	(if (not (args:get-arg "-skip-version-check"))
            (common:exit-on-version-changed)))

Modified db.scm from [b1837f1312] to [0a367c507f].

2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
     (sqlite3:execute 
      db
      "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);"
      test-id teststep-name state-in status-in (current-seconds)
      (if comment comment "")
      (if logfile logfile "")))))



(define (db:delete-steps-for-test! dbstruct run-id test-id)
  ;; TODO: figure out why status is the key field rather than state (note:  CONSTRAINT test_steps_constraint UNIQUE (test_id,stepname,state) )
  (db:with-db
   dbstruct
   run-id
   #t
   (lambda (dbdat db)







<
<







2737
2738
2739
2740
2741
2742
2743


2744
2745
2746
2747
2748
2749
2750
     (sqlite3:execute 
      db
      "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);"
      test-id teststep-name state-in status-in (current-seconds)
      (if comment comment "")
      (if logfile logfile "")))))



(define (db:delete-steps-for-test! dbstruct run-id test-id)
  ;; TODO: figure out why status is the key field rather than state (note:  CONSTRAINT test_steps_constraint UNIQUE (test_id,stepname,state) )
  (db:with-db
   dbstruct
   run-id
   #t
   (lambda (dbdat db)
4245
4246
4247
4248
4249
4250
4251














































4252
4253
4254
4255
4256
4257
4258
	 (begin
	   (debug:print 0 *default-log-port* "WARNING: path given, " outputfile " is relative, prefixing with current directory")
	   (conc (current-directory) "/" outputfile)))
     results)
    ;; brutal clean up
    (dbfile:add-dbdat dbstruct #f dbdat)
    (system "rm -rf tempdir")))















































;; (db:extract-ods-file db "outputfile.ods" '(("sysname" "%")("fsname" "%")("datapath" "%")) "%")

;;======================================================================
;; moving watch dogs here due to dependencies
;;======================================================================








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
	 (begin
	   (debug:print 0 *default-log-port* "WARNING: path given, " outputfile " is relative, prefixing with current directory")
	   (conc (current-directory) "/" outputfile)))
     results)
    ;; brutal clean up
    (dbfile:add-dbdat dbstruct #f dbdat)
    (system "rm -rf tempdir")))

;;======================================================================
;; cached writes stuff
;;======================================================================

(define (db:add-cached-write dbstruct proc run-id params)
  (debug:print 0 *default-log-port* "Adding cached write for run-id "run-id" params " params)
  (mutex-lock! *cached-writes-mutex*)
  (let* ((hkey                (cons dbstruct run-id))
	 (cached-writes-queue (hash-table-ref/default *cached-writes-queues* hkey '())))
    (hash-table-set! *cached-writes-queues* hkey (cons (list proc params) cached-writes-queue)))
  (if (not *cached-writes-flag*)
      (begin
	(set! *cached-writes-flag* #t)
	(thread-start! (make-thread
			(lambda ()
			  (debug:print 0 *default-log-port* "process cached writes thread started.")
			  (thread-sleep! 1)
			  (db:process-cached-writes-queue))))))
  (mutex-unlock! *cached-writes-mutex*))

(define (db:process-cached-writes-queue)
  (mutex-lock! *cached-writes-mutex*)
  (hash-table-for-each
   *cached-writes-queues*
   (lambda (hkey writes-list)
     (let* ((dbstruct (car hkey))
	    (run-id   (cdr hkey)))
       (debug:print 0 *default-log-port* "Processing "(length writes-list)" cached writes for run "run-id)
       (db:with-db
	dbstruct
	run-id
	#t 
	(lambda (dbdat db)
	  (sqlite3:with-transaction
	   db
	   (lambda ()
	     (for-each
	      (lambda (queued-write)
		(match queued-write
		       ((proc params)(apply proc dbstruct params))
		       (else (assert #f "BAD queued-write"))))
	      writes-list)))
	  (hash-table-delete! *cached-writes-queues* hkey))))))
  (set! *cached-writes-flag* #f)
  (mutex-unlock! *cached-writes-mutex*))

;; (db:extract-ods-file db "outputfile.ods" '(("sysname" "%")("fsname" "%")("datapath" "%")) "%")

;;======================================================================
;; moving watch dogs here due to dependencies
;;======================================================================

Modified dbfile.scm from [b5eea0764a] to [e2ffa01a50].

444
445
446
447
448
449
450
451

452
453
454
455
456
457
458
				 (if sync-mode
				     (sqlite3:execute db (conc "PRAGMA synchronous = "sync-mode";")))
				 (if journal-mode
				     (sqlite3:execute db (conc "PRAGMA journal_mode = "journal-mode";")))
				 (if (and init-proc (or force-init
							(not db-exists)))
				     (init-proc db))
				 db)))

                            (begin
			      (if (file-exists? fname )
                                  (let ((db (sqlite3:open-database fname)))
				    ;; pragmas synchronous not needed because this db is used read-only
				    ;; (sqlite3:execute db (conc "PRAGMA synchronous = "mode";")
				    (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000)) ;; read-only but still need timeout
				    db )







|
>







444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
				 (if sync-mode
				     (sqlite3:execute db (conc "PRAGMA synchronous = "sync-mode";")))
				 (if journal-mode
				     (sqlite3:execute db (conc "PRAGMA journal_mode = "journal-mode";")))
				 (if (and init-proc (or force-init
							(not db-exists)))
				     (init-proc db))
				 db))
                             expire-time: 30)
                            (begin
			      (if (file-exists? fname )
                                  (let ((db (sqlite3:open-database fname)))
				    ;; pragmas synchronous not needed because this db is used read-only
				    ;; (sqlite3:execute db (conc "PRAGMA synchronous = "mode";")
				    (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000)) ;; read-only but still need timeout
				    db )
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
	  (if (file-exists? fname)
	      (handle-exceptions exn
                #f 
                (with-input-from-file fname
	  	  (lambda ()
		    (equal? key-string (read-line)))))
              (begin
                 (dbfile:print-err "dbfile:simple-file-lock created " fname " but it was gone 3 seconds later")
	         #f
              )
          )
       )
    )
  )
)







|







1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
	  (if (file-exists? fname)
	      (handle-exceptions exn
                #f 
                (with-input-from-file fname
	  	  (lambda ()
		    (equal? key-string (read-line)))))
              (begin
                 (dbfile:print-err "dbfile:simple-file-lock created " fname " but it was gone 0.25 seconds later")
	         #f
              )
          )
       )
    )
  )
)
1577
1578
1579
1580
1581
1582
1583
1584




1585






	 (result      (or stmth
			  (let* ((newstmth (sqlite3:prepare db stmt)))
			    ;; (db:hoh-set! stmt-cache db stmt newstmth)
			    (hash-table-set! stmt-cache stmt newstmth)
			    newstmth))))
    (mutex-unlock! *get-cache-stmth-mutex*)
    result))





)














>
>
>
>
|
>
>
>
>
>
>
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
	 (result      (or stmth
			  (let* ((newstmth (sqlite3:prepare db stmt)))
			    ;; (db:hoh-set! stmt-cache db stmt newstmth)
			    (hash-table-set! stmt-cache stmt newstmth)
			    newstmth))))
    (mutex-unlock! *get-cache-stmth-mutex*)
    result))

;;======================================================================
;; cached writes - run list of procs inside transaction
;;   NOTE: this only works because we have once database per process
;;======================================================================

(define *cached-writes-mutex* (make-mutex))
(define *cached-writes-flag*  #f)
(define *cached-writes-queues* (make-hash-table)) ;; dbstruct->list of writes


)

Modified dbmod.scm from [9f0ce614a3] to [fcf26556ab].

75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
;; called in rmt.scm nfs-transport-handler
(define (dbmod:nfs-get-dbstruct run-id keys init-proc areapath #!key (tmpadj ""))
  (assert areapath "FATAL: dbmod:nfs-get-dbstruct called without areapath set.")
  (let* ((dbfname  (dbmod:run-id->dbfname run-id))
	 (dbstruct (hash-table-ref/default *dbmod:nfs-db-handles* dbfname #f)))
    (if dbstruct
	dbstruct
	(let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'fromdisk tmpadj: tmpadj)))
	  (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct)
	  newdbstruct))))

;;======================================================================
;; The cachedb one-db file per server method goes in here
;;======================================================================








|







75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
;; called in rmt.scm nfs-transport-handler
(define (dbmod:nfs-get-dbstruct run-id keys init-proc areapath #!key (tmpadj ""))
  (assert areapath "FATAL: dbmod:nfs-get-dbstruct called without areapath set.")
  (let* ((dbfname  (dbmod:run-id->dbfname run-id))
	 (dbstruct (hash-table-ref/default *dbmod:nfs-db-handles* dbfname #f)))
    (if dbstruct
	dbstruct
	(let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'todisk)))
	  (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct)
	  newdbstruct))))

;;======================================================================
;; The cachedb one-db file per server method goes in here
;;======================================================================

117
118
119
120
121
122
123
124

125

126
127
128
129
130
131
132
			      (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, will try "count" more times.")
			      (thread-sleep! 1)
			      (loop (- count 1)))
			    (begin
			      (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, giving up.")
			      (exit 1))))
		   (exn ()
			(dbfile:print-err exn "ERROR: Unknown error with database for run-id "run-id", message: "

					  ((condition-property-accessor 'exn 'message) exn))

			(exit 2))))))
      (if use-mutex (mutex-unlock! *db-with-db-mutex*))
      res)))

(define (db:with-db dbstruct run-id w/r proc . params)
  (dbmod:with-db dbstruct run-id w/r proc params))








|
>
|
>







117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
			      (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, will try "count" more times.")
			      (thread-sleep! 1)
			      (loop (- count 1)))
			    (begin
			      (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, giving up.")
			      (exit 1))))
		   (exn ()
			(dbfile:print-err exn "ERROR: Unknown error with db for run-id "
					  run-id", message: "
					  ((condition-property-accessor 'exn 'message) exn)
					  ", details: "(condition->list exn))
			(exit 2))))))
      (if use-mutex (mutex-unlock! *db-with-db-mutex*))
      res)))

(define (db:with-db dbstruct run-id w/r proc . params)
  (dbmod:with-db dbstruct run-id w/r proc params))

518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
	  ;; for each table
	  ;;    insert into dest.<table> select * from src.<table> where last_update>last_update
	  ;; done
	  (debug:print 2 *default-log-port* "Attaching "destdbfile" as auxdb")
	  (handle-exceptions
	      exn
	      (begin
		(debug:print 0 "ATTACH failed, exiting. exn="(condition->list exn))
		(exit 1))
	    (sqlite3:execute dbh (conc "ATTACH '"destdbfile"' AS auxdb;")))
	  (for-each
	   (lambda (table)
	     (let* ((dummy (debug:print 2 *default-log-port* "Doing table " table))
                    (tbldat (alist-ref table tables equal?))
		    (fields (map car tbldat))







|







520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
	  ;; for each table
	  ;;    insert into dest.<table> select * from src.<table> where last_update>last_update
	  ;; done
	  (debug:print 2 *default-log-port* "Attaching "destdbfile" as auxdb")
	  (handle-exceptions
	      exn
	      (begin
		(debug:print 0 *default-log-port* "ATTACH failed, exiting. exn="(condition->list exn))
		(exit 1))
	    (sqlite3:execute dbh (conc "ATTACH '"destdbfile"' AS auxdb;")))
	  (for-each
	   (lambda (table)
	     (let* ((dummy (debug:print 2 *default-log-port* "Doing table " table))
                    (tbldat (alist-ref table tables equal?))
		    (fields (map car tbldat))

Modified launch.scm from [470997d4b0] to [9b7b7e2640].

339
340
341
342
343
344
345













































346
347
348
349
350
351
352
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
                        (or new-disk-free disk-free)
                        (if do-sync (current-seconds) last-sync)))))))
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional















































(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
    ;;(bb-check-path msg: "launch:execute incoming")
    (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed)







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
                        (or new-disk-free disk-free)
                        (if do-sync (current-seconds) last-sync)))))))
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional


;; read testconfig and create .logpro and script files
;;    - use #f for tconfigreg to re-read the testconfigs from disk
;;
(define (launch:extract-scripts-logpro  test-dir test-name item-path tconfigreg-in)
  (let* ((tconfigreg      (or tconfigreg-in
			      (tests:get-all)))
	 (tconfig-fname   (conc test-dir "/.testconfig"))
	 (tconfig-tmpfile (conc tconfig-fname ".tmp"))
	 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
	 (scripts         (configf:get-section tconfig "scripts"))
	 (logpros         (configf:get-section tconfig "logpro")))
    ;; create .testconfig file
    (configf:write-alist tconfig tconfig-tmpfile)
    (file-move tconfig-tmpfile tconfig-fname #t)
    (delete-file* ".final-status")
    
    ;; extract scripts from testconfig and write them to files in test run dir
    (for-each
     (lambda (scriptdat)
       (match scriptdat
	      ((name content)
	       (debug:print-info 2 *default-log-port* "Creating script "(current-directory)"/"name)
	       (with-output-to-file name
		 (lambda ()
		   (print content)))
	       (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)))
	      (else
	       (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
     scripts)

    ;; extract logpro from testconfig and write them to files in test run dir
    (for-each
     (lambda (logprodat)
       (match logprodat
	      ((name content)
	       (debug:print-info 2 *default-log-port* "Creating logpro file "(current-directory)"/"name)
	       (with-output-to-file name
		 (lambda ()
		   (print content)
		   ;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))
		   )))
	      (else
	       (debug:print-info 0 "Invalid logpro definiton found in [logpro] section of testconfig. \"" logprodat "\""))))
     logpros)))

(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
    ;;(bb-check-path msg: "launch:execute incoming")
    (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed)
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
	    (if (string-search tmppath " ")
		(debug:print 0 *default-log-port* "WARNING: spaces in PATH are not supported."))
	    (if mt-bindir-path (setenv "PATH" (conc tmppath":"mt-bindir-path))))
          ;;(bb-check-path msg: "launch:execute post block 4")
	  ;; (change-directory top-path)
	  ;; Can setup as client for server mode now
	  ;; (client:setup)

	  
	  ;; environment overrides are done *before* the remaining critical envars.
	  (alist->env-vars env-ovrd)
          ;;(bb-check-path msg: "launch:execute post block 41")
	  (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals)
          ;;(bb-check-path msg: "launch:execute post block 42")
	  (set-item-env-vars itemdat)
          ;;(bb-check-path msg: "launch:execute post block 43")







|
<







659
660
661
662
663
664
665
666

667
668
669
670
671
672
673
	    (if (string-search tmppath " ")
		(debug:print 0 *default-log-port* "WARNING: spaces in PATH are not supported."))
	    (if mt-bindir-path (setenv "PATH" (conc tmppath":"mt-bindir-path))))
          ;;(bb-check-path msg: "launch:execute post block 4")
	  ;; (change-directory top-path)
	  ;; Can setup as client for server mode now
	  ;; (client:setup)
  

	  ;; environment overrides are done *before* the remaining critical envars.
	  (alist->env-vars env-ovrd)
          ;;(bb-check-path msg: "launch:execute post block 41")
	  (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals)
          ;;(bb-check-path msg: "launch:execute post block 42")
	  (set-item-env-vars itemdat)
          ;;(bb-check-path msg: "launch:execute post block 43")
645
646
647
648
649
650
651

652
653
654
655
656
657
658
659
660
661

662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699








700
701
702
703
704
705
706

	  (if (args:get-arg "-xterm")
	      (set! fullrunscript "xterm")
	      (if (and fullrunscript 
		       (common:file-exists? fullrunscript)
		       (not (file-execute-access? fullrunscript)))
		  (system (conc "chmod ug+x " fullrunscript))))


	  ;; We are about to actually kick off the test
	  ;; so this is a good place to remove the records for 
	  ;; any previous runs
	  ;; (db:test-remove-steps db run-id testname itemdat)
	  ;; now is also a good time to write the .testconfig file
	  (let* ((tconfig-fname   (conc work-area "/.testconfig"))
		 (tconfig-tmpfile (conc tconfig-fname ".tmp"))
		 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
		 (scripts (configf:get-section tconfig "scripts")))

	    ;; create .testconfig file
	    (configf:write-alist tconfig tconfig-tmpfile)
	    (file-move tconfig-tmpfile tconfig-fname #t)
	    (delete-file* ".final-status")

	    ;; extract scripts from testconfig and write them to files in test run dir
	    (for-each
	     (lambda (scriptdat)
	       (match scriptdat
		      ((name content)
		       (with-output-to-file name
			 (lambda ()
			   (print content)
			   (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)))))
		      (else
		       (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
	     scripts))
	  ;;

	  (let* ((m            (make-mutex))
		 (kill-job?    #f)
		 (exit-info    (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status
		 (job-thread   #f)
		 ;; (keep-going   #t)
		 (misc-flags   (let ((ht (make-hash-table)))
				 (hash-table-set! ht 'keep-going #t)
				 ht))
		 (runit        (lambda ()
				 (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m)))
		 (monitorjob   (lambda ()
				 (launch:monitor-job  run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)))
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job"))
                 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t))
                 (propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code"))
                 (propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED"))
                 (test-status "not set")
                 )








	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)
	    (debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...")
            (debug:print-info 2 *default-log-port* "exit-info = " exit-info)
	    (hash-table-set! misc-flags 'keep-going #f)







>

|
|
|
|
|
|
|
|
|
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|



















|
<
>
>
>
>
>
>
>
>







689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744

745
746
747
748
749
750
751
752
753
754
755
756
757
758
759

	  (if (args:get-arg "-xterm")
	      (set! fullrunscript "xterm")
	      (if (and fullrunscript 
		       (common:file-exists? fullrunscript)
		       (not (file-execute-access? fullrunscript)))
		  (system (conc "chmod ug+x " fullrunscript))))
	  (launch:extract-scripts-logpro work-area test-name item-path tconfigreg)

;;;;;	  ;; We are about to actually kick off the test
;;;;;	  ;; so this is a good place to remove the records for 
;;;;;	  ;; any previous runs
;;;;;	  ;; (db:test-remove-steps db run-id testname itemdat)
;;;;;	  ;; now is also a good time to write the .testconfig file
;;;;;	  (let* ((tconfig-fname   (conc work-area "/.testconfig"))
;;;;;		 (tconfig-tmpfile (conc tconfig-fname ".tmp"))
;;;;;		 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
;;;;;		 (scripts         (configf:get-section tconfig "scripts"))
;;;;;		 (precmd          (configf:lookup tconfig )
;;;;;	    ;; create .testconfig file
;;;;;	    (configf:write-alist tconfig tconfig-tmpfile)
;;;;;	    (file-move tconfig-tmpfile tconfig-fname #t)
;;;;;	    (delete-file* ".final-status")
;;;;;
;;;;;	    ;; extract scripts from testconfig and write them to files in test run dir
;;;;;	    (for-each
;;;;;	     (lambda (scriptdat)
;;;;;	       (match scriptdat
;;;;;		      ((name content)
;;;;;		       (with-output-to-file name
;;;;;			 (lambda ()
;;;;;			   (print content)
;;;;;			   (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)))))
;;;;;		      (else
;;;;;		       (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
;;;;;	     scripts))
	  ;;

	  (let* ((m            (make-mutex))
		 (kill-job?    #f)
		 (exit-info    (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status
		 (job-thread   #f)
		 ;; (keep-going   #t)
		 (misc-flags   (let ((ht (make-hash-table)))
				 (hash-table-set! ht 'keep-going #t)
				 ht))
		 (runit        (lambda ()
				 (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m)))
		 (monitorjob   (lambda ()
				 (launch:monitor-job  run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)))
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job"))
                 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t))
                 (propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code"))
                 (propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED"))
                 (test-status     "not set")

		 (test-state      "not set")
		 (precmd          (configf:lookup tconfig "setup" "precmd"))
		 (postcmd         (configf:lookup tconfig "setup" "postcmd")))
	    ;; first, if set, run the precmd
	    (if precmd ;; (file-exists? precmd)(file-execute-access? precmd))
		(begin
		  ;; (save-environment-as-files "precmd-envt")
		  (system precmd))) ;; up to test author to put nbfake if desired.
	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)
	    (debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...")
            (debug:print-info 2 *default-log-port* "exit-info = " exit-info)
	    (hash-table-set! misc-flags 'keep-going #f)
755
756
757
758
759
760
761
762
763

764
765

766
767
768
769
770
771
772
773






774
775
776
777
778
779
780
	      (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) ;; end of let*

	    (mutex-unlock! m)
            (launch:end-of-run-check run-id )
	    (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " 
			 work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n")


            (set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id)))


            ;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1.


            (if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list))
               (begin
                (debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status) 
                (set! *globalexitstatus* 1)
               )
            )







	    (if (not (launch:einf-exit-status exit-info))
		(exit 4))))
        )))

;; Spec for End of test
;; At end of each test call, after marking self as COMPLETED do run-state-status-rollup
;; At transition to run COMPLETED/X do hooks







|
|
>


>








>
>
>
>
>
>







808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
	      (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) ;; end of let*

	    (mutex-unlock! m)
            (launch:end-of-run-check run-id )
	    (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " 
			 work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n")

	    (let* ((testrec  (rmt:get-testinfo-state-status run-id test-id)))
              (set! test-status (db:test-get-status testrec))
	      (set! test-state  (db:test-get-state  testrec)))

            ;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1.


            (if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list))
               (begin
                (debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status) 
                (set! *globalexitstatus* 1)
               )
            )

	    (if postcmd
		(begin
		  (setenv "MT_TEST_STATE" test-state)
		  (setenv "MT_TEST_STATUS" test-status)
		  ;; (save-environment-as-files "postcmd-envt")
		  (system postcmd)))
	    (if (not (launch:einf-exit-status exit-info))
		(exit 4))))
        )))

;; Spec for End of test
;; At end of each test call, after marking self as COMPLETED do run-state-status-rollup
;; At transition to run COMPLETED/X do hooks
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
		       runname
		       (common:file-exists? fulldir))
		  (let ((tmpfile  (conc fulldir "/.megatest.cfg." (current-seconds)))
			(targfile (conc fulldir "/.megatest.cfg-"  megatest-version "-" megatest-fossil-hash))
			(rconfig  (conc fulldir "/.runconfig." megatest-version "-" megatest-fossil-hash)))
		    (if (common:file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached
			(begin
			  (debug:print-info 0 *default-log-port* "Caching megatest.config in " tmpfile)
                          (if (not (common:in-running-test?))
                              (configf:write-alist *configdat* tmpfile))
			  (system (conc "ln -sf " tmpfile " " targfile))))
		    )))
	    (debug:print-info 1 *default-log-port* "No linktree yet, no caching configs.")))))









|







946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
		       runname
		       (common:file-exists? fulldir))
		  (let ((tmpfile  (conc fulldir "/.megatest.cfg." (current-seconds)))
			(targfile (conc fulldir "/.megatest.cfg-"  megatest-version "-" megatest-fossil-hash))
			(rconfig  (conc fulldir "/.runconfig." megatest-version "-" megatest-fossil-hash)))
		    (if (common:file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached
			(begin
			  (debug:print-info 2 *default-log-port* "Caching megatest.config in " tmpfile)
                          (if (not (common:in-running-test?))
                              (configf:write-alist *configdat* tmpfile))
			  (system (conc "ln -sf " tmpfile " " targfile))))
		    )))
	    (debug:print-info 1 *default-log-port* "No linktree yet, no caching configs.")))))


Modified megatest-version.scm from [5a374d2bf1] to [edb4f63b60].

16
17
18
19
20
21
22
23
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..

;; (declare (unit megatest-version))

(define megatest-version 1.8017)







|
16
17
18
19
20
21
22
23
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..

;; (declare (unit megatest-version))

(define megatest-version 1.8032)

Modified megatest.scm from [af8974dd23] to [bb25ae0786].

109
110
111
112
113
114
115



116



117
118
119
120
121
122
123
 ;;(lambda params (apply rmt:send-receive params))) ;; make send-receive available to rmtmod via parameter


;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file
;;
(let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc")))
  (if (common:file-exists? debugcontrolf)



      (load debugcontrolf)))




;; usage logging, careful with this, it is not designed to deal with all real world challenges!
;;
(if (and *usage-log-file*
         (file-write-access? *usage-log-file*))
    (with-output-to-file
        *usage-log-file*







>
>
>
|
>
>
>







109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
 ;;(lambda params (apply rmt:send-receive params))) ;; make send-receive available to rmtmod via parameter


;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file
;;
(let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc")))
  (if (common:file-exists? debugcontrolf)
    (begin
      ;; for some reason, debug:print does not work here. Had to use print.
      (print (conc "WARNING: loading " debugcontrolf))
      (load debugcontrolf)
    )
  )
)

;; usage logging, careful with this, it is not designed to deal with all real world challenges!
;;
(if (and *usage-log-file*
         (file-write-access? *usage-log-file*))
    (with-output-to-file
        *usage-log-file*
255
256
257
258
259
260
261

262
263
264
265
266
267
268
269
  -mark-incompletes       : find and mark incomplete tests
  -ping run-id|host:port  : ping server, exit with 0 if found
  -debug N|N,M,O...       : enable debug 0-N or N and M and O ...
  -debug-noprop N|M,M,O...: enable debug but do not propagate to subprocesses via MT_DEBUG
  -config fname           : override the megatest.config file with fname
  -append-config fname    : append fname to the megatest.config file
  -import-sexpr fname     : import a sexpr file (use -list-runs % -dumpmode sexpr to create)


Utilities
  -env2file fname         : write the environment to fname.csh and fname.sh
  -envcap a               : save current variables labeled as context 'a' in file envdat.db
  -envdelta a-b           : output enviroment delta from context a to context b to -o fname
                            set the output mode with -dumpmode csh, bash or ini
                            note: ini format will use calls to use curr and minimize path
  -refdb2dat refdb        : convert refdb to sexp or to format specified by s-dumpmode







>
|







261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
  -mark-incompletes       : find and mark incomplete tests
  -ping run-id|host:port  : ping server, exit with 0 if found
  -debug N|N,M,O...       : enable debug 0-N or N and M and O ...
  -debug-noprop N|M,M,O...: enable debug but do not propagate to subprocesses via MT_DEBUG
  -config fname           : override the megatest.config file with fname
  -append-config fname    : append fname to the megatest.config file
  -import-sexpr fname     : import a sexpr file (use -list-runs % -dumpmode sexpr to create)
  -regen-testfiles        : regenerate scripts and logpro files from testconfig, run in test context
  
Utilities
  -env2file fname         : write the environment to fname.csh and fname.sh
  -envcap a               : save current variables labeled as context 'a' in file envdat.db
  -envdelta a-b           : output enviroment delta from context a to context b to -o fname
                            set the output mode with -dumpmode csh, bash or ini
                            note: ini format will use calls to use curr and minimize path
  -refdb2dat refdb        : convert refdb to sexp or to format specified by s-dumpmode
464
465
466
467
468
469
470

471
472
473
474
475
476
477
                        "-run-wait"      ;; wait on a run to complete (i.e. no RUNNING)
			"-one-pass"      ;;
			"-local"         ;; run some commands using local db access
			"-generate-html"
			"-generate-html-structure" 
			"-list-run-time"
                        "-list-test-time"

			
			;; misc queries
			"-list-disks"
			"-list-targets"
			"-list-db-targets"
			"-show-runconfig"
			"-show-config"







>







471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
                        "-run-wait"      ;; wait on a run to complete (i.e. no RUNNING)
			"-one-pass"      ;;
			"-local"         ;; run some commands using local db access
			"-generate-html"
			"-generate-html-structure" 
			"-list-run-time"
                        "-list-test-time"
			"-regen-testfiles"
			
			;; misc queries
			"-list-disks"
			"-list-targets"
			"-list-db-targets"
			"-show-runconfig"
			"-show-config"
963
964
965
966
967
968
969





970
971
972
973
974
975
976
;; Server? Start up here.
;;
(if (args:get-arg "-server")
    (let* (;; (run-id     (args:get-arg "-run-id"))
	   (dbfname    (args:get-arg "-db"))
	   (tl         (launch:setup))
	   (keys       (keys:config-get-fields *configdat*)))





      (case (rmt:transport-mode)
	((tcp)
	 (let* ((timeout    (server:expiration-timeout)))
	   (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout)
	   (tt-server-timeout-param timeout)
	   (if dbfname
	       (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)







>
>
>
>
>







971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
;; Server? Start up here.
;;
(if (args:get-arg "-server")
    (let* (;; (run-id     (args:get-arg "-run-id"))
	   (dbfname    (args:get-arg "-db"))
	   (tl         (launch:setup))
	   (keys       (keys:config-get-fields *configdat*)))
      (if (not (common:on-homehost?))
       (begin
        (debug:print 0 *default-log-port* "Attempt to start a server on a machine that is not the homehost. Aborting")
        (exit
        )))
      (case (rmt:transport-mode)
	((tcp)
	 (let* ((timeout    (server:expiration-timeout)))
	   (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout)
	   (tt-server-timeout-param timeout)
	   (if dbfname
	       (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
        (ttdat (make-tt areapath: *toppath*))
     )
     (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state")
     (for-each
        (lambda (dbfile)
          (let* (
            (dbfname (conc (pathname-file dbfile) ".db"))
            (sfiles   (tt:find-server *toppath* dbfname))
            )
            (for-each 
              (lambda (sfile)
                (let (
                  (sinfos (tt:get-server-info-sorted ttdat dbfname))
                  )
                  (for-each 
                     (lambda (sinfo)
                       (let* (
                         (db (list-ref sinfo 5))







<

<
<







1010
1011
1012
1013
1014
1015
1016

1017


1018
1019
1020
1021
1022
1023
1024
        (ttdat (make-tt areapath: *toppath*))
     )
     (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state")
     (for-each
        (lambda (dbfile)
          (let* (
            (dbfname (conc (pathname-file dbfile) ".db"))

            )


                (let (
                  (sinfos (tt:get-server-info-sorted ttdat dbfname))
                  )
                  (for-each 
                     (lambda (sinfo)
                       (let* (
                         (db (list-ref sinfo 5))
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
                         )
                         (format #t fmtstr db (conc host ":" port) pid age last-mod state)
                       )
                     )
                     sinfos
                  )
                ) 
              )
              sfiles
            )
          )
       )
       dbfiles
     )
     (set! *didsomething* #t)
     (exit)  
  )







<
<
<







1036
1037
1038
1039
1040
1041
1042



1043
1044
1045
1046
1047
1048
1049
                         )
                         (format #t fmtstr db (conc host ":" port) pid age last-mod state)
                       )
                     )
                     sinfos
                  )
                ) 



          )
       )
       dbfiles
     )
     (set! *didsomething* #t)
     (exit)  
  )
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
                         (age (seconds->hr-min-sec (- (current-seconds) (list-ref sinfo 2))))
                         (last-mod (seconds->string (list-ref sinfo 2)))
                         (killed (system (conc "ssh " host " kill " pid " > /dev/null")))
                         (dummy2 (sleep 1))
                         (state (if (> (system (conc "ssh " host " ps " pid " > /dev/null")) 0) "dead" "alive"))
                            )
                         (format #t fmtstr db (conc host ":" port) pid age last-mod state)
                         (system (conc "rm " sfile))
                       )
                     )
                     sinfos
                  )
                ) 
              )
              sfiles







|







1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
                         (age (seconds->hr-min-sec (- (current-seconds) (list-ref sinfo 2))))
                         (last-mod (seconds->string (list-ref sinfo 2)))
                         (killed (system (conc "ssh " host " kill " pid " > /dev/null")))
                         (dummy2 (sleep 1))
                         (state (if (> (system (conc "ssh " host " ps " pid " > /dev/null")) 0) "dead" "alive"))
                            )
                         (format #t fmtstr db (conc host ":" port) pid age last-mod state)
                         (delete-file* sfile)
                       )
                     )
                     sinfos
                  )
                ) 
              )
              sfiles
1966
1967
1968
1969
1970
1971
1972


1973
1974
1975
1976
1977
1978
1979
;; run all tests are are Not COMPLETED and PASS or CHECK
(if (or (args:get-arg "-runall")
	(args:get-arg "-run")
	(args:get-arg "-rerun-clean")
	(args:get-arg "-rerun-all")
	(args:get-arg "-runtests")
        (args:get-arg "-kill-rerun"))


    (let ((need-clean (or (args:get-arg "-rerun-clean")
                          (args:get-arg "-rerun-all")))
	  (orig-cmdline (string-intersperse (argv) " ")))
      (general-run-call 
       "-runall"
       "run all tests"
       (lambda (target runname keys keyvals)







>
>







1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
;; run all tests are are Not COMPLETED and PASS or CHECK
(if (or (args:get-arg "-runall")
	(args:get-arg "-run")
	(args:get-arg "-rerun-clean")
	(args:get-arg "-rerun-all")
	(args:get-arg "-runtests")
        (args:get-arg "-kill-rerun"))
  (begin  
    (common:get-homehost) ;; set the .homehost if it's not set.
    (let ((need-clean (or (args:get-arg "-rerun-clean")
                          (args:get-arg "-rerun-all")))
	  (orig-cmdline (string-intersperse (argv) " ")))
      (general-run-call 
       "-runall"
       "run all tests"
       (lambda (target runname keys keyvals)
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
						 (conc "runname " runname)
						 (conc "runname " (simple-run-runname spec))
						 orig-cmdline)))))
			     (debug:print 0 *default-log-port* "ORIG: " orig-cmdline)
			     (debug:print 0 *default-log-port* "NEW:  " newcmdline)
			     (system newcmdline)))
			 run-specs))
	     (handle-run-requests target runname keys keyvals need-clean))))))

;;======================================================================
;; run one test
;;======================================================================

;; 1. find the config file
;; 2. change to the test directory







|







2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
						 (conc "runname " runname)
						 (conc "runname " (simple-run-runname spec))
						 orig-cmdline)))))
			     (debug:print 0 *default-log-port* "ORIG: " orig-cmdline)
			     (debug:print 0 *default-log-port* "NEW:  " newcmdline)
			     (system newcmdline)))
			 run-specs))
	     (handle-run-requests target runname keys keyvals need-clean)))))))

;;======================================================================
;; run one test
;;======================================================================

;; 1. find the config file
;; 2. change to the test directory
2117
2118
2119
2120
2121
2122
2123















2124
2125
2126
2127
2128
2129
2130
	   (let* ((db       #f)
		  ;; DO NOT run remote
		  (paths    (tests:test-get-paths-matching keys target (args:get-arg "-test-files"))))
	     (for-each (lambda (path)
			 (print path))
		       paths))))))
















;;======================================================================
;; Archive tests
;;======================================================================
;; Archive tests matching target, runname, and testpatt
(if (equal? (args:get-arg "-archive") "replicate-db")
    (begin
          ;; check if source







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
	   (let* ((db       #f)
		  ;; DO NOT run remote
		  (paths    (tests:test-get-paths-matching keys target (args:get-arg "-test-files"))))
	     (for-each (lambda (path)
			 (print path))
		       paths))))))

;;======================================================================
;; Utils for test areas
;;======================================================================

(if (args:get-arg "-regen-testfiles")
    (if (getenv "MT_TEST_RUN_DIR")
	(begin
	  (launch:setup)
	  (change-directory (getenv "MT_TEST_RUN_DIR"))
	  (let* ((testname (getenv "MT_TEST_NAME"))
		 (itempath (getenv "MT_ITEMPATH")))
	    (launch:extract-scripts-logpro (getenv "MT_TEST_RUN_DIR") testname itempath #f))
	  (set! *didsomething* #t))
	(debug:print 0 *default-log-port* "ERROR: Must run -regen-testfiles in a test environment (i.e. test xterm from dashboard)")))
		 	  
;;======================================================================
;; Archive tests
;;======================================================================
;; Archive tests matching target, runname, and testpatt
(if (equal? (args:get-arg "-archive") "replicate-db")
    (begin
          ;; check if source

Modified mtargs/mtargs.scm from [09e4f74c98] to [c1d2bd2b3a].

18
19
20
21
22
23
24

25
26
27
28
29
30
31

(module mtargs
    (
     arg-hash
     get-arg
     get-arg-number
     get-arg-from

     get-args
     usage
     print-args
     any-defined?
     ) 

(import scheme) ;; gives us cond-expand in chicken-4







>







18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

(module mtargs
    (
     arg-hash
     get-arg
     get-arg-number
     get-arg-from
     remove-arg-from-ht
     get-args
     usage
     print-args
     any-defined?
     ) 

(import scheme) ;; gives us cond-expand in chicken-4
62
63
64
65
66
67
68




69
70
71
72
73
74
75

;; (define any any-defined?)

(define (get-arg-from ht arg . default)
  (if (null? default)
      (hash-table-ref/default ht arg #f)
      (hash-table-ref/default ht arg (car default))))





(define (get-args args params switches arg-hash num-needed)
  (let* ((numtargs (length args))
	 (adj-num-needed (if num-needed (+ num-needed 2) #f)))
    (if (< numtargs (if adj-num-needed adj-num-needed 2))
	(if (>= num-needed 1)
	    (usage "No arguments provided")







>
>
>
>







63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

;; (define any any-defined?)

(define (get-arg-from ht arg . default)
  (if (null? default)
      (hash-table-ref/default ht arg #f)
      (hash-table-ref/default ht arg (car default))))

(define (remove-arg-from-ht arg)
      (hash-table-delete! arg-hash arg)
)

(define (get-args args params switches arg-hash num-needed)
  (let* ((numtargs (length args))
	 (adj-num-needed (if num-needed (+ num-needed 2) #f)))
    (if (< numtargs (if adj-num-needed adj-num-needed 2))
	(if (>= num-needed 1)
	    (usage "No arguments provided")

Modified portlogger.scm from [3334cefb6f] to [9754766f61].

29
30
31
32
33
34
35
36

37
38
39
40
41
42
43
(import srfi-1 posix srfi-69 hostinfo dot-locking z3
	(srfi 18) extras s11n)
(import (prefix sqlite3 sqlite3:))
(import debugprint dbmod)
;; lsof -i

(define (portlogger:open-db fname)
  (let* ((avail    (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away

	 (exists   (file-exists? fname))
	 (db       (if avail 
		       (sqlite3:open-database fname)
		       (begin
			 (system (conc "rm -f " fname))
			 (sqlite3:open-database fname))))
	 (handler  (sqlite3:make-busy-timeout 136000))







|
>







29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
(import srfi-1 posix srfi-69 hostinfo dot-locking z3
	(srfi 18) extras s11n)
(import (prefix sqlite3 sqlite3:))
(import debugprint dbmod)
;; lsof -i

(define (portlogger:open-db fname)
  (let* (;; (avail    (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away
         (avail #t)
	 (exists   (file-exists? fname))
	 (db       (if avail 
		       (sqlite3:open-database fname)
		       (begin
			 (system (conc "rm -f " fname))
			 (sqlite3:open-database fname))))
	 (handler  (sqlite3:make-busy-timeout 136000))

Modified rmt.scm from [564930aec3] to [07e78d96d7].

70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
  (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")

  (if (not (eq? (rmt:transport-mode) 'nfs))
      (begin
	(if (> attemptnum 2)
	    (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
	
	(cond
	 ((> attemptnum 2) (thread-sleep! 0.05))
	 ((> attemptnum 10) (thread-sleep! 0.5))
	 ((> attemptnum 20) (thread-sleep! 1)))
	
	;; I'm turning this off, it may make sense to move it
	;; into http-transport-handler
	(if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
	    (begin
	      (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.")
	      (case (rmt:transport-mode)
		((http)
		 (server:run *toppath*)
		 (thread-sleep! 3))
		(else
		 (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server
		 ))))))
  
  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
	 (runremote     (or area-dat







|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







70
71
72
73
74
75
76
77























78
79
80
81
82
83
84

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
  (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
	























  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
	 (runremote     (or area-dat
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
      ((http)(http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode))
      ((tcp) (tcp-transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
      ((nfs) (nfs-transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
      )))

(define (nfs-transport-handler  runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
  (let* ((keys     (common:get-fields *configdat*))
	 (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard")))
    (api:dispatch-request dbstruct cmd run-id params)))
	 
(define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
  (if (not runremote)
      (let* ((newremote  (make-and-init-remote areapath)))
	(set! *runremote* newremote)
	(set! runremote newremote)))







|







92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
      ((http)(http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode))
      ((tcp) (tcp-transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
      ((nfs) (nfs-transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
      )))

(define (nfs-transport-handler  runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
  (let* ((keys     (common:get-fields *configdat*))
	 (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath)))
    (api:dispatch-request dbstruct cmd run-id params)))
	 
(define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
  (if (not runremote)
      (let* ((newremote  (make-and-init-remote areapath)))
	(set! *runremote* newremote)
	(set! runremote newremote)))

Modified rmtmod.scm from [c4f748fe17] to [d2f108f299].

85
86
87
88
89
90
91



92
93
94
95
96
97





98
99
100
101
102
103
104
105
106
107
108
109
110
111
112





113
114




115
116
117
118
119
120
121

(define (rmt:import-run target run-dat)
  (let* ((runname    (car run-dat))
	 (all-dat    (cdr run-dat))
	 (tests-data (alist-ref "data" all-dat equal?))
	 (run-meta   (alist-ref "meta" all-dat equal?))
	 (run-id     (rmt:insert-run target runname run-meta)))



    (for-each
     (lambda (test-dat)
       (let* ((test-id  (car test-dat))
	      (test-rec (cdr test-dat)))
	 (rmt:insert-test run-id test-rec)))
     tests-data)))






;; insert run if not there, return id either way
(define (rmt:insert-run target runname run-meta)
  ;; look for id, return if found
  (debug:print 0 *default-log-port* "Insert run: "target"/"runname)
  (let* ((runs (rmtmod:send-receive 'simple-get-runs #f
				    ;;    runpatt count offset target last-update)
				    (list runname #f    #f     target #f))))
    (if (null? runs)
	(rmtmod:send-receive 'insert-run #f (list target runname run-meta))
	(simple-run-id (car runs)))))

(define (rmt:insert-test run-id test-rec)
  (let* ((testname  (alist-ref "testname" test-rec equal?))
	 (item-path (alist-ref "item_path" test-rec equal?)))





    (debug:print 0 *default-log-port* "   Insert test in run "run-id": "testname"/"item-path)
    (rmtmod:send-receive 'insert-test run-id test-rec)))





;;======================================================================
;;  T E S T S
;;======================================================================

;; Just some syntatic sugar
(define (rmt:register-test run-id test-name item-path)







>
>
>
|
|
|

|
|
>
>
>
>
>














|
>
>
>
>
>
|
|
>
>
>
>







85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

(define (rmt:import-run target run-dat)
  (let* ((runname    (car run-dat))
	 (all-dat    (cdr run-dat))
	 (tests-data (alist-ref "data" all-dat equal?))
	 (run-meta   (alist-ref "meta" all-dat equal?))
	 (run-id     (rmt:insert-run target runname run-meta)))
    (if (list? tests-data)
      (begin
        (debug:print 0 *default-log-port* "import-run: inserting " (length tests-data) " tests")
      (for-each
        (lambda (test-dat)
          (let* ((test-id  (car test-dat))
	      (test-rec (cdr test-dat)))
	  (rmt:insert-test run-id test-rec)))
      tests-data)
      )
      (debug:print 0 *default-log-port* "import-run: run has no tests")
    )
  )
)

;; insert run if not there, return id either way
(define (rmt:insert-run target runname run-meta)
  ;; look for id, return if found
  (debug:print 0 *default-log-port* "Insert run: "target"/"runname)
  (let* ((runs (rmtmod:send-receive 'simple-get-runs #f
				    ;;    runpatt count offset target last-update)
				    (list runname #f    #f     target #f))))
    (if (null? runs)
	(rmtmod:send-receive 'insert-run #f (list target runname run-meta))
	(simple-run-id (car runs)))))

(define (rmt:insert-test run-id test-rec)
  (let* ((testname  (alist-ref "testname" test-rec equal?))
	 (item-path (alist-ref "item_path" test-rec equal?))
         (test-id (rmt:get-test-id run-id testname item-path))
         )
        (if test-id
          (debug:print 0 *default-log-port* "test "testname"/"item-path " already exists in run-id " run-id)
          (begin
            (debug:print 0 *default-log-port* "   Insert test in run "run-id": "testname"/"item-path)
            (rmtmod:send-receive 'insert-test run-id test-rec)
          )
        )
  )
)

;;======================================================================
;;  T E S T S
;;======================================================================

;; Just some syntatic sugar
(define (rmt:register-test run-id test-name item-path)

Modified runs.scm from [77337ff0b0] to [33d2ba21e8].

992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
	  (if (list? items-list)
	      (begin
		(if (null? items-list)
		    (let ((test-id   (rmt:get-test-id run-id test-name ""))
			  (num-items (rmt:test-toplevel-num-items run-id test-name)))
		      (if (and test-id
			       (not (> num-items 0)))
			  (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" "Failed to run due to failed prerequisites"))))
		(tests:testqueue-set-items! test-record items-list)
		(list hed tal reg reruns))
	      (begin
		(debug:print-error 0 *default-log-port* "The proc from reading the items table did not yield a list - please report this")
		(exit 1))))))

    ((and (null? fails)







|







992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
	  (if (list? items-list)
	      (begin
		(if (null? items-list)
		    (let ((test-id   (rmt:get-test-id run-id test-name ""))
			  (num-items (rmt:test-toplevel-num-items run-id test-name)))
		      (if (and test-id
			       (not (> num-items 0)))
			  (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails))))))
		(tests:testqueue-set-items! test-record items-list)
		(list hed tal reg reruns))
	      (begin
		(debug:print-error 0 *default-log-port* "The proc from reading the items table did not yield a list - please report this")
		(exit 1))))))

    ((and (null? fails)
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
			    (set! give-up #t)))
		      prereqstrs))

	(if (and give-up
		 (not (and (null? tal)(null? reg))))
	    (let ((trimmed-tal (mt:discard-blocked-tests run-id hed tal test-records))
		  (trimmed-reg (mt:discard-blocked-tests run-id hed reg test-records)))
	      (debug:print 1 *default-log-port* "WARNING: test " hed " has discarded prerequisites, removing it from the queue")

	      (let ((test-id (rmt:get-test-id run-id hed "")))
		(if test-id (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "PREQ_DISCARDED" "Failed to run due to discarded prerequisites")))
	      
	      (if (and (null? trimmed-tal)
		       (null? trimmed-reg))
		  #f
                  (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns)
                  ))
	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))

    ((and (null? fails) ;; have not-started tests, but unable to run them.  everything looks completed with no prospect of unsticking something that is stuck.  we should mark hed as moribund and exit or continue if there are more tests to consider
	   (null? prereq-fails)
	   (null? non-completed))
     (debug:print-info 4 *default-log-port* "cond branch - "  "ei-4")
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; 
	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
	    ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!!
	    ;; No runsdat, can't do this yet
	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
	    ;;
	    (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
	    (let ((test-id (rmt:get-test-id run-id hed "")))
	      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
            (runs:loop-values tal reg reglen regfull reruns)
            )))

    ((and 
       (or (not (null? fails))
	   (not (null? prereq-fails)))
       (member 'normal testmode))
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-5")
      (debug:print-info 1 *default-log-port* "test "  hed " (mode=" testmode ") has failed prerequisite(s); "
			(string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ")
			", removing it from to-do list")
      (let ((test-id (rmt:get-test-id run-id hed "")))
	(if test-id
	    (if (not (null? prereq-fails))
		(mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites")
		(mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL"      "Failed to run due to failed prerequisites"))))
      ;; (debug:print 4 *default-log-port*"BB> set PREQ_FAIL on "hed)
      ;; (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL"      "Failed to run due to failed prerequisites")))) ;; BB: this works, btu equivalent for itemwait mode does not work.
      (if (or (not (null? reg))(not (null? tal)))
	  (begin
	    (hash-table-set! test-registry hed 'CANNOTRUN)
            (runs:loop-values tal reg reglen regfull (cons hed reruns))
            )







|


|















|










|

|








|





|
|







1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
			    (set! give-up #t)))
		      prereqstrs))

	(if (and give-up
		 (not (and (null? tal)(null? reg))))
	    (let ((trimmed-tal (mt:discard-blocked-tests run-id hed tal test-records))
		  (trimmed-reg (mt:discard-blocked-tests run-id hed reg test-records)))
	      (debug:print 1 *default-log-port* "WARNING: test " hed " has discarded prerequisites: " prereqstrs ", removing it from the queue")

	      (let ((test-id (rmt:get-test-id run-id hed "")))
		(if test-id (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "PREQ_DISCARDED" (conc "Failed to run due to discarded prerequisites: " prereqstrs))))
	      
	      (if (and (null? trimmed-tal)
		       (null? trimmed-reg))
		  #f
                  (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns)
                  ))
	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))

    ((and (null? fails) ;; have not-started tests, but unable to run them.  everything looks completed with no prospect of unsticking something that is stuck.  we should mark hed as moribund and exit or continue if there are more tests to consider
	   (null? prereq-fails)
	   (null? non-completed))
     (debug:print-info 4 *default-log-port* "cond branch - "  "ei-4")
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 0 *default-log-port* "no fails in prerequisites (" (runs:pretty-string prereqs-not-met) ") for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; 
	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
	    ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!!
	    ;; No runsdat, can't do this yet
	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
	    ;;
	    (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
	    (debug:print-info 1 *default-log-port* "no fails in prerequisites (" (runs:pretty-string prereqs-not-met) ") for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
	    (let ((test-id (rmt:get-test-id run-id hed "")))
	      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" (conc "Prerequisites (" (runs:pretty-string prereqs-not-met) ") not seen running in a while."))))
            (runs:loop-values tal reg reglen regfull reruns)
            )))

    ((and 
       (or (not (null? fails))
	   (not (null? prereq-fails)))
       (member 'normal testmode))
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-5")
      (debug:print-info 1 *default-log-port* "test "  hed " (mode=" testmode ") has failed prerequisite(s): "
			(string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ")
			", removing it from to-do list")
      (let ((test-id (rmt:get-test-id run-id hed "")))
	(if test-id
	    (if (not (null? prereq-fails))
		(mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_DISCARDED" (conc "Failed to run due to prior failed prerequisites: "(runs:pretty-string prereq-fails)))
		(mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL"      (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails ))))))
      ;; (debug:print 4 *default-log-port*"BB> set PREQ_FAIL on "hed)
      ;; (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL"      "Failed to run due to failed prerequisites")))) ;; BB: this works, btu equivalent for itemwait mode does not work.
      (if (or (not (null? reg))(not (null? tal)))
	  (begin
	    (hash-table-set! test-registry hed 'CANNOTRUN)
            (runs:loop-values tal reg reglen regfull (cons hed reruns))
            )
1160
1161
1162
1163
1164
1165
1166

1167
1168
1169
1170
1171
1172
1173
	 (have-resources         (car run-limits-info))
	 (num-running            (list-ref run-limits-info 1))
	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
	 (max-concurrent-jobs    (list-ref run-limits-info 3))
	 (job-group-limit        (list-ref run-limits-info 4))
	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))

	 (fails                  (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs
				      (runs:calc-fails prereqs-not-met)
				      (begin
					(debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met)
					'())))
	 (non-completed           (filter (lambda (x)             ;; remove hed from not completed list, duh, of course it is not completed!
					    (not (equal? x hed)))







>







1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
	 (have-resources         (car run-limits-info))
	 (num-running            (list-ref run-limits-info 1))
	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
	 (max-concurrent-jobs    (list-ref run-limits-info 3))
	 (job-group-limit        (list-ref run-limits-info 4))
	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
         (prereqs-running        (runs:calc-prereqs-running prereqs-not-met))
	 (fails                  (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs
				      (runs:calc-fails prereqs-not-met)
				      (begin
					(debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met)
					'())))
	 (non-completed           (filter (lambda (x)             ;; remove hed from not completed list, duh, of course it is not completed!
					    (not (equal? x hed)))
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362

1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
	    
	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
	    (thread-sleep! 5)
	    (list (car newtal)(cdr newtal) reg reruns))
	  ;; the waiton is FAIL so no point in trying to run hed ever again
	  (begin
            (let ((my-test-id (rmt:get-test-id run-id test-name item-path)))
              (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))            
            (if (or (not (null? reg))(not (null? tal)))
                (if (vector? hed)
		  (begin
		    (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path
				 " from the launch list as it has prerequistes that are FAIL")
		    (let ((test-id (rmt:get-test-id run-id hed "")))
		      (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))
		    (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
		    ;; (thread-sleep! *global-delta*)
		    ;; This next is for the items

                    (if (not (null? fails))
                        ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f)
                        (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f) 
                        ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f)
                        (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) )
		    (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed)
		    (runs:loop-values tal reg reglen regfull reruns))
		  (let ((nth-try (hash-table-ref/default test-registry hed 0))) ;; hed not a vector...
                    (debug:print 2 *default-log-port* "nth-try("hed")="nth-try)
		    (cond
		     ((member "RUNNING" (map db:test-get-state prereqs-not-met))

		      (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60)
			  (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet."))
		      (thread-sleep! 0.1)
		      (runs:loop-values tal reg reglen regfull reruns))
		     ((or (not nth-try) ;; BB: condition on subsequent tries, condition below fires on first try 
			  (and (number? nth-try)
			       (< nth-try 2)))
		      (hash-table-set! test-registry hed (if (number? nth-try)
							     (+ nth-try 1)
							     0))
		      (if (runs:lownoise (conc "not removing test " hed) 60)
			  (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites"))
		      ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
		      (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
		      (runs:loop-values newtal reg reglen regfull reruns))
		     ((symbol? nth-try) ;; BB: 'done matches here in one case where prereq itemwait failed.  This is first "try"
		      (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW
			  (if (null? tal)
			      #f ;; yes, really
			      (list (car tal)(cdr tal) reg reruns))
			  (begin
			    (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60)
				(debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))
			    ;; was: (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)
                            (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f)
			    (hash-table-set! test-registry hed 'removed) ;; was 0
                            (if (not (and (null? reg) (null? tal)))
                                (runs:loop-values tal reg reglen regfull reruns)
                                #f))))
		     (else
		      (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60)
			  (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now."))
		      ;; (debug:print 0 *default-log-port* "         prereqs: " prereqs-not-met)
		      (hash-table-set! test-registry hed 'removed)
		      (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f)
		      ;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug.
		      (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "FAIL" #f) ;; treat as FAIL
		      (list (if (null? tal)(car newtal)(car tal))
			    tal







|




|

|





<
|
<






|
>
|
|









|










|








|







1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353

1354

1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
	    
	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
	    (thread-sleep! 5)
	    (list (car newtal)(cdr newtal) reg reruns))
	  ;; the waiton is FAIL so no point in trying to run hed ever again
	  (begin
            (let ((my-test-id (rmt:get-test-id run-id test-name item-path)))
              (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails))))            
            (if (or (not (null? reg))(not (null? tal)))
                (if (vector? hed)
		  (begin
		    (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path
				 " from the launch list as it has prerequistes that are FAIL: " (runs:pretty-string fails))
		    (let ((test-id (rmt:get-test-id run-id hed "")))
		      (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails)) )))
		    (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
		    ;; (thread-sleep! *global-delta*)
		    ;; This next is for the items

                    (if (not (null? fails))

                        (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" (runs:pretty-string fails)) 

                        (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) )
		    (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed)
		    (runs:loop-values tal reg reglen regfull reruns))
		  (let ((nth-try (hash-table-ref/default test-registry hed 0))) ;; hed not a vector...
                    (debug:print 2 *default-log-port* "nth-try("hed")="nth-try)
		    (cond
		     ;;((member "RUNNING" (map db:test-get-state prereqs-not-met))
                      ((> 0 (length prereqs-running))
		      (if (runs:lownoise (conc "possible RUNNING prerequisites " hed) 60)
			  (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites: " prereqs-running ", don't give up on it yet."))
		      (thread-sleep! 0.1)
		      (runs:loop-values tal reg reglen regfull reruns))
		     ((or (not nth-try) ;; BB: condition on subsequent tries, condition below fires on first try 
			  (and (number? nth-try)
			       (< nth-try 2)))
		      (hash-table-set! test-registry hed (if (number? nth-try)
							     (+ nth-try 1)
							     0))
		      (if (runs:lownoise (conc "not removing test " hed) 60)
			  (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites: "(runs:pretty-string fails)))
		      ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
		      (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
		      (runs:loop-values newtal reg reglen regfull reruns))
		     ((symbol? nth-try) ;; BB: 'done matches here in one case where prereq itemwait failed.  This is first "try"
		      (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW
			  (if (null? tal)
			      #f ;; yes, really
			      (list (car tal)(cdr tal) reg reruns))
			  (begin
			    (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60)
				(debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites (" (runs:pretty-string fails)") or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))
			    ;; was: (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)
                            (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f)
			    (hash-table-set! test-registry hed 'removed) ;; was 0
                            (if (not (and (null? reg) (null? tal)))
                                (runs:loop-values tal reg reglen regfull reruns)
                                #f))))
		     (else
		      (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60)
			  (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests: " (runs:pretty-string fails) "  and we've tried at least 10 times to run it. Giving up now."))
		      ;; (debug:print 0 *default-log-port* "         prereqs: " prereqs-not-met)
		      (hash-table-set! test-registry hed 'removed)
		      (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f)
		      ;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug.
		      (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "FAIL" #f) ;; treat as FAIL
		      (list (if (null? tal)(car newtal)(car tal))
			    tal
2001
2002
2003
2004
2005
2006
2007












2008
2009
2010
2011
2012
2013
2014
   (lambda (t)
     (or (not (vector? t))
	 (and (equal? "NOT_STARTED" (db:test-get-state t))
	      (member (db:test-get-status t)
		      '("n/a" "KEEP_TRYING")))
	 (and (equal? "RUNNING" (db:test-get-state t))))) ;; account for a test that is running
   prereqs-not-met))













(define (runs:pretty-string lst)
  (map (lambda (t)
	 (if (not (vector? t))
	     (conc t)
	     (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t))))
       lst))







>
>
>
>
>
>
>
>
>
>
>
>







2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
   (lambda (t)
     (or (not (vector? t))
	 (and (equal? "NOT_STARTED" (db:test-get-state t))
	      (member (db:test-get-status t)
		      '("n/a" "KEEP_TRYING")))
	 (and (equal? "RUNNING" (db:test-get-state t))))) ;; account for a test that is running
   prereqs-not-met))

(define (runs:calc-prereqs-running prereqs-not-met)
  (if (list? prereqs-not-met)
    (filter 
     (lambda (t)
       (or (not (vector? t))
         (member (db:test-get-state t) '("RUNNING" "LAUNCHED" "REMOTE_HOST_START"))
       ))
     prereqs-not-met)
    '()
  )
)

(define (runs:pretty-string lst)
  (map (lambda (t)
	 (if (not (vector? t))
	     (conc t)
	     (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t))))
       lst))
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
     (lambda (key)
       (let* ((idx (cadr key))
	      (fld (car  key))
	      (val (configf:lookup test-conf "test_meta" fld)))
	 ;; (debug:print 5 *default-log-port* "idx: " idx " fld: " fld " val: " val)
	 (if (and val (not (equal? (vector-ref currrecord idx) val)))
	     (begin
	       (debug:print 0 *default-log-port* "Updating " test-name " " fld " to " val)
	       (rmt:testmeta-update-field test-name fld val)))))
     '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10)))))

;; find tests with matching tags, tagpatt is a string "tagpatt1,tagpatt2%, ..."
;;
(define (runs:get-tests-matching-tags tagpatt)
  (let* ((tagdata (rmt:get-tests-tags))







|







2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
     (lambda (key)
       (let* ((idx (cadr key))
	      (fld (car  key))
	      (val (configf:lookup test-conf "test_meta" fld)))
	 ;; (debug:print 5 *default-log-port* "idx: " idx " fld: " fld " val: " val)
	 (if (and val (not (equal? (vector-ref currrecord idx) val)))
	     (begin
	       (debug:print 2 *default-log-port* "Updating " test-name " " fld " to " val)
	       (rmt:testmeta-update-field test-name fld val)))))
     '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10)))))

;; find tests with matching tags, tagpatt is a string "tagpatt1,tagpatt2%, ..."
;;
(define (runs:get-tests-matching-tags tagpatt)
  (let* ((tagdata (rmt:get-tests-tags))

Modified tcp-transportmod.scm from [cc561d90e9] to [27966bec9e].

127
128
129
130
131
132
133






134




















135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
       (equal? (dbfile:run-id->dbfname run-id) dbfname)))

(tcp-buffer-size 2048)
;; (max-connections 4096) 

;; do all the busy work of finding and setting up conn for
;; connecting to a server






;; 




















(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
  (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
  (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
	 (server-start-proc (lambda ()
			      (tt:server-process-run
			       (tt-areapath ttdat)
			       testsuite ;; (dbfile:testsuite-name)
			       (common:find-local-megatest)
			       run-id))))
    (if conn
	(begin 
          (debug:print-info 2 *default-log-port* "already connected to a server")
           conn) ;; we are already connected to the server
	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
	  (match sdat
	    ((host port start-time server-id pid dbfname2 servinffile)
	     (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
             (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
	     (let* ((host-port (conc host":"port))







>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


|









|







127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
       (equal? (dbfile:run-id->dbfname run-id) dbfname)))

(tcp-buffer-size 2048)
;; (max-connections 4096) 

;; do all the busy work of finding and setting up conn for
;; connecting to a server
;; This function, `tt:client-connect-to-server`, is designed to manage connections between a client and a server within a testing framework.
;; The function takes four arguments:
;; 1. `ttdat`: a data structure that holds information about the testing environment or connections.
;; 2. `dbfname`: The name of the database file that the client wants to connect to.
;; 3. `run-id`: An identifier for the current run of the test suite.
;; 4. `testsuite`: The test suite that is being run.
;;
;; Here's a step-by-step explanation of what the function does:
;;
;; 1. It first asserts that the `run-id` is valid for the given `dbfname` using the `tt:valid-run-id` function. If the `run-id` is not valid, it raises a fatal error.
;; 2. It prints debug information indicating that the function `tt:client-connect-to-server` has been called with the given `dbfname`.
;; 3. It attempts to retrieve an existing connection to the server from a hash table (`tt-conns`) using the `dbfname` as the key. If a connection already exists, it prints debug information and returns the existing connection.
;; 4. If no existing connection is found, it retrieves the current server information from the servinfo file, using the `tt:get-current-server-info` function.
;; 5. It uses pattern matching to destructure the server information into variables (`host`, `port`, `start-time`, `server-id`, `pid`, `dbfname2`, `servinffile`). It then asserts that the `dbfname` from the server info matches the one provided to the function.
;; 6. It constructs a connection object (`conn`) with the server information.
;; 7. It attempts to ping the server using `tt:timed-ping` to verify that the server is running and can be communicated with.
;; 8. Depending on the result of the ping:
;;    - If the server is running (`running`), it prints debug information, saves the connection in the hash table, and returns the connection.
;;    - If the server is starting (`starting`), it sleeps for 2 seconds and then recursively calls itself to retry the connection.
;;    - If the server is neither running nor starting, it checks if it's been more than 10 seconds since the last server start attempt. If so, it attempts to start the server using `server-start-proc` and then sleeps for 1 second before retrying the connection.
;; 9. If no server information is found (`else` case), it checks if it's been more than 3 seconds since the last server start attempt. If so, it starts a new server using `server-start-proc`, updates the last server start time, and sleeps for 4 seconds.
;; 10. It then sleeps for 1 second and prints debug information before recursively calling itself to retry the connection.
;;
;; The function uses recursion to keep trying to connect to the server, with various sleep intervals to prevent overwhelming the system with connection attempts or server starts.
;; It also uses a hash table to cache connections and avoid reconnecting to a server if a connection already exists.
;; The function is designed to handle different server states and ensure that a server is running and available before returning a valid connection to the caller.
;; 
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
  (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
  (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname)
  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
	 (server-start-proc (lambda ()
			      (tt:server-process-run
			       (tt-areapath ttdat)
			       testsuite ;; (dbfile:testsuite-name)
			       (common:find-local-megatest)
			       run-id))))
    (if conn
	(begin 
          (debug:print-info 2 *default-log-port* "already connected to a server for " dbfname)
           conn) ;; we are already connected to the server
	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
	  (match sdat
	    ((host port start-time server-id pid dbfname2 servinffile)
	     (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
             (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
	     (let* ((host-port (conc host":"port))
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
		      (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
	    (else ;; no good server found, if haven't started server in > 5 secs, start another
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
		 (begin
		   (debug:print-info 0 *default-log-port* "Starting server for "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))
                   (thread-sleep! 3)
                   ))
	     (thread-sleep! 1)
             (debug:print-info 0 *default-log-port* "Connect to server for " dbfname)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))

(define (tt:timed-ping host port server-id)
  (let* ((start-time (current-milliseconds))







|







213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
		      (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
	    (else ;; no good server found, if haven't started server in > 5 secs, start another
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
		 (begin
		   (debug:print-info 0 *default-log-port* "Starting server for "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))
                   (thread-sleep! 4)
                   ))
	     (thread-sleep! 1)
             (debug:print-info 0 *default-log-port* "Connect to server for " dbfname)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))

(define (tt:timed-ping host port server-id)
  (let* ((start-time (current-milliseconds))
230
231
232
233
234
235
236

237
238
239
240
241
242
243
;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
  (debug:print 2 *default-log-port* "tt:handler cmd: " cmd " run-id: " run-id " attemptnum: " attemptnum)
  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.

  (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
    (if conn
	;; have connection, call the server
	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
	  ;; res is (status errmsg result meta)
         ; (debug:print 0 *default-log-port* "conn:" conn " res: " res)
	  (match res







>







256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
  (debug:print 2 *default-log-port* "tt:handler cmd: " cmd " run-id: " run-id " attemptnum: " attemptnum)
  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
  ;; connect-to-server will start a server if needed.
  (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
    (if conn
	;; have connection, call the server
	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
	  ;; res is (status errmsg result meta)
         ; (debug:print 0 *default-log-port* "conn:" conn " res: " res)
	  (match res
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else ;; did not receive properly formated result
	     (if (not res) ;; tt:handler is telling us that communication failed
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
                        ;;(servinf (tt-conn-servinf-file conn))) 
			(servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath)
		   (hash-table-set! (tt-conns ttdat) dbfname #f)







|







285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else ;; did not receive properly formated result
	     (if (not res) ;; tt:send-receive telling us that communication failed
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
                        ;;(servinf (tt-conn-servinf-file conn))) 
			(servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath)
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
				     ;; start server - addressed in client-connect-to-server
				     ;; delay        - addressed in client-connect-to-server
				     ;; try again
				     (thread-sleep! 0.25) ;; dunno, I think this needs to be here
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
				   ))))
		       (begin ;; no server file, delay and try again
			 (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
			 (thread-sleep! 0.5)
			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
		 (begin ;; this case is where res is malformed. Probably should abort
		   (assert #f "FATAL: tt:handler received bad data "res)
		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
		   ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)
		   )))))







|







318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
				     ;; start server - addressed in client-connect-to-server
				     ;; delay        - addressed in client-connect-to-server
				     ;; try again
				     (thread-sleep! 0.25) ;; dunno, I think this needs to be here
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
				   ))))
		       (begin ;; no server file, delay and try again
			 (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ")
			 (thread-sleep! 0.5)
			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
		 (begin ;; this case is where res is malformed. Probably should abort
		   (assert #f "FATAL: tt:handler received bad data "res)
		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
		   ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)
		   )))))
474
475
476
477
478
479
480

481
482
483
484
485
486
487
488
(define (tt:start-server areapath run-id dbfname-in handler keys)
  (assert areapath "FATAL: areapath not provided for tt:start-server")
  ;; is there already a server for this dbfile? Then exit.
  (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in)
  (let* ((ttdat   (make-tt areapath: areapath))
	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
	 (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead

    (if (> (length servers) 4)
	(begin
	  (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
	  (exit))
	(let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
	  (tt-handler-set! ttdat (handler dbstruct))
	  (let* ((tcp-thread (make-thread
			      (lambda ()







>
|







501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
(define (tt:start-server areapath run-id dbfname-in handler keys)
  (assert areapath "FATAL: areapath not provided for tt:start-server")
  ;; is there already a server for this dbfile? Then exit.
  (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in)
  (let* ((ttdat   (make-tt areapath: areapath))
	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
	 (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
         (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname)
    (if (> (length servers) 0)
	(begin
	  (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
	  (exit))
	(let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
	  (tt-handler-set! ttdat (handler dbstruct))
	  (let* ((tcp-thread (make-thread
			      (lambda ()
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
	 (nosyncdbpath (conc areapath"/.mtdb"))
	 (cleanup (lambda ()
		    (if (tt-cleanup-proc ttdat)
			((tt-cleanup-proc ttdat)))
		    (dbfile:with-no-sync-db nosyncdbpath
					    (lambda (db)
					      (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
						(debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
						(db:no-sync-del! db dbfname)
						#;(if dbtmpname
						    (delete-file dbtmpname))))))))
    (set! *server-info* ttdat)
    (let loop ((count 0))
      (if (> count 240)
	  (begin
	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
	    (exit 1))
	  (if (not (tt-port ttdat)) ;; no connection yet







|

<
|







561
562
563
564
565
566
567
568
569

570
571
572
573
574
575
576
577
	 (nosyncdbpath (conc areapath"/.mtdb"))
	 (cleanup (lambda ()
		    (if (tt-cleanup-proc ttdat)
			((tt-cleanup-proc ttdat)))
		    (dbfile:with-no-sync-db nosyncdbpath
					    (lambda (db)
					      (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
						(debug:print-info 0 *default-log-port* "keep-running: removing lock for file "dbtmpname)
						(db:no-sync-del! db dbfname)

                                                ))))))
    (set! *server-info* ttdat)
    (let loop ((count 0))
      (if (> count 240)
	  (begin
	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
	    (exit 1))
	  (if (not (tt-port ttdat)) ;; no connection yet
583
584
585
586
587
588
589

590
591
592
593
594
595
596
						(debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
						#f))))))
			  (if (and res (common:low-noise-print 120 "top server message"))
			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
			  res))
		       (else

			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
			    ((host port startseconds server-id pid dbfname servinfofile)
			     (let* ((result  (tt:timed-ping host port server-id))
				    (res     (car result))
				    (ping    (cdr result)))







>







610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
						(debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
						#f))))))
			  (if (and res (common:low-noise-print 120 "top server message"))
			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
			  res))
		       (else
                        ;; wrong servinfo file
			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
			    ((host port startseconds server-id pid dbfname servinfofile)
			     (let* ((result  (tt:timed-ping host port server-id))
				    (res     (car result))
				    (ping    (cdr result)))
711
712
713
714
715
716
717
718
















719



720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
;; find valid server
;; get servers listed, last part of name must match :<dbfname>
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other :<dbfname> files
;;
(define (tt:find-server areapath dbfname)
  (let* ((servdir  (tt:get-servinfo-dir areapath))
	 (sfiles   (glob (conc servdir"/*:"dbfname))))
















    sfiles))




;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
;;
(define (tt:server-get-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))
			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))
       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))
	     (let ((mlst (string-match server-rx inl))
		   (dbprep (string-match dbprep-rx inl)))
	       (if dbprep (set! dbprep-found 1))
	       (if (not mlst)
		   (if (> lnum 500) ;; give up if more than 500 lines of server log read
		       bad-dat
		       (if (null? tail)
			   bad-dat
			   (loop (car tail)(cdr tail)(+ lnum 1))))
		   (match mlst ;; have a not null list







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>







<
<













|
<
<







739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773


774
775
776
777
778
779
780
781
782
783
784
785
786
787


788
789
790
791
792
793
794
;; find valid server
;; get servers listed, last part of name must match :<dbfname>
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other :<dbfname> files
;;
(define (tt:find-server areapath dbfname)
  (let* ((servdir  (tt:get-servinfo-dir areapath))
	 (sfiles   (glob (conc servdir"/*:"dbfname)))
         (good-files '()))
         (for-each 
           (lambda (sfile)
             (let* ((sinfo (tt:server-get-info sfile))
                 (host (list-ref sinfo 0))
                 (port (list-ref sinfo 1))
                 (server-id (list-ref sinfo 3))
                 (pid (list-ref sinfo 4))
                 (status (system (conc "ssh " host " ps " pid " > /dev/null")))
                 )
                (if (= status 0)
                  (set! good-files (cons sfile good-files))
                  (delete-file* sfile)
                )
             )
           )
           sfiles
         )
    (debug:print-info 2 *default-log-port* "tt:find-server: good-files: " good-files " sfiles: " sfiles)
    good-files))

;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
;;
(define (tt:server-get-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id


	(bad-dat      (list #f #f #f #f #f #f logf)))
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))
			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))
       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))
	     (let ((mlst (string-match server-rx inl)))


	       (if (not mlst)
		   (if (> lnum 500) ;; give up if more than 500 lines of server log read
		       bad-dat
		       (if (null? tail)
			   bad-dat
			   (loop (car tail)(cdr tail)(+ lnum 1))))
		   (match mlst ;; have a not null list
882
883
884
885
886
887
888


889


890
891
892
893
894
895
896
    (handle-exceptions
	exn
      (if (< port 65535)
	  (begin
	    (portlogger:open-run-close portlogger:set-failed port)
	    (thread-sleep! 0.25)
	    (setup-listener-portlogger uconn))


	  #f)


      (connect-listener uconn port))))

(define (connect-listener uconn port)
  ;; (tcp-listener-socket LISTENER)(socket-name so)
  ;; sockaddr-address, sockaddr-port, sockaddr->string
  (let* ((tlsn (tcp-listen port 10000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
	 (addr  (tt:get-best-guess-address (get-host-name)))) ;; (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))







>
>
|
>
>







925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
    (handle-exceptions
	exn
      (if (< port 65535)
	  (begin
	    (portlogger:open-run-close portlogger:set-failed port)
	    (thread-sleep! 0.25)
	    (setup-listener-portlogger uconn))
          (begin
            (assert #t "setup-listener-portlogger: could not get a port")
	    #f
          )
      )
      (connect-listener uconn port))))

(define (connect-listener uconn port)
  ;; (tcp-listener-socket LISTENER)(socket-name so)
  ;; sockaddr-address, sockaddr-port, sockaddr->string
  (let* ((tlsn (tcp-listen port 10000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
	 (addr  (tt:get-best-guess-address (get-host-name)))) ;; (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))

Modified tests.scm from [6fa611f761] to [f734d44190].

146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164


165
166
167
168
169
170
171
	(string-split global-waitons)
	'())))

;; return items given config
;;
(define (tests:get-items tconfig)
  (let ((items      (hash-table-ref/default tconfig "items" #f)) ;; items 4
	(itemstable (hash-table-ref/default tconfig "itemstable" #f))) 
    ;; if either items or items table is a proc return it so test running
    ;; process can know to call items:get-items-from-config
    ;; if either is a list and none is a proc go ahead and call get-items
    ;; otherwise return #f - this is not an iterated test
    (cond
     ((procedure? items)      
      (debug:print-info 4 *default-log-port* "items is a procedure, will calc later")
      items)            ;; calc later
     ((procedure? itemstable)
      (debug:print-info 4 *default-log-port* "itemstable is a procedure, will calc later")
      itemstable)       ;; calc later


     ((filter (lambda (x)
		(let ((val (car x)))
		  (if (procedure? val) val #f)))
	      (append (if (list? items) items '())
		      (if (list? itemstable) itemstable '())))
      'have-procedure)
     ((or (list? items)(list? itemstable)) ;; calc now







|











>
>







146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
	(string-split global-waitons)
	'())))

;; return items given config
;;
(define (tests:get-items tconfig)
  (let ((items      (hash-table-ref/default tconfig "items" #f)) ;; items 4
	(itemstable (hash-table-ref/default tconfig "itemstable" #f)))
    ;; if either items or items table is a proc return it so test running
    ;; process can know to call items:get-items-from-config
    ;; if either is a list and none is a proc go ahead and call get-items
    ;; otherwise return #f - this is not an iterated test
    (cond
     ((procedure? items)      
      (debug:print-info 4 *default-log-port* "items is a procedure, will calc later")
      items)            ;; calc later
     ((procedure? itemstable)
      (debug:print-info 4 *default-log-port* "itemstable is a procedure, will calc later")
      itemstable)       ;; calc later
     ((and (not items) (not itemstable))
      #f)
     ((filter (lambda (x)
		(let ((val (car x)))
		  (if (procedure? val) val #f)))
	      (append (if (list? items) items '())
		      (if (list? itemstable) itemstable '())))
      'have-procedure)
     ((or (list? items)(list? itemstable)) ;; calc now
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
		(if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data
		(if tcfg (hash-table-set! *testconfigs* test-full-name tcfg))
		(if (and testexists
			 cache-file
			 (file-write-access? cache-path)
			 allow-write-cache)
		    (let ((tpath (conc cache-path "/.testconfig")))
		      (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath)
                      (if (and tcfg (not (common:in-running-test?)))
                          (configf:write-alist tcfg tpath))))
		tcfg))))))
  
;; sort tests by priority and waiton
;; Move test specific stuff to a test unit FIXME one of these days
(define (tests:sort-by-priority-and-waiton test-records)







|







1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
		(if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data
		(if tcfg (hash-table-set! *testconfigs* test-full-name tcfg))
		(if (and testexists
			 cache-file
			 (file-write-access? cache-path)
			 allow-write-cache)
		    (let ((tpath (conc cache-path "/.testconfig")))
		      (debug:print-info 2 *default-log-port* "Caching testconfig for " test-name " in " tpath)
                      (if (and tcfg (not (common:in-running-test?)))
                          (configf:write-alist tcfg tpath))))
		tcfg))))))
  
;; sort tests by priority and waiton
;; Move test specific stuff to a test unit FIXME one of these days
(define (tests:sort-by-priority-and-waiton test-records)