Megatest

Diff
Login

Differences From Artifact [f3b05e9850]:

To Artifact [69229ba1dd]:


1
2

3
4
5
6
7
8
9
1

2
3
4
5
6
7
8
9

-
+








;; Copyright 2006-2012, Matthew Welland.
;; Copyright 2006-2013, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.
69
70
71
72
73
74
75

76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

























91
92
93
94

95
96

97
98
99
100
101
102
103
104
105
106
107
108
109










110
111
112
113
114
115







116
117
118
119
120
121
122
123
124
125
126
127
128
129
130

131
132
133
134

135
136
137
138
139






140
141
142
143
144
145
146
147
148
149
150
151
152

153
154
155
156
157










158
159
160
161
162
163
164
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

120
121

122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145






146
147
148
149
150
151
152
153
154






155
156
157




158

159
160

161
162
163
164


165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205







+















+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+



-
+

-
+













+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
+


-
-
-
-
-
-



-
-
-
-
+
-


-
+



-
-
+
+
+
+
+
+













+




-
+
+
+
+
+
+
+
+
+
+







	       (test-id   (assoc/default 'test-id   cmdinfo))
	       (target    (assoc/default 'target    cmdinfo))
	       (itemdat   (assoc/default 'itemdat   cmdinfo))
	       (env-ovrd  (assoc/default 'env-ovrd  cmdinfo))
	       (set-vars  (assoc/default 'set-vars  cmdinfo)) ;; pre-overrides from -setvar
	       (runname   (assoc/default 'runname   cmdinfo))
	       (megatest  (assoc/default 'megatest  cmdinfo))
	       (runtlim   (assoc/default 'runtlim   cmdinfo))
	       (mt-bindir-path (assoc/default 'mt-bindir-path cmdinfo))
	       (keys      #f)
	       (keyvals   #f)
	       (fullrunscript (if (not runscript)
                                  #f
                                  (if (substring-index "/" runscript)
                                      runscript ;; use unadultered if contains slashes
                                      (let ((fulln (conc testpath "/" runscript)))
	                                  (if (and (file-exists? fulln)
                                                   (file-execute-access? fulln))
                                              fulln
                                              runscript))))) ;; assume it is on the path
	       (rollup-status 0))
	  (change-directory top-path)
	  (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name))

	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
	  ;; one of these is defunct/redundant ...
	  (if (not (launch:setup-for-run force: #t))
	      (begin
		(debug:print 0 "Failed to setup, exiting") 
		;; (sqlite3:finalize! db)
		;; (sqlite3:finalize! tdb)
		(exit 1)))
	  (change-directory *toppath*) 
	  (let ((rconfig (full-runconfigs-read))) ;; (read-config (conc  *toppath* "/runconfigs.config") #f #t sections: (list "default" target))))
	    ;; (setup-env-defaults (conc *toppath* "/runconfigs.config") run-id (make-hash-table) keyvals target)
	    ;; (set-run-config-vars run-id keyvals target) ;; (db:get-target db run-id))
	    ;; Now have runconfigs data loaded, set environment vars
	    (for-each (lambda (section)
			(for-each (lambda (varval)
				    (let ((var (car varval))
					  (val (cadr varval)))
				      (if (and (string? var)(string? val))
					  (begin
					    (setenv var (config:eval-string-in-environment val))) ;; val)
					  (debug:print 0 "ERROR: bad variable spec, " var "=" val))))
				  (configf:get-section rconfig section)))
		      (list "default" target)))
	  (change-directory work-area) 
	  ;; Setup the *runremote* global var
	  (if *runremote* (debug:print 2 "ERROR: I'm not expecting *runremote* to be set at this time"))
	  ;; (set! *runremote* runremote)
	  (set! *transport-type* (string->symbol transport))
	  ;; (set! *transport-type* (string->symbol transport))
	  (set! keys       (cdb:remote-run db:get-keys #f))
	  (set! keyvals    (if run-id (cdb:remote-run db:get-key-vals #f run-id) #f))
	  (set! keyvals    (keys:target->keyval keys target))
	  ;; apply pre-overrides before other variables. The pre-override vars must not
	  ;; clobbers things from the official sources such as megatest.config and runconfigs.config
	  (if (string? set-vars)
	      (let ((varpairs (string-split set-vars ",")))
		(debug:print 4 "varpairs: " varpairs)
		(map (lambda (varpair)
		       (let ((varval (string-split varpair "=")))
			 (if (eq? (length varval) 2)
			     (let ((var (car varval))
				   (val (cadr varval)))
			       (debug:print 1 "Adding pre-var/val " var " = " val " to the environment")
			       (setenv var val)))))
		     varpairs)))
	  (for-each
	   (lambda (varval)
	     (let ((var (car varval))
		   (val (cadr varval)))
	       (if val
		   (setenv var val)
		   (begin
		     (debug:print 0 "ERROR: required variable " var " does not have a valid value. Exiting")
		     (exit)))))
	     (list 
	  (setenv "MT_TEST_RUN_DIR" work-area)
	  (setenv "MT_TEST_NAME" test-name)
	  (setenv "MT_ITEM_INFO" (conc itemdat))
	  (setenv "MT_RUNNAME"   runname)
	  (setenv "MT_MEGATEST"  megatest)
	  (setenv "MT_TARGET"    target)
	      (list  "MT_TEST_RUN_DIR" work-area)
	      (list  "MT_TEST_NAME" test-name)
	      (list  "MT_ITEM_INFO" (conc itemdat))
	      (list  "MT_RUNNAME"   runname)
	      (list  "MT_MEGATEST"  megatest)
	      (list  "MT_TARGET"    target)
	      (list  "MT_LINKTREE"  (configf:lookup *configdat* "setup" "linktree"))))
	  (if mt-bindir-path (setenv "PATH" (conc (getenv "PATH") ":" mt-bindir-path)))
	  ;; (change-directory top-path)
	  (if (not (setup-for-run))
	      (begin
		(debug:print 0 "Failed to setup, exiting") 
		;; (sqlite3:finalize! db)
		;; (sqlite3:finalize! tdb)
		(exit 1)))
	  ;; Can setup as client for server mode now
	  ;; (client:setup)

	  (change-directory *toppath*) 
	  (set-megatest-env-vars run-id) ;; these may be needed by the launching process
	  (change-directory work-area) 

	  
	  (set-run-config-vars run-id keys keyvals target) ;; (db:get-target db run-id))
	  ;; environment overrides are done *before* the remaining critical envars.
	  (alist->env-vars env-ovrd)
	  (set-megatest-env-vars run-id)
	  (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals)
	  (set-item-env-vars itemdat)
	  (save-environment-as-files "megatest")
	  ;; open-run-close not needed for test-set-meta-info
	  (tests:set-meta-info #f test-id run-id test-name itemdat 0 work-area)
	  (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f)
	  (tests:set-full-meta-info #f test-id run-id 0 work-area 10)

	  ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f)
	  (tests:test-force-state-status! test-id "REMOTEHOSTSTART" "n/a")
	  (thread-sleep! 0.3) ;; NFS slowness has caused grief here

	  (if (args:get-arg "-xterm")
	      (set! fullrunscript "xterm")
	      (if (and fullrunscript (not (file-execute-access? fullrunscript)))
		  (system (conc "chmod ug+x " fullrunscript))))
	  ;; We are about to actually kick off the test
	  ;; so this is a good place to remove the records for 
	  ;; any previous runs
	  ;; (db:test-remove-steps db run-id testname itemdat)
	  
	  (let* ((m            (make-mutex))
		 (kill-job?    #f)
		 (exit-info    (vector #t #t #t))
		 (job-thread   #f)
		 (keep-going   #t)
		 (runit        (lambda ()
				 ;; (let-values
				 ;;  (((pid exit-status exit-code)
				 ;;    (run-n-wait fullrunscript)))
				 (tests:test-set-status! test-id "RUNNING" "n/a" #f #f)
				 ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f)
				 ;; Since we should have a clean slate at this time there is no need to do 
				 ;; any of the other stuff that tests:test-set-status! does. Let's just 
				 ;; force RUNNING/n/a
				 

				 (thread-sleep! 0.3)
				 (tests:test-force-state-status! test-id "RUNNING" "n/a")
				 (thread-sleep! 0.3) ;; NFS slowness has caused grief here

				 ;; if there is a runscript do it first
				 (if fullrunscript
				     (let ((pid (process-run fullrunscript)))
				       (let loop ((i 0))
					 (let-values
					  (((pid-val exit-status exit-code) (process-wait pid #t)))
					  (mutex-lock! m)
173
174
175
176
177
178
179

180
181
182
183
184
185
186
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228







+







						(loop (+ i 1)))
					      )))))
				 ;; then, if runscript ran ok (or did not get called)
				 ;; do all the ezsteps (if any)
				 (if ezsteps
				     (let* ((testconfig (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here?
					    (ezstepslst (hash-table-ref/default testconfig "ezsteps" '())))
				       (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ...
				       (if (not (file-exists? ".ezsteps"))(create-directory ".ezsteps"))
				       ;; if ezsteps was defined then we are sure to have at least one step but check anyway
				       (if (not (> (length ezstepslst) 0))
					   (debug:print 0 "ERROR: ezsteps defined but ezstepslst is zero length")
					   (let loop ((ezstep (car ezstepslst))
						      (tal    (cdr ezstepslst))
						      (prevstep #f))
240
241
242
243
244
245
246
247







248
249
250
251
252
253
254
255

256
257
258
259

260
261
262


263
264
265
266
267
268
269
270
271
272
273
274
275
276

277
278
279








280
281

282
283
284



285
286




287
288
289
290
291
292
293
294
295
296
297
298
299
300
301

















302
303
304

305
306
307


308
309
310


311
312
313
314






315
316
317
318






319
320

321

322
323

324
325
326
327
328





329
330
331

332
333
334
335
336
337
338













339
340
341

342
343
344
345
346
347
348
349
350
351
352
353
354

355
356
357
358


359
360
361
362
363
364
365
366
367
368










369
370
371
372
373
374
375
282
283
284
285
286
287
288

289
290
291
292
293
294
295
296
297
298
299
300
301
302

303
304
305
306

307
308


309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327

328
329
330
331
332
333
334
335
336

337
338
339
340
341
342
343
344
345
346
347
348
349















350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368

369

370

371
372
373
374
375
376
377




378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396

397
398

399





400
401
402
403
404
405
406

407
408
409
410
411
412


413
414
415
416
417
418
419
420
421
422
423
424
425
426
427

428

429


430
431


432
433
434
435

436
437
438
439
440
441
442










443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459







-
+
+
+
+
+
+
+







-
+



-
+

-
-
+
+














+


-
+
+
+
+
+
+
+
+

-
+



+
+
+


+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


-
+
-

-
+
+



+
+
-
-
-
-
+
+
+
+
+
+




+
+
+
+
+
+


+
-
+

-
+
-
-
-
-
-
+
+
+
+
+


-
+





-
-
+
+
+
+
+
+
+
+
+
+
+
+
+


-
+
-

-
-


-
-




-
+




+
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+







									       ((eq? rollup-status 2) 'warn)
									       ((eq? rollup-status 0) 'pass)
									       (else 'fail)))
							    (next-status      (cond 
									       ((eq? overall-status 'pass) this-step-status)
									       ((eq? overall-status 'warn)
										(if (eq? this-step-status 'fail) 'fail 'warn))
									       (else 'fail))))
									       (else 'fail)))
							    (next-state       ;; "RUNNING") ;; WHY WAS THIS CHANGED TO NOT USE (null? tal) ??
							                       (cond
									       ((null? tal) ;; more to run?
									        "COMPLETED")
									       (else "RUNNING")))
							    )
						       (debug:print 4 "Exit value received: " (vector-ref exit-info 2) " logpro-used: " logpro-used 
								    " this-step-status: " this-step-status " overall-status: " overall-status 
								    " next-status: " next-status " rollup-status: " rollup-status)
						       (case next-status
							 ((warn)
							  (set! rollup-status 2)
							  ;; NB// test-set-status! does rdb calls under the hood
							  (tests:test-set-status! test-id "RUNNING" "WARN" 
							  (tests:test-set-status! test-id next-state "WARN" 
									  (if (eq? this-step-status 'warn) "Logpro warning found" #f)
									  #f))
							 ((pass)
							  (tests:test-set-status! test-id "RUNNING" "PASS" #f #f))
							  (tests:test-set-status! test-id next-state "PASS" #f #f))
							 (else ;; 'fail
							  (set! rollup-status 1) ;; force fail
							  (tests:test-set-status! test-id "RUNNING" "FAIL" (conc "Failed at step " stepname) #f)
							  (set! rollup-status 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" 
							  (tests:test-set-status! test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f)
							  ))))
						   (if (and (steprun-good? logpro-used (vector-ref exit-info 2))
							    (not (null? tal)))
						       (loop (car tal) (cdr tal) stepname)))
						 (debug:print 4 "WARNING: a prior step failed, stopping at " ezstep))))))))
		 (monitorjob   (lambda ()
				 (let* ((start-seconds (current-seconds))
					(calc-minutes  (lambda ()
							 (inexact->exact 
							  (round 
							   (- 
							    (current-seconds) 
							    start-seconds)))))
					(kill-tries 0))
				   (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
				   (let loop ((minutes   (calc-minutes)))
				     (begin
				       (set! kill-job? (test-get-kill-request test-id)) ;; run-id test-name itemdat))
				       (set! kill-job? (or (test-get-kill-request test-id) ;; run-id test-name itemdat))
							   (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
									       (time-exceeded (> run-seconds runtlim)))
									  (if time-exceeded
									      (begin
										(debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
										#t)
									      #f)))))
				       ;; open-run-close not needed for test-set-meta-info
				       (tests:set-meta-info #f test-id run-id test-name itemdat minutes work-area)
				       (tests:set-partial-meta-info #f test-id run-id minutes work-area 10)
				       (if kill-job? 
					   (begin
					     (mutex-lock! m)
					     ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
					     ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
					     ;;       between tries?
					     (let* ((pid (vector-ref exit-info 0)))
					       (if (number? pid)
						   (handle-exceptions
						    exn
						    (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
						    ;;(process-signal pid signal/kill))
						   (begin
						     (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
						     (let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
						       (for-each 
							(lambda (p)
							  (let* ((parts  (string-split p))
								 (p-id   (if (> (length parts) 0)
									     (string->number (car parts))
									     #f)))
							    (if p-id
								(begin
								  (debug:print 0 "Killing " (cadr parts) "; kill -9  " p-id)
								  (system (conc "kill -9 " p-id))))))
							(car processes))
						       (system (conc "kill -9 -" pid))))
						    (begin
						      (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
						      (let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
							(for-each 
							 (lambda (p)
							   (let* ((parts  (string-split p))
								  (p-id   (if (> (length parts) 0)
									      (string->number (car parts))
									      #f)))
							     (if p-id
								 (begin
								   (debug:print 0 "Killing " (cadr parts) "; kill -9  " p-id)
								   ;; (process-signal pid signal/kill))))) ;; 
								   (system (conc "kill -9 " p-id))))))
							 (car processes)))
						      (system (conc "kill -9 -" pid))
						      (tests:test-set-status! test-id "KILLED"  "KILLED" (args:get-arg "-m") #f)))
						   (begin
						     (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process")
						     (tests:test-set-status! test-id "KILLED"  "FAIL"
						     (tests:test-set-status! test-id "KILLED"  "KILLED" (args:get-arg "-m") #f)
								     (args:get-arg "-m") #f)
						     (sqlite3:finalize! tdb)
						     (exit 1))))
						     (exit 1) ;; IS THIS NECESSARY OR WISE???
						     )))
					     (set! kill-tries (+ 1 kill-tries))
					     (mutex-unlock! m)))
				       ;; (sqlite3:finalize! db)
				       (if keep-going
					   (begin
				       (thread-sleep! (+ 10 (random 10))) ;; add some jitter to the call home time to spread out the db accesses
				       (loop (calc-minutes)))))))
		 (th1          (make-thread monitorjob))
		 (th2          (make-thread runit)))
					     (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
					     (if keep-going
						 (loop (calc-minutes)))))))
				   (tests:update-central-meta-info test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)))) ;; NOTE: Checking twice for keep-going is intentional
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job")))
	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)
	    (set! keep-going #f)
	    (thread-join! th1)
	    ;; (thread-sleep! 1)
	    ;; (thread-terminate! th1) ;; Not sure if this is a good idea
	    (thread-sleep! 1)       ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
	    ;; (tests:update-central-meta-info test-id cpuload diskfree minutes #f #f)
	    (mutex-lock! m)
	    (let* ((item-path (item-list->path itemdat))
		   ;; only state and status needed - use lazy routine
		   (testinfo  (cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path)))
		   (testinfo  (cdb:remote-run db:get-testinfo-state-status #f test-id))) ;;;(cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path)))
	      ;; Am I completed?
	      (if (not (equal? (db:test-get-state testinfo) "COMPLETED"))
	      (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED"))
		  (begin
		    (debug:print 2 "Test NOT logged as COMPLETED, (state=" (db:test-get-state testinfo) "), updating result, rollup-status is " rollup-status)
		    (tests:test-set-status! test-id 
				    (if kill-job? "KILLED" "COMPLETED")
				    (cond
		  (let ((new-state  (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status
				                                        ;; "COMPLETED"
							                ;; (db:test-get-state testinfo)))   ;; else preseve the state as set within the test
				    )
			(new-status (cond
				     ((not (vector-ref exit-info 1)) "FAIL") ;; job failed to run
				     ((eq? rollup-status 0)
				      ;; if the current status is AUTO the defer to the calculated value (i.e. leave this AUTO)
				      ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO)
				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS"))
				     ((eq? rollup-status 1) "FAIL")
				     ((eq? rollup-status 2)
				      ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN)
				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN"))
				     (else "FAIL"))
				    (args:get-arg "-m") #f)))
				     (else "FAIL")))) ;; (db:test-get-status testinfo)))
		    (debug:print-info 1 "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (vector-ref exit-info 1) " and rollup-status of " rollup-status)
		    (tests:test-set-status! test-id 
					    new-state
					    new-status
					    (args:get-arg "-m") #f)
		    ;; need to update the top test record if PASS or FAIL and this is a subtest
		    ;; NO NEED TO CALL roll-up-pass-fail-counts HERE, THIS IS DONE IN roll-up-pass-fail-counts called by tests:test-set-status!
		    ;; (if (not (equal? item-path ""))
		    ;;     (begin
		    ;;       (thread-sleep! 0.1) ;; give other processes an opportunity to access the db as rollup is lower priority
		    ;;       (cdb:roll-up-pass-fail-counts *runremote* run-id test-name item-path new-status)))
		    ))
	      ;; for automated creation of the rollup html file this is a good place...
	      (if (not (equal? item-path ""))
		  (tests:summarize-items #f run-id test-name #f)) ;; don't force - just update if no
		  (tests:summarize-items #f run-id test-id test-name #f))) ;; don't force - just update if no
	      )
	    (mutex-unlock! m)
	    ;; (exec-results (cmd-run->list fullrunscript)) ;;  (list ">" (conc test-name "-run.log"))))
	    ;; (success      exec-results)) ;; (eq? (cadr exec-results) 0)))
	    (debug:print 2 "Output from running " fullrunscript ", pid " (vector-ref exit-info 0) " in work area " 
			 work-area ":\n====\n exit code " (vector-ref exit-info 2) "\n" "====\n")
	    ;; (sqlite3:finalize! db)
	    ;; (sqlite3:finalize! tdb)
	    (if (not (vector-ref exit-info 1))
		(exit 4)))))))

;; set up the very basics needed for doing anything here.
(define (setup-for-run)
(define (launch:setup-for-run #!key (force #f))
  ;; would set values for KEYS in the environment here for better support of env-override but 
  ;; have chicken/egg scenario. need to read megatest.config then read it again. Going to 
  ;; pass on that idea for now
  ;; special case
  (if (or force (not (hash-table? *configdat*)))  ;; no need to re-open on every call
      (begin
  (set! *configinfo* (find-and-read-config 
		      (if (args:get-arg "-config")(args:get-arg "-config") "megatest.config")
		      environ-patt: "env-override"
		      given-toppath: (get-environment-variable "MT_RUN_AREA_HOME")
		      pathenvvar: "MT_RUN_AREA_HOME"))
  (set! *configdat*  (if (car *configinfo*)(car *configinfo*) #f))
  (set! *toppath*    (if (car *configinfo*)(cadr *configinfo*) #f))
  (if *toppath*
      (setenv "MT_RUN_AREA_HOME" *toppath*) ;; to be deprecated
      (debug:print 0 "ERROR: failed to find the top path to your run setup."))
	(set! *configinfo* (find-and-read-config 
			    (if (args:get-arg "-config")(args:get-arg "-config") "megatest.config")
			    environ-patt: "env-override"
			    given-toppath: (get-environment-variable "MT_RUN_AREA_HOME")
			    pathenvvar: "MT_RUN_AREA_HOME"))
	(set! *configdat*  (if (car *configinfo*)(car *configinfo*) #f))
	(set! *toppath*    (if (car *configinfo*)(cadr *configinfo*) #f))
	(if *toppath*
	    (setenv "MT_RUN_AREA_HOME" *toppath*) ;; to be deprecated
	    (debug:print 0 "ERROR: failed to find the top path to your Megatest area."))))
  *toppath*)

(define (get-best-disk confdat)
  (let* ((disks    (hash-table-ref/default confdat "disks" #f))
	 (best     #f)
	 (bestsize 0))
    (if disks 
404
405
406
407
408
409
410
411

412
413
414
415
416
417

418
419
420
421
422
423
424
488
489
490
491
492
493
494

495
496
497
498
499
500

501
502
503
504
505
506
507
508







-
+





-
+







;; 
;;  <linkdir> - <target> - <testname> [ - <itempath> ]
;; 
;; All log file links should be stored relative to the top of link path
;;  
;; <target> - <testname> [ - <itempath> ] 
;;
(define (create-work-area run-id run-info key-vals test-id test-src-path disk-path testname itemdat)
(define (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat)
  (let* ((item-path (item-list->path itemdat))
	 (runname  (db:get-value-by-header (db:get-row run-info)
					   (db:get-header run-info)
					   "runname"))
	 ;; convert back to db: from rdb: - this is always run at server end
	 (target   (string-intersperse key-vals "/"))
	 (target   (string-intersperse (map cadr keyvals) "/"))

	 (not-iterated  (equal? "" item-path))

	 ;; all tests are found at <rundir>/test-base or <linkdir>/test-base
	 (testtop-base (conc target "/" runname "/" testname))
	 (test-base    (conc testtop-base (if not-iterated "" "/") item-path))

439
440
441
442
443
444
445
446


447
448
449
450
451
452
453
523
524
525
526
527
528
529

530
531
532
533
534
535
536
537
538







-
+
+








    (debug:print 2 "INFO:\n       lnkbase=" lnkbase "\n       lnkpath=" lnkpath "\n  toptest-path=" toptest-path "\n     test-path=" test-path)
    (if (not (file-exists? linktree))
	(begin
	  (debug:print 0 "WARNING: linktree did not exist! Creating it now at " linktree)
	  (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree))))
    ;; create the directory for the tests dir links, this is needed no matter what...
    (if (not (directory-exists? lnkbase))
    (if (and (not (directory-exists? lnkbase))
	     (not (file-exists? lnkbase)))
	(create-directory lnkbase #t))
    
    ;; update the toptest record with its location rundir, cache the path
    ;; This wass highly inefficient, one db write for every subtest, potentially
    ;; thousands of unnecessary updates, cache the fact it was set and don't set it 
    ;; again. 

513
514
515
516
517
518
519
520

521
522
523
524
525
526
527
598
599
600
601
602
603
604

605
606
607
608
609
610
611
612







-
+







		       " - creating link from: " test-path "\n"
		       "                   to: " lnktarget)

	  ;; If there is already a symlink delete it and recreate it.
	  (handle-exceptions
	   exn
	   (begin
	     (debug:print 0 "ERROR:  Failed to re-create link " linktarget ((condition-property-accessor 'exn 'message) exn) ", exiting")
	     (debug:print 0 "ERROR:  Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting")
	     (exit))
	   (if (symbolic-link? lnktarget)     (delete-file lnktarget))
	   (if (not (file-exists? lnktarget)) (create-symbolic-link test-path lnktarget)))))

    ;; I suspect this section was deleting test directories under some 
    ;; wierd sitations? This doesn't make sense - reenabling the rm -f 
    ;; I honestly don't remember *why* this chunk was needed...
552
553
554
555
556
557
558
559

560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575












576


577
578
579
580
581
582
583
584
585
586
587


588
589

590

591
592
593
594
595
596
597
598
599
600

601
602

603
604
605
606
607
608





609
610
611

612
613
614
615
616
617
618
637
638
639
640
641
642
643

644
645
646
647
648
649
650
651
652
653







654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677


678
679
680
681
682

683
684
685
686
687
688
689

690
691

692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708

709
710
711
712
713
714
715
716







-
+









-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+

+
+









-
-
+
+


+
-
+






-


-
+


+






+
+
+
+
+


-
+







;; 1. look though disks list for disk with most space
;; 2. create run dir on disk, path name is meaningful
;; 3. create link from run dir to megatest runs area 
;; 4. remotely run the test on allocated host
;;    - could be ssh to host from hosts table (update regularly with load)
;;    - could be netbatch
;;      (launch-test db (cadr status) test-conf))
(define (launch-test test-id run-id run-info key-vals runname test-conf keyvallst test-name test-path itemdat params)
(define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
  (change-directory *toppath*)
  (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
   (list ;; (list "MT_TEST_RUN_DIR" work-area)
    (list "MT_RUN_AREA_HOME" *toppath*)
    (list "MT_TEST_NAME" test-name)
    ;; (list "MT_ITEM_INFO" (conc itemdat)) 
    (list "MT_RUNNAME"   runname)
    ;; (list "MT_TARGET"    mt_target)
    ))
  (let* ((useshell   (config-lookup *configdat* "jobtools"     "useshell"))
	 (launcher   (config-lookup *configdat* "jobtools"     "launcher"))
	 (runscript  (config-lookup test-conf   "setup"        "runscript"))
	 (ezsteps    (> (length (hash-table-ref/default test-conf "ezsteps" '())) 0)) ;; don't send all the steps, could be big
	 (diskspace  (config-lookup test-conf   "requirements" "diskspace"))
	 (memory     (config-lookup test-conf   "requirements" "memory"))
	 (hosts      (config-lookup *configdat* "jobtools"     "workhosts"))
  (let* ((useshell        (let ((ush (config-lookup *configdat* "jobtools"     "useshell")))
			    (if ush 
				(if (equal? ush "no") ;; must use "no" to NOT use shell
				    #f
				    ush)
				#t)))     ;; default is yes
	 (launcher        (config-lookup *configdat* "jobtools"     "launcher"))
	 (runscript       (config-lookup test-conf   "setup"        "runscript"))
	 (ezsteps         (> (length (hash-table-ref/default test-conf "ezsteps" '())) 0)) ;; don't send all the steps, could be big
	 (diskspace       (config-lookup test-conf   "requirements" "diskspace"))
	 (memory          (config-lookup test-conf   "requirements" "memory"))
	 (hosts           (config-lookup *configdat* "jobtools"     "workhosts"))
	 (remote-megatest (config-lookup *configdat* "setup" "executable"))
	 (run-time-limit  (or (configf:lookup  test-conf   "requirements" "runtimelim")
			      (configf:lookup  *configdat* "setup" "runtimelim")))
	 ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to 
	 ;;                allow running from dashboard. Extract the path
	 ;;                from the called megatest and convert dashboard
	 ;;             	  or dboard to megatest
	 (local-megatest  (let* ((lm  (car (argv)))
				 (dir (pathname-directory lm))
				 (exe (pathname-strip-directory lm)))
			    (conc (if dir (conc dir "/") "")
				  (case (string->symbol exe)
				    ((dboard)    "megatest")
				    ((mtest)     "megatest")
				    ((dboard)    "../megatest")
				    ((mtest)     "../megatest")
				    ((dashboard) "megatest")
				    (else exe)))))
	 (item-path  (item-list->path itemdat))
	 (test-sig   (conc test-name ":" (item-list->path itemdat))) ;; test-path is the full path including the item-path
	 (test-sig   (conc test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path
	 (work-area  #f)
	 (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all
	 (diskpath   #f)
	 (cmdparms   #f)
	 (fullcmd    #f) ;; (define a (with-output-to-string (lambda ()(write x))))
	 (mt-bindir-path #f)
	 (item-path (item-list->path itemdat))
	 ;; (test-id    (cdb:remote-run db:get-test-id #f run-id test-name item-path))
	 (testinfo   (cdb:get-test-info-by-id *runremote* test-id))
	 (mt_target  (string-intersperse (map cadr keyvallst) "/"))
	 (mt_target  (string-intersperse (map cadr keyvals) "/"))
	 (debug-param (append (if (args:get-arg "-debug")  (list "-debug" (args:get-arg "-debug")) '())
			      (if (args:get-arg "-logging")(list "-logging") '()))))
    (setenv "MT_ITEMPATH" item-path)
    (if hosts (set! hosts (string-split hosts)))
    ;; set the megatest to be called on the remote host
    (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest"))
    (set! mt-bindir-path (pathname-directory remote-megatest))
    (if launcher (set! launcher (string-split launcher)))
    ;; set up the run work area for this test
    (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run
	     (not (equal? (db:test-get-rundir testinfo) "n/a"))) ;; n/a is a placeholder and thus not a read dir
	(begin 
	  (debug:print-info 0 "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path)
	  (runs:remove-test-directory #f testinfo #t))) ;; remove data only, do not perturb the record
    (set! diskpath (get-best-disk *configdat*))
    (if diskpath
	(let ((dat  (create-work-area run-id run-info key-vals test-id test-path diskpath test-name itemdat)))
	(let ((dat  (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
	  (set! work-area (car dat))
	  (set! toptest-work-area (cadr dat))
	  (debug:print-info 2 "Using work area " work-area))
	(begin
	  (set! work-area (conc test-path "/tmp_run"))
	  (create-directory work-area #t)
	  (debug:print 0 "WARNING: No disk work area specified - running in the test directory under tmp_run")))
629
630
631
632
633
634
635

636
637
638
639

640
641
642
643
644
645
646
727
728
729
730
731
732
733
734
735
736
737

738
739
740
741
742
743
744
745







+



-
+







				     (list 'runscript runscript) 
				     (list 'run-id    run-id   )
				     (list 'test-id   test-id  )
				     (list 'itemdat   itemdat  )
				     (list 'megatest  remote-megatest)
				     (list 'ezsteps   ezsteps) 
				     (list 'target    mt_target)
				     (list 'runtlim   (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
				     (list 'env-ovrd  (hash-table-ref/default *configdat* "env-override" '())) 
				     (list 'set-vars  (if params (hash-table-ref/default params "-setvars" #f)))
				     (list 'runname   runname)
				     (list 'mt-bindir-path mt-bindir-path))))))) ;; (string-intersperse keyvallst " "))))
				     (list 'mt-bindir-path mt-bindir-path)))))))
    ;; clean out step records from previous run if they exist
    ;; (debug:print-info 4 "FIXMEEEEE!!!! This can be removed some day, perhaps move all test records to the test db?")
    ;; (open-run-close db:delete-test-step-records db test-id)
    (change-directory work-area) ;; so that log files from the launch process don't clutter the test dir
    (tests:test-set-status! test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED"))
    (cond
     ((and launcher hosts) ;; must be using ssh hostname
663
664
665
666
667
668
669

670
671

672


673
674
675
676




677
678
679
680
681
682
683
684
685








686
687
688
689
690
691
692
762
763
764
765
766
767
768
769
770
771
772

773
774
775
776
777

778
779
780
781
782
783
784
785





786
787
788
789
790
791
792
793
794
795
796
797
798
799
800







+


+
-
+
+



-
+
+
+
+




-
-
-
-
-
+
+
+
+
+
+
+
+







			    (hash-table-ref/default test-conf "pre-launch-env-overrides" '())))
	   (miscprevvals   (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
			    (append (list (list "MT_TEST_RUN_DIR" work-area)
					  (list "MT_TEST_NAME" test-name)
					  (list "MT_ITEM_INFO" (conc itemdat)) 
					  (list "MT_RUNNAME"   runname)
					  (list "MT_TARGET"    mt_target)
					  (list "MT_ITEMPATH"  item-path)
					  )
				    itemdat)))
	   ;; Launchwait defaults to true, must override it to turn off wait
	   (launch-results (apply (if (equal? (configf:lookup *configdat* "setup" "launchwait") "yes")
	   (launchwait     (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t))
	   (launch-results (apply (if launchwait
				      cmd-run-with-stderr->list
				      process-run)
				  (if useshell
				      (string-intersperse fullcmd " ")
				      (let ((cmdstr (string-intersperse fullcmd " ")))
					(if launchwait
					    cmdstr
					    (conc cmdstr " >> mt_launch.log 2>&1")))
				      (car fullcmd))
				  (if useshell
				      '()
				      (cdr fullcmd)))))
      (if (list? launch-results)
	  (with-output-to-file "mt_launch.log"
	    (lambda ()
	      (apply print launch-results))
	    #:append))
      (if (not launchwait) ;; give the OS a little time to allow the process to start
	  (thread-sleep! 0.01))
      (with-output-to-file "mt_launch.log"
	(lambda ()
	  (if (list? launch-results)
	      (apply print launch-results)
	      (print "NOTE: launched \"" fullcmd "\"\n  but did not wait for it to proceed. Add the following to megatest.config \n[setup]\nlaunchwait yes\n  if you have problems with this"))
	  #:append))
      (debug:print 2 "Launching completed, updating db")
      (debug:print 2 "Launch results: " launch-results)
      (if (not launch-results)
          (begin
            (print "ERROR: Failed to run " (string-intersperse fullcmd " ") ", exiting now")
            ;; (sqlite3:finalize! db)
            ;; good ole "exit" seems not to work