Megatest

Check-in [36d5293a01]
Login
Overview
Comment:Added kill of -runtests processes if -remove-runs is called with test patt of %
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: 36d5293a018d5658fd3d9ed5619b4fe55330eba1
User & Date: matt on 2014-10-16 23:58:07
Other Links: branch diff | manifest | tags
Context
2014-10-17
00:19
Changed default server run time to 60 seconds. Removed wait on RUNNING for servers - caused more hassle than benefit check-in: 9818a847b5 user: matt tags: v1.60
2014-10-16
23:58
Added kill of -runtests processes if -remove-runs is called with test patt of % check-in: 36d5293a01 user: matt tags: v1.60
11:20
Fixed jobgroups (added sensitivity to n/a) and num running check-in: 7ecb5e5d84 user: mrwellan tags: v1.60
Changes

Modified launch.scm from [ba7f6d2131] to [acb41eb596].

511
512
513
514
515
516
517
518

519
520
521
522
523
524
525
511
512
513
514
515
516
517

518
519
520
521
522
523
524
525







-
+







		 (begin
		   (set! best     dirpath)
		   (set! bestsize freespc)))))
	 (map car disks)))
    (if (and best (> bestsize 0))
	best
	(begin
	  (if (common:low-noise-print 20 "disks" disk-num)
	  (if (common:low-noise-print 20 "no valid disks")
	      (debug:print 0 "ERROR: No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists!"))
	  (exit 1)))))

;; Desired directory structure:
;;
;;  <linkdir> - <target> - <testname> -.
;;                                     |

Modified runs.scm from [aabc29c272] to [0a93b5efd0].

192
193
194
195
196
197
198

199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215



216











217
218
219
220
221
222
223
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215

216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237







+
















-
+
+
+

+
+
+
+
+
+
+
+
+
+
+







				       (>= num-running-in-jobgroup job-group-limit))
				  (if (runs:lownoise (conc "maxjobgroup " jobgroup) 60)
				      (debug:print 1 "WARNING: number of jobs " num-running-in-jobgroup 
						   " in jobgroup \"" jobgroup "\" exceeds limit of " job-group-limit))
				  #t)
				 (else #f))))
	  (list (not can-not-run-more) num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)))))


;;  test-names: Comma separated patterns same as test-patts but used in selection 
;;              of tests to run. The item portions are not respected.
;;              FIXME: error out if /patt specified
;;            
(define (runs:run-tests target runname test-patts user flags #!key (run-count 3)) ;; test-names
  (let* ((keys               (keys:config-get-fields *configdat*))
	 (keyvals            (keys:target->keyval keys target))
	 (run-id             (rmt:register-run keyvals runname "new" "n/a" user))  ;;  test-name)))
	 (deferred          '()) ;; delay running these since they have a waiton clause
	 (runconfigf         (conc  *toppath* "/runconfigs.config"))
	 (test-records       (make-hash-table))
	 ;; need to process runconfigs before generating these lists
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)) ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (required-tests     #f)  ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 (tasks-db           (tasks:open-db)))

    (set-signal-handler! signal/int
			 (lambda (signum)
			   (let ((tdb (tasks:open-db)))
			     (tasks:set-state-given-param-key tdb task-key "killed")
			     (sqlite3:finalize! tdb))
			   (print "Killed by sigint. Exiting")
			   (exit)))

    ;; register this run in monitor.db
    (tasks:add tasks-db "run-tests" user target runname test-patts task-key) ;; params)
    (tasks:set-state-given-param-key tasks-db task-key "running")
    (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process
    (if (file-exists? runconfigf)
	(setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target)
	(debug:print 0 "WARNING: You do not have a run config file: " runconfigf))

    ;; Now generate all the tests lists
    (set! all-tests-registry (tests:get-all))
352
353
354
355
356
357
358

359

360
361
362
363
364
365
366
366
367
368
369
370
371
372
373

374
375
376
377
378
379
380
381







+
-
+







		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS"))
		  (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))))
	  (debug:print-info 0 "No tests to run")))
    (debug:print-info 4 "All done by here")
    (tasks:set-state-given-param-key tasks-db task-key "done")
    ))
    (sqlite3:finalize! tasks-db)))


;; loop logic. These are used in runs:run-tests-queue to make it a bit more readable.
;;
;; If reg not full and have items in tal then loop with (car tal)(cdr tal) reg reruns
;; If reg is full (i.e. length >= n)
;;   loop with (car reg) tal (cdr reg) reruns
1351
1352
1353
1354
1355
1356
1357

1358
1359
1360
1361
1362
1363
1364
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380







+







;;    'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 (tasks-db     (tasks:open-db))
	 (keys         (rmt:get-keys))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))
	 (runs         (vector-ref rundat 1))
	 (states       (if state  (string-split state  ",") '()))
	 (statuses     (if status (string-split status ",") '()))
	 (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))))
1377
1378
1379
1380
1381
1382
1383

1384
1385
1386
1387
1388
1389
1390
1391
1392




1393
1394
1395
1396
1397
1398
1399
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420







+









+
+
+
+







						    testpatt states statuses
						    not-in:  #f
						    sort-by: (case action
							       ((remove-runs) 'rundir)
							       (else          'event_time))))))
	 (let* ((run-id    (db:get-value-by-header run header "id"))
		(run-state (db:get-value-by-header run header "state"))
		(run-name  (db:get-value-by-header run header "runname"))
		(tests     (if (not (equal? run-state "locked"))
			       (proc-get-tests run-id)
			       '()))
		(lasttpath "/does/not/exist/I/hope"))
	   (debug:print-info 4 "runs:operate-on run=" run ", header=" header)
	   (if (not (null? tests))
	       (begin
		 (case action
		   ((remove-runs)
		    ;; seek and kill in flight -runtests with % as testpatt here
		    (if (equal? testpatt "%")
			(tasks:kill-runner tasks-db target run-name)
			(debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
		   ((run-wait)
1496
1497
1498
1499
1500
1501
1502
1503


1504
1505
1506
1507
1508
1509
1510
1517
1518
1519
1520
1521
1522
1523

1524
1525
1526
1527
1528
1529
1530
1531
1532







-
+
+







		       ;; need to figure out the path to the run dir and remove it if empty
		       ;;    (if (null? (glob (conc runpath "/*")))
		       ;;        (begin
		       ;; 	 (debug:print 1 "Removing run dir " runpath)
		       ;; 	 (system (conc "rmdir -p " runpath))))
		       )))))
	 ))
     runs))
     runs)
    (sqlite3:finalize! tasks-db))
  #t)

(define (runs:remove-test-directory db test remove-data-only)
  (let* ((run-dir       (db:test-get-rundir test))    ;; run dir is from the link tree
	 (real-dir      (if (file-exists? run-dir)
			    (resolve-pathname run-dir)
			    #f)))

Modified tasks.scm from [e478443442] to [3f21ed396f].

387
388
389
390
391
392
393
394

395
396
397
398
399

400
401
402
403
404
405
406
407
387
388
389
390
391
392
393

394
395
396
397
398

399

400
401
402
403
404
405
406







-
+




-
+
-







     "SELECT count(id) FROM monitors WHERE last_update < (strftime('%s','now') - 300) AND username=?;"
     (car (user-information (current-user-id))))
    res))

;; Register a task: insert one row into tasks_queue with state 'new'.
;;
;;   mdb      - sqlite3 database handle the INSERT is executed on
;;   action   - value for the action column
;;   owner    - value for the owner column
;;   target   - value for the target column
;;   runname  - value for the name column
;;   testpatt - value for the testpatt column
;;   params   - value for the params column; a false value is stored as ""
;;
;; creation_time is set by SQLite to the current epoch seconds and
;; execution_time is initialised to 0. The statement has six placeholders,
;; matched one-to-one by the six bound values below.
(define (tasks:add mdb action owner target runname testpatt params)
  (sqlite3:execute mdb "INSERT INTO tasks_queue (action,owner,state,target,name,testpatt,params,creation_time,execution_time)
                       VALUES (?,?,'new',?,?,?,?,strftime('%s','now'),0);" 
		   action
		   owner
		   target
		   runname
		   testpatt
		   (if params params "")))

(define (keys:key-vals-hash->target keys key-params)
  (let ((tmp (hash-table-ref/default key-params (vector-ref (car keys) 0) "")))
    (if (> (length keys) 1)
	(for-each (lambda (key)
		    (set! tmp (conc tmp "/" (hash-table-ref/default key-params (vector-ref key 0) ""))))
575
576
577
578
579
580
581

































































582
583
584
585
586
587
588
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







		   (current-process-id)
		   (get-host-name)))

;; Set the state column of the tasks_queue row whose id equals task-id.
;;
;;   mdb     - sqlite3 database handle the UPDATE is executed on
;;   task-id - id of the row to update
;;   state   - new value for the state column
(define (tasks:set-state mdb task-id state)
  (let ((update-sql "UPDATE tasks_queue SET state=? WHERE id=?;"))
    (sqlite3:execute mdb update-sql state task-id)))

;;======================================================================
;; Access using task key (stored in params): (hash-table->alist flags) hostname pid
;;======================================================================

;; Return the id of the tasks_queue row whose params column matches
;; task-params (compared with SQL LIKE). Any exception raised while running
;; the query is swallowed and #f is returned instead.
(define (tasks:param-key->id mdb task-params)
  (let ((lookup-sql "SELECT id FROM tasks_queue WHERE params LIKE ?;"))
    (handle-exceptions
     exn
     #f
     (sqlite3:first-result mdb lookup-sql task-params))))

;; Set the state column to new-state on every tasks_queue row whose params
;; column matches param-key (compared with SQL LIKE).
(define (tasks:set-state-given-param-key mdb param-key new-state)
  (let ((update-sql "UPDATE tasks_queue SET state=? WHERE params LIKE ?;"))
    (sqlite3:execute mdb update-sql new-state param-key)))

;; Fetch the first tasks_queue row whose params, state, action and testpatt
;; columns all match the supplied patterns (each compared with SQL LIKE).
;;
;;   mdb         - sqlite3 database handle the query is executed on
;;   param-key   - pattern for the params column
;;   state-patt  - pattern for the state column
;;   action-patt - pattern for the action column
;;   test-patt   - pattern for the testpatt column
;;
;; Returns the row as a list
;;   (id action owner state target name testpatt keylock params)
;; or '() if the query raises an exception.
;;
;; The statement previously had no FROM clause, so it always failed to
;; prepare and the handler below silently turned every call into '().
(define (tasks:get-records-given-param-key mdb param-key state-patt action-patt test-patt)
  (handle-exceptions
   exn
   '()
   (sqlite3:first-row mdb "SELECT id,action,owner,state,target,name,testpatt,keylock,params FROM tasks_queue WHERE
         params LIKE ? AND state LIKE ? AND action LIKE ? AND testpatt LIKE ?;"
		      param-key state-patt action-patt test-patt)))


;;======================================================================
;; Rogue items, no place to put these yet
;;======================================================================

;; Collect every tasks_queue row whose target and name equal the given
;; target and run-name, and whose state, action and testpatt match the
;; supplied patterns (compared with SQL LIKE).
;;
;; Each row is returned as a list
;;   (id action owner state target name testpatt keylock params)
;; Rows are consed onto the accumulator as they arrive, so the result is in
;; the reverse of the order the query produced them. Exceptions from the
;; query are not caught here.
(define (tasks:find-task-queue-records mdb target run-name test-patt state-patt action-patt)
  (let ((found '()))
    (sqlite3:for-each-row
     (lambda (id . other-columns)
       (let ((row (cons id other-columns)))
	 (set! found (cons row found))))
     mdb
     "SELECT id,action,owner,state,target,name,testpatt,keylock,params FROM tasks_queue 
       WHERE
         target = ? AND name = ? AND state LIKE ? AND action LIKE ? AND testpatt LIKE ?;"
     target run-name state-patt action-patt test-patt)
    found))


;; Find the "run-tests" tasks in state "running" for the given target and
;; run name and ask each owning process to stop.
;;
;;   mdb      - sqlite3 database handle passed to tasks:find-task-queue-records
;;   target   - target the run was launched for
;;   run-name - name of the run
;;
;; The host and pid of each launcher are recovered from the last two
;; whitespace-separated tokens of the record's params string. A process on
;; this host is sent SIGINT directly; a process on another host is killed by
;; running "nbfake kill <pid>" with TARGETHOST temporarily set to that host.
;; Records whose params string does not end in "<host> <pid>" are reported
;; and skipped.
(define (tasks:kill-runner mdb target run-name)
  (let ((records    (tasks:find-task-queue-records mdb target run-name "%" "running" "run-tests"))
	;; host and pid are at the end of the param string; \S+ (rather than
	;; \w+) so that hostnames containing dots or hyphens are accepted
	(hostpid-rx (regexp "\\s+(\\S+)\\s+(\\d+)$")))
    (if (null? records)
	(debug:print 0 "No run launching processes found for " target " / " run-name)
	(debug:print 0 "Found " (length records) " run(s) to kill."))
    (for-each
     (lambda (record)
       (let* ((param-key (list-ref record 8)) ;; params is the ninth selected column
	      (match-dat (and (string? param-key)
			      (string-search hostpid-rx param-key))))
	 (if (not match-dat)
	     ;; without this guard cadr/caddr would be applied to #f
	     (debug:print 0 "WARNING: could not extract host and pid from task params \"" param-key "\", skipping")
	     (let ((hostname (cadr match-dat))
		   (pid      (caddr match-dat)))
	       (debug:print 0 "Sending SIGINT to process " pid " on host " hostname)
	       (if (equal? (get-host-name) hostname)
		   (process-signal (string->number pid) signal/int)
		   ;; NOTE(review): a plain "kill" normally delivers SIGTERM, not
		   ;; SIGINT - confirm this is what the remote runner expects.
		   (let ((old-targethost (getenv "TARGETHOST")))
		     (setenv "TARGETHOST" hostname)
		     (system (conc "nbfake kill " pid))
		     ;; put the environment back the way it was found
		     (if old-targethost
			 (setenv "TARGETHOST" old-targethost)
			 (unsetenv "TARGETHOST"))))))))
     records)))


;;======================================================================
;; The routines to process tasks
;;======================================================================

;; NOTE: It might be good to add one more layer of checking to ensure
;;       that no task gets run in parallel.

Modified tests/fdktestqa/fdk.config from [e76cce79f3] to [bb2780b886].

17
18
19
20
21
22
23
24

25




26
27

28
29

17
18
19
20
21
22
23

24
25
26
27
28
29
30

31
32

33







-
+

+
+
+
+

-
+

-
+
[jobtools]
maxload 4
launcher nbfake

[server]
# timeout 0.01
# homehost xena
homehost 143.182.225.38
# homehost 143.182.225.38

# force server
server-query-threshold 0


[jobtools]
launcher nbq -P ch_vp -C SLES11_EM64T_4G -Q /ciaf/fdk
# launcher nbq -P ch_vp -C SLES11_EM64T_4G -Q /ciaf/fdk
# launcher nbfake
maxload 4
# maxload 4