Megatest

Check-in [658cf01137]
Login
Overview
Comment:Tweaks to fix the premature test launching issues
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | test-specific-db
Files: files | file ages | folders
SHA1: 658cf0113708c0330262254687c8a5bebfe1dc2b
User & Date: mrwellan on 2012-09-24 17:20:06
Other Links: branch diff | manifest | tags
Context
2012-09-24
17:20
Tweaks to fix the premature test launching issues Leaf check-in: 658cf01137 user: mrwellan tags: test-specific-db
00:02
Added comment on mode check-in: eb47e2c93d user: matt tags: test-specific-db
Changes

Modified db.scm from [589b14ba9e] to [c259cfdf98].

948
949
950
951
952
953
954
955
956
957



958
959
960
961

962
963
964
965
966
967
968
948
949
950
951
952
953
954



955
956
957
958
959
960

961
962
963
964
965
966
967
968







-
-
-
+
+
+



-
+







	(sqlite3:execute 
	 db
	 "UPDATE tests 
             SET fail_count=(SELECT count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND status='FAIL'),
                 pass_count=(SELECT count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND (status='PASS' OR status='WARN' OR status='WAIVED'))
             WHERE run_id=? AND testname=? AND item_path='';"
	 run-id test-name run-id test-name run-id test-name)
        (thread-sleep! 0.1) ;; give other processes a chance here
	(if (equal? status "RUNNING") ;; running takes priority over all other states, force the test state to RUNNING
	    (sqlite3:execute db "UPDATE tests SET state=? WHERE run_id=? AND testname=? AND item_path='';" "RUNNING" run-id test-name)
        ;; (thread-sleep! 0.1) ;; give other processes a chance here
	(if (member status '("NOT_STARTED" "LAUNCHED" "RUNNING" "REMOTEHOSTSTART")) ;; running takes priority over all other states, force the test state to RUNNING
	    (sqlite3:execute db "UPDATE tests SET state=? WHERE run_id=? AND testname=? AND item_path='';" status run-id test-name)
	    (sqlite3:execute
	     db
	     "UPDATE tests
                       SET state=CASE WHEN (SELECT count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND state in ('RUNNING','NOT_STARTED')) > 0 THEN 
                       SET state=CASE WHEN (SELECT count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND state in ('RUNNING','NOT_STARTED','LAUNCHED','REMOTEHOSTSTART')) > 0 THEN 
                          'RUNNING'
                       ELSE 'COMPLETED' END,
                          status=CASE WHEN fail_count > 0 THEN 'FAIL' WHEN pass_count > 0 AND fail_count=0 THEN 'PASS' ELSE 'UNKNOWN' END
                       WHERE run_id=? AND testname=? AND item_path='';"
	     run-id test-name run-id test-name))
	#f)
      #f))
1073
1074
1075
1076
1077
1078
1079
1080

1081
1082
1083
1084
1085
1086
1087
1073
1074
1075
1076
1077
1078
1079

1080
1081
1082
1083
1084
1085
1086
1087







-
+







    (if (not (eof-object? lin))
	(begin
	  (debug:print 4 lin)
	  (rdb:csv->test-data db test-id lin)
	  (loop (read-line)))))
  ;; roll up the current results.
  ;; FIXME: Add the status to 
  (rdb:test-data-rollup db test-id #f))
  (db:test-data-rollup db test-id #f))

;; WARNING: Do NOT call this for the parent test on an iterated test
;; Roll up test_data pass/fail results
;; look at the test_data status field, 
;;    if all are pass (any case) and the test status is PASS or NULL or '' then set test status to PASS.
;;    if one or more are fail (any case) then set test status to PASS, non "pass" or "fail" are ignored
(define (db:test-data-rollup db test-id status)
1220
1221
1222
1223
1224
1225
1226
1227

1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242

1243
1244
1245
1246

1247
1248
1249
1250
1251
1252
1253
1220
1221
1222
1223
1224
1225
1226

1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241

1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254







-
+














-
+




+







      '()
      (let* ((unmet-pre-reqs '())
	     (result         '()))
	(for-each 
	 (lambda (waitontest-name)
	   ;; by getting the tests with matching name we are looking only at the matching test 
	   ;; and related sub items
	   (let ((tests             (rdb:get-tests-for-run db run-id waitontest-name #f '() '()))
	   (let ((tests             (open-run-close db:get-tests-for-run db run-id waitontest-name #f '() '()))
		 (ever-seen         #f)
		 (parent-waiton-met #f)
		 (item-waiton-met   #f))
	     (for-each 
	      (lambda (test)
		;; (if (equal? waitontest-name (db:test-get-testname test)) ;; by defintion this had better be true ...
		(let* ((state             (db:test-get-state test))
		       (status            (db:test-get-status test))
		       (item-path         (db:test-get-item-path test))
		       (is-completed      (equal? state "COMPLETED"))
		       (is-ok             (member status '("PASS" "WARN" "CHECK" "WAIVED")))
		       (same-itempath     (equal? ref-item-path item-path)))
		  (set! ever-seen #t)
		  (cond
		   ;; case 1, non-item (parent test) is 
		   ;; case 1, non-item (parent test) is completed and ok
		   ((and (equal? item-path "") ;; this is the parent test
			 is-completed
			 (or is-ok (eq? mode 'toplevel)))
		    (set! parent-waiton-met #t))
		   ;; 
		   ((and same-itempath
			 is-completed
			 (or is-ok (eq? mode 'toplevel)))
		    (set! item-waiton-met #t)))))
	      tests)
	     (if (not (or parent-waiton-met item-waiton-met))
		 (set! result (append (if (null? tests) (list waitontest-name) tests) result)))

Modified launch.scm from [c4ef02d1b8] to [73000cbbad].

278
279
280
281
282
283
284
285

286
287
288
289
290
291
292
278
279
280
281
282
283
284

285
286
287
288
289
290
291
292







-
+







								  (system (conc "kill -9 " p-id))))))
							(car processes))
						       (system (conc "kill -9 " pid))))
						   (begin
						     (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process")
						     (open-run-close test-set-status! #f test-id "KILLED"  "FAIL"
								       (args:get-arg "-m") #f)
						     (sqlite3:finalize! tdb)
						     ;; (sqlite3:finalize! tdb)
						     (exit 1))))
					     (set! kill-tries (+ 1 kill-tries))
					     (mutex-unlock! m)))
				       ;; (sqlite3:finalize! db)
				       (thread-sleep! (+ 10 (random 10))) ;; add some jitter to the call home time to spread out the db accesses
				       (loop (calc-minutes)))))))
		 (th1          (make-thread monitorjob))
330
331
332
333
334
335
336
337
338


339
340
341
342
343
344
345
330
331
332
333
334
335
336


337
338
339
340
341
342
343
344
345







-
-
+
+







		  (open-run-close tests:summarize-items #f run-id test-name #f)) ;; don't force - just update if no
	      )
	    (mutex-unlock! m)
	    ;; (exec-results (cmd-run->list fullrunscript)) ;;  (list ">" (conc test-name "-run.log"))))
	    ;; (success      exec-results)) ;; (eq? (cadr exec-results) 0)))
	    (debug:print 2 "Output from running " fullrunscript ", pid " (vector-ref exit-info 0) " in work area " 
			 work-area ":\n====\n exit code " (vector-ref exit-info 2) "\n" "====\n")
	    (sqlite3:finalize! db)
	    (sqlite3:finalize! tdb)
	    ;; (sqlite3:finalize! db)
	    ;; (sqlite3:finalize! tdb)
	    (if (not (vector-ref exit-info 1))
		(exit 4)))))))

;; set up the very basics needed for doing anything here.
(define (setup-for-run)
  ;; would set values for KEYS in the environment here for better support of env-override but 
  ;; have chicken/egg scenario. need to read megatest.config then read it again. Going to 

Modified runs.scm from [5395094d8f] to [eac0e08035].

276
277
278
279
280
281
282





















283
284
285
286
287
288
289
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+








    (if (not (null? required-tests))
	(debug:print 1 "INFO: Adding " required-tests " to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (runs:run-tests-queue run-id runname test-records keyvallst flags)
    (debug:print 4 "INFO: All done by here")))


;; testname is hed and remtests is tal, can be testname strings or testqueue vectors
;; remaining-items are other items for the current test that have not been run yet
;; this is used in calculating the state of toplevel tests. They are NOT COMPLETED
;; until all items are COMPLETED and thus not in this list.
(define (runs:remaining-items testdat remtests)
  (let* ((testname    (tests:testqueue-get-testname testdat)) ;; extract the name of the test (may have vector record)
	 (itempath    (tests:testqueue-get-itempath testdat))
	 (toptestname (if (string? testname)
			  (car (string-split testname "/"))
			  (begin
			    (debug:print 0 "ERROR: Should have a string testname here! Please report this as a bug :(")
			    testname))))
    (filter (lambda (test)
	      (let ((tname (tests:testqueue-get-testname test))
		    (ipath (tests:testqueue-get-itempath test)))
		(and (equal? tname testname)
		     (and (not (equal? ipath ""))
			  (not (equal? ipath itempath))))))
	    remtests)))

;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
(define (runs:run-tests-queue run-id runname test-records keyvallst flags)
    ;; At this point the list of parent tests is expanded 
    ;; NB// Should expand items here and then insert into the run queue.
  (debug:print 5 "test-records: " test-records ", keyvallst: " keyvallst " flags: " (hash-table->alist flags))
  (let ((sorted-test-names (tests:sort-by-priority-and-waiton test-records))
	(item-patts        (hash-table-ref/default flags "-itempatt" #f)))
333
334
335
336
337
338
339
340

341
342
343
344
345
346
347
354
355
356
357
358
359
360

361
362
363
364
365
366
367
368







-
+







		(begin
		  (debug:print 0 "ERROR: test " hed " has listed itself as a waiton, please correct this!")
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (let* ((have-resources  (open-run-close runs:can-run-more-tests #f test-record)) ;; look at the test jobgroup and tot jobs running
		     (prereqs-not-met (open-run-close db:get-prereqs-not-met #f run-id waitons item-path mode: testmode))
		     (prereqs-not-met (db:get-prereqs-not-met #f run-id waitons item-path mode: testmode))
		     (fails           (calc-fails prereqs-not-met))
		     (non-completed   (calc-not-completed prereqs-not-met)))
		(debug:print 8 "INFO: have-resources: " have-resources " prereqs-not-met: " 
			     (string-intersperse 
			      (map (lambda (t)
				     (if (vector? t)
					 (conc (db:test-get-state t) "/" (db:test-get-status t))
357
358
359
360
361
362
363
364




365
366
367
368

369
370
371
372
373
374
375
376
377
378
379


380
381
382
383
384
385
386
378
379
380
381
382
383
384

385
386
387
388
389
390
391

392
393
394
395
396
397
398
399
400
401


402
403
404
405
406
407
408
409
410







-
+
+
+
+



-
+









-
-
+
+







		  ;; no loop here, just drop though and use the loop at the bottom 
		  (if (patt-list-match item-path item-patts)
		      (run:test run-id runname keyvallst test-record flags #f)
		      (debug:print 1 "INFO: Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " item-patts))
		  ;; else the run is stuck, temporarily or permanently
		  ;; but should check if it is due to lack of resources vs. prerequisites
		  )
		 ((not have-resources) ;; simply try again after waiting a second
		 ((not have-resources)
		  ;; simply try again after waiting a second, but register the test
		  ;; so the itemized tests have place holders
		  (open-run-close tests:register-test db run-id (tests:testqueue-get-testname hed) item-path)
		  (thread-sleep! (+ 1 *global-delta*))
		  (debug:print 1 "INFO: no resources to run new tests, waiting ...")
		  ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
		  (loop (car newtal)(cdr newtal)))
		  (loop hed tal)) ;; (car newtal)(cdr newtal))) WHY DID I REORDER!!? 
		 (else ;; must be we have unmet prerequisites
		    (debug:print 4 "FAILS: " fails)
		    ;; If one or more of the prereqs-not-met are FAIL then we can issue
		    ;; a message and drop hed from the items to be processed.
		    (if (null? fails)
			(begin
			  ;; couldn't run, take a breather
			  (debug:print 4 "INFO: Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...")
			  (thread-sleep! (+ 1 *global-delta*)) ;; long sleep here - no resources, may as well be patient
			  ;; we made new tal by sticking hed at the back of the list
			  (loop (car newtal)(cdr newtal)))
			  ;; we made new tal by sticking hed at the back of the list. BUT WHY?
			  (loop hed tal)) ;; (car newtal)(cdr newtal)))
			;; the waiton is FAIL so no point in trying to run hed ever again
			(if (not (null? tal))
			    (if (vector? hed)
				(begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed)
						    " from the launch list as it has prerequistes that are FAIL")
				       (loop (car tal)(cdr tal)))
				(begin
443
444
445
446
447
448
449
450

451
452
453
454
455
456
457
467
468
469
470
471
472
473

474
475
476
477
478
479
480
481







-
+







			    (if (list? items-list)
				(begin
				  (tests:testqueue-set-items! test-record items-list)
				  (loop hed tal))
				(begin
				  (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this")
				  (exit 1))))))
		       ((null? fails)
		       ((null? fails) ;; AGAIN, WHY DID I TRY TO ROTATE THE TESTS HERE?
			(loop (car newtal)(cdr newtal))) ;; an issue with prereqs not yet met?
		       ((and (not (null? fails))(eq? testmode 'normal))
			(debug:print 1 "INFO: test "  hed " (mode=" testmode ") has failed prerequisite(s); "
				     (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ")
				     ", removing it from to-do list")
			(if (not (null? tal))
			    (loop (car tal)(cdr tal))))

Modified test_records.scm from [9245906f33] to [4ca0c9e265].

1
2


3


4
5
6
7
8
9
10
11





12
13
14
15
16
17
18
1
2
3
4

5
6
7
8
9
10
11
12


13
14
15
16
17
18
19
20
21
22
23
24


+
+
-
+
+






-
-
+
+
+
+
+







;; make-vector-record tests testqueue testname testconfig waitons priority items
(define (make-tests:testqueue)(make-vector 7 #f))

;; modified to treat the param either as a string (pure name) or vec (testqueue record)
(define-inline (tests:testqueue-get-testname     vec)    (vector-ref  vec 0))
(define-inline (tests:testqueue-get-testname     vec)    
  (if (string? vec) (car (string-split vec "/"))(vector-ref  vec 0)))
(define-inline (tests:testqueue-get-testconfig   vec)    (vector-ref  vec 1))
(define-inline (tests:testqueue-get-waitons      vec)    (vector-ref  vec 2))
(define-inline (tests:testqueue-get-priority     vec)    (vector-ref  vec 3))
;; items: #f=no items, list=list of items remaining, proc=need to call to get items
(define-inline (tests:testqueue-get-items        vec)    (vector-ref  vec 4))
(define-inline (tests:testqueue-get-itemdat      vec)    (vector-ref  vec 5))
(define-inline (tests:testqueue-get-item_path    vec)    (vector-ref  vec 6))

(define-inline (tests:testqueue-get-item_path    vec)    
  (if (string? vec)
      (let ((tmp (cdr (string-split vec "/"))))
	(if (null? tmp) "" (car tmp))
	(vector-ref  vec 6))))
(define-inline (tests:testqueue-set-testname!    vec val)(vector-set! vec 0 val))
(define-inline (tests:testqueue-set-testconfig!  vec val)(vector-set! vec 1 val))
(define-inline (tests:testqueue-set-waitons!     vec val)(vector-set! vec 2 val))
(define-inline (tests:testqueue-set-priority!    vec val)(vector-set! vec 3 val))
(define-inline (tests:testqueue-set-items!       vec val)(vector-set! vec 4 val))
(define-inline (tests:testqueue-set-itemdat!     vec val)(vector-set! vec 5 val))
(define-inline (tests:testqueue-set-item_path!   vec val)(vector-set! vec 6 val))

Modified tests.scm from [9c29f324c9] to [fafb372d78].

369
370
371
372
373
374
375
376
377
378
379



380
381
382
383
384
385
386
387
388

389

390
391
392


393
394
395
396
397
398
399
369
370
371
372
373
374
375




376
377
378
379
380
381
382
383
384
385
386
387
388

389



390
391
392
393
394
395
396
397
398







-
-
-
-
+
+
+









+
-
+
-
-
-
+
+







	      (waitons     (tests:testqueue-get-waitons   test-record))
	      (keep-test   #t)
	      (test-id     (db:get-test-id db run-id test-name item-path))
	      (tdat        (db:get-test-info-by-id db test-id)))
	 (if tdat
	     (begin
	       ;; Look at the test state and status
	       (if (or (member (db:test-get-status tdat) 
			       '("PASS" "WARN" "WAIVED" "CHECK"))
		       (member (db:test-get-state tdat)
			       '("INCOMPLETE" "KILLED")))
	       (if (or (and (member (db:test-get-status tdat) '("PASS" "WARN" "WAIVED" "CHECK"))
			    (equal? (db:test-get-state tdat)  "COMPLETED"))
		       (member (db:test-get-state tdat)       '("INCOMPLETE" "KILLED")))
		   (set! keep-test #f))

	       ;; examine waitons for any fails. If it is FAIL or INCOMPLETE then eliminate this test
	       ;; from the runnable list
	       (if keep-test
		   (for-each (lambda (waiton)
			       ;; for now we are waiting only on the parent test
			       (let* ((parent-test-id (db:get-test-id db run-id waiton ""))
				      (wtdat (db:get-test-info-by-id db test-id)))
				 (if (or (and (equal? (db:test-get-state wtdat) "COMPLETED")
				 (if (or (member (db:test-get-status wtdat)
					      (member (db:test-get-status wtdat) '("FAIL")))
						 '("FAIL" "KILLED"))
					 (member (db:test-get-state wtdat)
						 '("INCOMPETE")))
					 (member (db:test-get-status wtdat)  '("KILLED"))
					 (member (db:test-get-state wtdat)   '("INCOMPETE")))
				     (set! keep-test #f)))) ;; no point in running this one again
			     waitons))))
	 (if keep-test (set! runnables (cons testkeyname runnables)))))
     testkeynames)
    runnables))

;;======================================================================