Megatest

Check-in [7383eb0df2]
Login
Overview
Comment: Merged 31c3 from v1.55 into v1.60 and fixed couple compile issues
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: 7383eb0df296776e15095ed546e62c9d6105e528
User & Date: mrwellan on 2014-06-02 13:33:40
Other Links: branch diff | manifest | tags
Context
2014-06-02
13:38
Merged f2d7 from v1.55 to v1.60 check-in: e30eb474c8 user: mrwellan tags: v1.60
13:33
Merged 31c3 from v1.55 into v1.60 and fixed couple compile issues check-in: 7383eb0df2 user: mrwellan tags: v1.60
11:21
Merged 0f5d from v1.55 to v1.60 check-in: ce8b9e0b55 user: mrwellan tags: v1.60
2014-03-28
11:45
Fixed missing call to set state/status correctly on killreq check-in: 31c35bf056 user: mrwellan tags: v1.55
Changes

Modified db.scm from [b980dbcc63] to [c98f61a5ff].

1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021

























2022
2023
2024
2025
2026
2027
2028
1990
1991
1992
1993
1994
1995
1996

























1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







				    (> (db:test-get-event_time testdat)(db:test-get-event_time stored-test))))
			   ;; this test is younger, store it in the hash
			   (hash-table-set! tests-hash full-testname testdat))))
		   results)
		  (if (null? tal)
		      (map cdr (hash-table->alist tests-hash)) ;; return a list of the most recent tests
		      (loop (car tal)(cdr tal))))))))))
			   (let* ((remtries 10)
				  (proc     #f))
			     (set! proc (lambda (remtries)
					  (if (> remtries 0)
					      (handle-exceptions
					       exn
					       (let ((sleep-time (random 30))
						     (err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
						 (case err-status
						   ((busy)
						    (thread-sleep! sleep-time)
						    (proc 10)) ;; we never give up on busy
						   (else
						    (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
						    (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
						    (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status)  exn))
						    (print-call-chain)
						    (debug:print 0 "Sleeping for " sleep-time)
						    (thread-sleep! sleep-time)
						    (debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up")
						    (proc (- remtries 1)))))
					       (apply sqlite3:execute db query params))
					      (debug:print 0 "ERROR: too many attempts to access db were made and no sucess. query: "
							   query ", params: " params))))
			     (proc remtries))
;; 			   (let* ((remtries 10)
;; 				  (proc     #f))
;; 			     (set! proc (lambda (remtries)
;; 					  (if (> remtries 0)
;; 					      (handle-exceptions
;; 					       exn
;; 					       (let ((sleep-time (random 30))
;; 						     (err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
;; 						 (case err-status
;; 						   ((busy)
;; 						    (thread-sleep! sleep-time)
;; 						    (proc 10)) ;; we never give up on busy
;; 						   (else
;; 						    (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
;; 						    (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
;; 						    (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status)  exn))
;; 						    (print-call-chain)
;; 						    (debug:print 0 "Sleeping for " sleep-time)
;; 						    (thread-sleep! sleep-time)
;; 						    (debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up")
;; 						    (proc (- remtries 1)))))
;; 					       (apply sqlite3:execute db query params))
;; 					      (debug:print 0 "ERROR: too many attempts to access db were made and no sucess. query: "
;; 							   query ", params: " params))))
;; 			     (proc remtries))

(define (db:test-get-records-for-index-file dbstruct run-id test-name)
  (let ((res '()))
    (sqlite3:for-each-row 
     (lambda (id itempath state status run_duration logf-id comment-id)
       (let ((logf    (db:get-string dbstruct logf-id))
	     (comment (db:get-string dbstruct comment-id)))

Modified launch.scm from [dd204b865e] to [e833897f83].

328
329
330
331
332
333
334



335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350


















351
352
353
354


355
356
357
358
359
360
361
328
329
330
331
332
333
334
335
336
337
















338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357


358
359
360
361
362
363
364
365
366







+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


-
-
+
+







					   (begin
					     (mutex-lock! m)
					     ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
					     ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
					     ;;       between tries?
					     (let* ((pid (vector-ref exit-info 0)))
					       (if (number? pid)
						   (handle-exceptions
						    exn
						    (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
						   (process-signal pid signal/kill)
						   ;; (begin
						   ;;   (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
						   ;;   (let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
						   ;;     (for-each 
						   ;;      (lambda (p)
						   ;;        (let* ((parts  (string-split p))
						   ;;      	 (p-id   (if (> (length parts) 0)
						   ;;      		     (string->number (car parts))
						   ;;      		     #f)))
						   ;;          (if p-id
						   ;;      	(begin
						   ;;      	  (debug:print 0 "Killing " (cadr parts) "; kill -9  " p-id)
						   ;;      	  (system (conc "kill -9 " p-id))))))
						   ;;      (car processes))
						   ;;     (system (conc "kill -9 -" pid))))
						    ;;(process-signal pid signal/kill))
						    (begin
						      (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
						      (let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
							(for-each 
							 (lambda (p)
							   (let* ((parts  (string-split p))
								  (p-id   (if (> (length parts) 0)
									      (string->number (car parts))
									      #f)))
							     (if p-id
								 (begin
								   (debug:print 0 "Killing " (cadr parts) "; kill -9  " p-id)
								   ;; (process-signal pid signal/kill))))) ;; 
								   (system (conc "kill -9 " p-id))))))
							 (car processes)))
						      (system (conc "kill -9 -" pid))
						      (tests:test-set-status! test-id "KILLED"  "FAIL" (args:get-arg "-m") #f)))
						   (begin
						     (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process")
						     (tests:test-set-status! run-id test-id "KILLED"  "FAIL"
								     (args:get-arg "-m") #f)
;;						     (tests:test-set-status! run-id test-id "KILLED"  "FAIL"
						     (tests:test-set-status! run-id test-id "KILLED"  "FAIL" (args:get-arg "-m") #f)
						     (exit 1) ;; IS THIS NECESSARY OR WISE???
						     )))
					     (set! kill-tries (+ 1 kill-tries))
					     (mutex-unlock! m)))
				       (if keep-going
					   (begin
					     (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses