Megatest

Check-in [71bd40f627]
Login
Overview
Comment:Added server killing, cleaning out junk records.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60 | v1.6001_beta2
Files: files | file ages | folders
SHA1: 71bd40f62724cbb700452132ad1b96fec109f393
User & Date: mrwellan on 2014-09-11 11:44:36
Other Links: branch diff | manifest | tags
Context
2014-09-11
13:03
Experimental tweaks to address stuck server start issue check-in: 7c83ed2d8b user: mrwellan tags: v1.60
11:44
Added server killing, cleaning out junk records. check-in: 71bd40f627 user: mrwellan tags: v1.60, v1.6001_beta2
10:13
Added -import-megatest.db to help. check-in: b931cba810 user: mrwellan tags: v1.60, v1.6001_beta
Changes

Modified db.scm from [f6b0a09960] to [922a7b73d4].

764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
;; 1. Look at test records either deleted or part of deleted run:
;;    a. If test dir exists, set the test to state='UNKNOWN', set the run to 'unknown'
;;    b. If test dir gone, delete the test record
;; 2. Look at run records
;;    a. If have tests that are not deleted, set state='unknown'
;;    b. ....
;;
(define (db:clean-up dbstruct)

  (debug:print 0 "ERROR: db clean up not ported yet")

  (let* ((db         (db:get-db dbstruct #f))
	 (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests)+(SELECT count(id) FROM runs);"))
	(statements
	 (map (lambda (stmt)
		(sqlite3:prepare db stmt))
	      (list
	       ;; delete all tests that belong to runs that are 'deleted'
	       "DELETE FROM tests WHERE run_id in (SELECT id FROM runs WHERE state='deleted');"







|
|
<
<
|







764
765
766
767
768
769
770
771
772


773
774
775
776
777
778
779
780
;; 1. Look at test records either deleted or part of deleted run:
;;    a. If test dir exists, set the test to state='UNKNOWN', set the run to 'unknown'
;;    b. If test dir gone, delete the test record
;; 2. Look at run records
;;    a. If have tests that are not deleted, set state='unknown'
;;    b. ....
;;
(define (db:clean-up db)
  (debug:print 0 "WARNING: db clean up not ported to v1.60, cleanup action will be on megatest.db")


  (let* (;; (db         (db:get-db dbstruct #f))
	 (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests)+(SELECT count(id) FROM runs);"))
	(statements
	 (map (lambda (stmt)
		(sqlite3:prepare db stmt))
	      (list
	       ;; delete all tests that belong to runs that are 'deleted'
	       "DELETE FROM tests WHERE run_id in (SELECT id FROM runs WHERE state='deleted');"
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
			     count-stmt)
       (map sqlite3:execute statements)
       (sqlite3:for-each-row (lambda (tot)
			       (debug:print-info 0 "Records count after  clean: " tot))
			     count-stmt)))
    (map sqlite3:finalize! statements)
    (sqlite3:finalize! count-stmt)
    (db:find-and-mark-incomplete db)
    (sqlite3:execute db "VACUUM;")))

;;======================================================================
;; M E T A   G E T   A N D   S E T   V A R S
;;======================================================================

;; returns number if string->number is successful, string otherwise







|







795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
			     count-stmt)
       (map sqlite3:execute statements)
       (sqlite3:for-each-row (lambda (tot)
			       (debug:print-info 0 "Records count after  clean: " tot))
			     count-stmt)))
    (map sqlite3:finalize! statements)
    (sqlite3:finalize! count-stmt)
    ;; (db:find-and-mark-incomplete db)
    (sqlite3:execute db "VACUUM;")))

;;======================================================================
;; M E T A   G E T   A N D   S E T   V A R S
;;======================================================================

;; returns number if string->number is successful, string otherwise
1632
1633
1634
1635
1636
1637
1638



1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
;; Insert (or replace) a batch of test records in the db selected by
;; dbstruct/run-id.
;;
;;   dbstruct - database handle structure handed to db:with-db
;;   run-id   - run whose db receives the records; also used in the log message
;;   testrecs - list of vectors; each vector's elements are bound positionally
;;              to the "?" placeholders of the INSERT statement
;;              (assumes each vector has one element per entry of
;;              db:test-record-fields -- TODO confirm against callers)
;;
;; NOTE(review): no explicit transaction is opened in this block, so each
;; execute presumably commits on its own unless db:with-db wraps one -- verify.
;;
(define (db:replace-test-records dbstruct run-id testrecs)
  (db:with-db dbstruct run-id #t 
	      (lambda (db)
		;; one "?" placeholder per entry in db:test-record-fields
		(let* ((qmarks (string-intersperse (make-list (length db:test-record-fields) "?") ","))
		       (qrystr (conc "INSERT OR REPLACE INTO tests (" db:test-record-qry-selector ") VALUES (" qmarks ");"))
		       ;; prepared once, re-executed for every record
		       (qry    (sqlite3:prepare db qrystr)))
		  (debug:print 0 "INFO: migrating test records for run with id " run-id)



		  (for-each 
		   (lambda (rec)
		     ;; (debug:print 0 "INFO: Inserting values: " (string-intersperse (map conc (vector->list rec)) ",") "\n")
		     (apply sqlite3:execute qry (vector->list rec)))
		   testrecs)
		  ;; release the prepared statement once all records are written
		  (sqlite3:finalize! qry)))))

;; map a test-id into the proper range
;;
(define (db:adj-test-id mtdb min-test-id test-id)
  (if (>= test-id min-test-id)
      test-id







>
>
>
|
|
|
|
|







1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
;; Insert (or replace) a batch of test records in the db selected by
;; dbstruct/run-id, writing the whole batch inside a single transaction.
;;
;;   dbstruct - database handle structure handed to db:with-db
;;   run-id   - run whose db receives the records; also used in the log message
;;   testrecs - list of vectors; each vector's elements are bound positionally
;;              to the "?" placeholders of the INSERT statement
;;              (assumes each vector has one element per entry of
;;              db:test-record-fields -- TODO confirm against callers)
;;
(define (db:replace-test-records dbstruct run-id testrecs)
  (db:with-db dbstruct run-id #t 
	      (lambda (db)
		;; one "?" placeholder per entry in db:test-record-fields
		(let* ((qmarks (string-intersperse (make-list (length db:test-record-fields) "?") ","))
		       (qrystr (conc "INSERT OR REPLACE INTO tests (" db:test-record-qry-selector ") VALUES (" qmarks ");"))
		       ;; prepared once, re-executed for every record
		       (qry    (sqlite3:prepare db qrystr)))
		  (debug:print 0 "INFO: migrating test records for run with id " run-id)
		  (sqlite3:with-transaction
		   db
		   (lambda ()
		     (for-each 
		      (lambda (rec)
			;; (debug:print 0 "INFO: Inserting values: " (string-intersperse (map conc (vector->list rec)) ",") "\n")
			(apply sqlite3:execute qry (vector->list rec)))
		      testrecs)
		     ;; with-transaction commits only when the thunk returns a true
		     ;; value; for-each's result is unspecified, so say so explicitly
		     #t))
		  ;; release the prepared statement once the batch is written
		  (sqlite3:finalize! qry)))))

;; map a test-id into the proper range
;;
(define (db:adj-test-id mtdb min-test-id test-id)
  (if (>= test-id min-test-id)
      test-id

Modified megatest.scm from [04d9533be8] to [ae9ba80a5d].

1291
1292
1293
1294
1295
1296
1297
1298














1299
1300
1301
1302
1303
1304
1305
;; ;; ;; redo me        (list "uname" "rundir" "final_logf" "comment"))
;; ;; ;; redo me       (set! *didsomething* #t)))

(if (args:get-arg "-import-megatest.db")
    (let* ((toppath  (launch:setup-for-run))
	   (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f))
	   (mtdb     (if toppath (db:open-megatest-db)))
	   (run-ids  (if toppath (db:get-all-run-ids mtdb))))















      ;; adjust test-ids to fit into proper range
      ;;
      (db:prep-megatest.db-for-migration mtdb)

      ;; sync runs, test_meta etc.
      ;;







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
;; ;; ;; redo me        (list "uname" "rundir" "final_logf" "comment"))
;; ;; ;; redo me       (set! *didsomething* #t)))

(if (args:get-arg "-import-megatest.db")
    (let* ((toppath  (launch:setup-for-run))
	   (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f))
	   (mtdb     (if toppath (db:open-megatest-db)))
	   (run-ids  (if toppath (db:get-all-run-ids mtdb)))
	   (mdb     (tasks:open-db))
	   (servers (tasks:get-all-servers mdb)))
      
      ;; kill servers
      (for-each
       (lambda (server)
	 (tasks:server-delete-record mdb (vector-ref server 0) "dbmigration")
	 (tasks:kill-server (vector-ref server 2)(vector-ref server 1)))
       servers)
      (sqlite3:finalize! mdb)

      ;; clear out junk records
      ;;
      (db:clean-up mtdb)

      ;; adjust test-ids to fit into proper range
      ;;
      (db:prep-megatest.db-for-migration mtdb)

      ;; sync runs, test_meta etc.
      ;;

Modified tasks.scm from [929251744a] to [393b62edba].

293
294
295
296
297
298
299
300
301
302
303
304
305


306
307
308
309

310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))

;; Return a list of all server records whose state does not start with
;; "defunct". Each record is a 13-element vector (indices listed below).
;;
;; Rows are consed onto the result as they arrive, so the returned list is in
;; the reverse of the query's "ORDER BY start_time DESC" order.
;; last-update is computed by the query as strftime('%s','now') - heartbeat.
;;
(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;; vector indices: 0=id 1=pid 2=hostname 3=interface 4=port 5=pubport 6=start-time
       ;;                 7=priority 8=state 9=mt-version 10=last-update 11=transport 12=run-id
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res)))
     mdb
     "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id FROM servers WHERE state NOT LIKE 'defunct%' ORDER BY start_time DESC;")
    res))



;; Deregister a server record and then try to stop the server process.
;;
;;   status   - true means the server is believed alive
;;   hostname - host the server runs on
;;   port     - server port; when false the record is deregistered by pid instead
;;   pid      - process id of the server
;;
;; NOTE(review): run-id, used in the remote-host branch, is neither a parameter
;; nor a local of this procedure -- presumably it would have to be a global;
;; verify, otherwise that branch fails when reached.
;; NOTE(review): the inner (if status ...) in the final branch sits in the else
;; arm of the outer (if status ...), so its body cannot run as written.
;;
(define (tasks:kill-server status hostname port pid)
  (debug:print-info 1 "Removing defunct server record for " hostname ":" port)
  ;; deregister by port when one is known, otherwise by pid
  (if port
      (open-run-close tasks:server-deregister tasks:open-db hostname port: port)

      (open-run-close tasks:server-deregister tasks:open-db hostname pid:  pid))
  (if status ;; #t means alive
      (begin
	(if (equal? hostname (get-host-name))
	    ;; any exception raised while signalling is reported, not propagated
	    (handle-exceptions
	     exn
	     (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n"
			       "  EXCEPTION: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 1 "Sending signal/term to " pid " on " hostname)
	     (process-signal pid signal/term)
	     (thread-sleep! 5) ;; wait five seconds; the follow-up signal/kill below is commented out
	     ;;(process-signal pid signal/kill)
	     ) ;; local machine, send sig term
	    (begin
	      ;;(debug:print-info 1 "Stopping remote servers not yet supported."))))
	      (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide")
	      (let ((serverdat (list hostname port)))
		;; NOTE(review): run-id is not bound in this procedure (see header note)
		(hash-table-set! *runremote* run-id (http-transport:client-connect hostname port))
	      	(cdb:kill-server serverdat pid)))))    ;; remote machine, try telling server to commit suicide
      (begin
	;; NOTE(review): status is false on this path, so the test below never succeeds
	(if status 
	    (if (equal? hostname (get-host-name))
		(begin
		  (debug:print-info 1 "Sending signal/term to " pid " on " hostname)
		  (process-signal pid signal/term)  ;; local machine, send sig term
		  (thread-sleep! 5)                 ;; give it five seconds to die peacefully then do a brutal kill
		  (process-signal pid signal/kill)) 
		(debug:print 0 "WARNING: Can't kill frozen server on remote host " hostname))))))


;;======================================================================
;; Tasks and Task monitors
;;======================================================================









|





>
>
|
|
<
|
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|







293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309

310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))

;; Fetch every server record whose state does not begin with "defunct".
;;
;; Each row is returned as a vector laid out in SELECT order:
;;   0=id 1=pid 2=hostname 3=interface 4=port 5=pubport 6=start_time
;;   7=priority 8=state 9=mt_version 10=last_update 11=transport 12=run_id
;; where last_update is strftime('%s','now') - heartbeat.
;;
;; Rows are pushed onto the front of the accumulator, so the result list is
;; the reverse of the query's "ORDER BY start_time DESC" ordering.
;;
(define (tasks:get-all-servers mdb)
  (let ((servers '()))
    (sqlite3:for-each-row
     ;; the query always yields the 13 columns listed above
     (lambda columns
       (set! servers (cons (list->vector columns) servers)))
     mdb
     "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id FROM servers WHERE state NOT LIKE 'defunct%' ORDER BY start_time DESC;")
    servers))

;; no elegance here ...
;;
;; Try to kill a server process by running "nbfake kill <pid>" with the
;; TARGETHOST environment variable set to hostname.
;;
;;   hostname - host the server process runs on; must be a string
;;   pid      - process id to kill; a positive integer or a numeric string
;;
;; Returns the result of `system` for the kill command, or #f when the request
;; is skipped because hostname or pid is unusable.
;;
;; The pid is validated before it reaches the shell: 0 or a negative value
;; would make kill signal a whole process group, and a missing pid (#f) would
;; otherwise be pasted into the command line as "#f".
;;
;; Note: TARGETHOST stays set in this process's environment after the call.
;;
(define (tasks:kill-server hostname pid)
  (let ((pid-num (cond
                  ((number? pid) pid)
                  ((string? pid) (string->number pid))
                  (else #f))))
    (if (and (string? hostname)
             pid-num
             (integer? pid-num)
             (> pid-num 0))
        (begin
          (debug:print-info 0 "Attempting to kill server process " pid " on host " hostname)
          (setenv "TARGETHOST" hostname)
          (system (conc "nbfake kill " (inexact->exact pid-num))))
        (begin
          (debug:print 0 "WARNING: Not killing server, unusable hostname/pid: " hostname "/" pid)
          #f))))
 
;;   (if status ;; #t means alive
;;       (begin
;; 	(if (equal? hostname (get-host-name))
;; 	    (handle-exceptions
;; 	     exn
;; 	     (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n"
;; 			       "  EXCEPTION: " ((condition-property-accessor 'exn 'message) exn))
;; 	     (debug:print 1 "Sending signal/term to " pid " on " hostname)
;; 	     (process-signal pid signal/term)
;; 	     (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill
;; 	     ;;(process-signal pid signal/kill)
;; 	     ) ;; local machine, send sig term
;; 	    (begin
;; 	      ;;(debug:print-info 1 "Stopping remote servers not yet supported."))))
;; 	      (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide")
;; 	      (let ((serverdat (list hostname port)))
;; 		(hash-table-set! *runremote* run-id (http-transport:client-connect hostname port))
;; 	      	(cdb:kill-server serverdat pid)))))    ;; remote machine, try telling server to commit suicide
;;       (begin
;; 	(if status 
;; 	    (if (equal? hostname (get-host-name))
;; 		(begin
;; 		  (debug:print-info 1 "Sending signal/term to " pid " on " hostname)
;; 		  (process-signal pid signal/term)  ;; local machine, send sig term
;; 		  (thread-sleep! 5)                 ;; give it five seconds to die peacefully then do a brutal kill
;; 		  (process-signal pid signal/kill)) 
;; 		(debug:print 0 "WARNING: Can't kill frozen server on remote host " hostname))))))


;;======================================================================
;; Tasks and Task monitors
;;======================================================================