Megatest

Check-in [1cf9221da5]
Login
Overview
Comment:filled out more exception handlers. ==/3.73/1.3/PASS/1203/orion/==
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: 1cf9221da58afe7bffd32e504f08d69ffb82485d
User & Date: mrwellan on 2020-08-24 06:54:26
Original Comment: filled out more exception handlers.
Other Links: branch diff | manifest | tags
Context
2020-08-24
18:06
Safety fixes and minor cleanup ==/3.73/1.3/PASS/1203/orion/== check-in: 79674abc64 user: mrwellan tags: v1.65-cleanup
06:54
filled out more exception handlers. ==/3.73/1.3/PASS/1203/orion/== check-in: 1cf9221da5 user: mrwellan tags: v1.65-cleanup
00:23
Added more granular exception handling ==/9.4/0.9/WARN/2102/mars/== ==/5.7/1.2/WARN/1201/mars/== check-in: 0f33ea4295 user: mrwellan tags: v1.65-cleanup
Changes

Modified common_records.scm from [72d272b34e] to [f00d4d5706].

37
38
39
40
41
42
43

44
45
46
47
48
49
50
    ((_ (name arg ...) body ...)
     (define-syntax name (syntax-rules () ((name arg ...) (begin body ...)))))))

;; (define-syntax common:handle-exceptions
;;   (syntax-rules ()
;;     ((_ exn-in errstmt ...)(handle-exceptions exn-in errstmt ...))))


;; Run the body forms either bare or wrapped in handle-exceptions.
;;
;;   debug-flag   - when true the body is evaluated with no handler installed,
;;                  so any exception propagates to the caller
;;   exn-var      - identifier bound to the condition inside handler-expr
;;   handler-expr - expression evaluated (and returned) if the body raises
;;   form ...     - the body to evaluate
;;
(define-syntax common:debug-handle-exceptions
  (syntax-rules ()
    ((_ debug-flag exn-var handler-expr form ...)
     (cond
      (debug-flag (begin form ...))
      (else       (handle-exceptions exn-var handler-expr form ...))))))








>







37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
    ((_ (name arg ...) body ...)
     (define-syntax name (syntax-rules () ((name arg ...) (begin body ...)))))))

;; (define-syntax common:handle-exceptions
;;   (syntax-rules ()
;;     ((_ exn-in errstmt ...)(handle-exceptions exn-in errstmt ...))))

;; this works, why didn't I use it more?
;; Run the body forms either bare or wrapped in handle-exceptions.
;;
;;   debug-flag   - when true the body is evaluated with no handler installed,
;;                  so any exception propagates to the caller
;;   exn-var      - identifier bound to the condition inside handler-expr
;;   handler-expr - expression evaluated (and returned) if the body raises
;;   form ...     - the body to evaluate
;;
(define-syntax common:debug-handle-exceptions
  (syntax-rules ()
    ((_ debug-flag exn-var handler-expr form ...)
     (cond
      (debug-flag (begin form ...))
      (else       (handle-exceptions exn-var handler-expr form ...))))))

Modified megatest.scm from [71d98ab132] to [c469764f4f].

496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
;; Thread object that runs (common:watchdog). The thread is only created
;; here, not started. If the watchdog raises, the call chain and the
;; exception message are printed and the thunk returns.
(define *watchdog*
  (let ((watchdog-thunk
         (lambda ()
           (handle-exceptions
               exn
               (let ((exn-message (condition-property-accessor 'exn 'message)))
                 (print-call-chain)
                 (print " message: " (exn-message exn)))
             (common:watchdog)))))
    (make-thread watchdog-thunk "Watchdog thread")))

;;(if (not (args:get-arg "-server"))
;;    (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
(let* ((no-watchdog-args
       '("-list-runs"







|







496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
;; Thread object that runs (common:watchdog). The thread is only created
;; here, not started. If the watchdog raises, the call chain, the exception
;; message and the condition object itself are printed and the thunk returns.
(define *watchdog*
  (let ((watchdog-thunk
         (lambda ()
           (handle-exceptions
               exn
               (let ((exn-message (condition-property-accessor 'exn 'message)))
                 (print-call-chain)
                 (print " message: " (exn-message exn) ", exn=" exn))
             (common:watchdog)))))
    (make-thread watchdog-thunk "Watchdog thread")))

;;(if (not (args:get-arg "-server"))
;;    (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
(let* ((no-watchdog-args
       '("-list-runs"
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
;; manually started and thus should never be started in a non-megatest area. Thus no need to handle situation
;; where (launch:setup) returns #f?
;;
(if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server
    (handle-exceptions
	exn
	(begin
	  (print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn))
	  )
      (let* ((tl   (or (args:get-arg "-log")(launch:setup)))   ;; run launch:setup if -server, ensure we do NOT run launch:setup if -log specified
	     (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name
		       (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log")))
	     (oup  (open-logfile logf)))
	(if (not (args:get-arg "-log"))
	    (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log







|







550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
;; manually started and thus should never be started in a non-megatest area. Thus no need to handle situation
;; where (launch:setup) returns #f?
;;
(if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server
    (handle-exceptions
	exn
	(begin
	  (print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
	  )
      (let* ((tl   (or (args:get-arg "-log")(launch:setup)))   ;; run launch:setup if -server, ensure we do NOT run launch:setup if -log specified
	     (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name
		       (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log")))
	     (oup  (open-logfile logf)))
	(if (not (args:get-arg "-log"))
	    (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log
600
601
602
603
604
605
606
607


608
609
610
611
612
613
614
615
    (let ((original-exit (exit-handler)))
      (exit-handler (lambda (#!optional (exit-code 0))
		      (printf "Preparing to exit with exit code ~A ...\n" exit-code)
		      (for-each
		       
		       (lambda (pid)
			 (handle-exceptions
			  exn


			  #t
			  (let-values (((pid-val exit-status exit-code) (process-wait pid #t)))
				      (if (or (eq? pid-val pid)
					      (eq? pid-val 0))
					  (begin
					    (printf "Sending signal/term to ~A\n" pid)
					    (process-signal pid signal/term))))))
		       (process:children #f))







|
>
>
|







600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
    (let ((original-exit (exit-handler)))
      (exit-handler (lambda (#!optional (exit-code 0))
		      (printf "Preparing to exit with exit code ~A ...\n" exit-code)
		      (for-each
		       
		       (lambda (pid)
			 (handle-exceptions
			     exn
			   (begin
			     (printf "process reap failed. exn=~A\n" exn)
			     #t)
			  (let-values (((pid-val exit-status exit-code) (process-wait pid #t)))
				      (if (or (eq? pid-val pid)
					      (eq? pid-val 0))
					  (begin
					    (printf "Sending signal/term to ~A\n" pid)
					    (process-signal pid signal/term))))))
		       (process:children #f))

Modified mt.scm from [6bcd02e8ac] to [e9055c2687].

188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
		   (state         (if newstate  newstate  (db:test-get-state  test-dat)))
		   (status        (if newstatus newstatus (db:test-get-status test-dat))))
	      ;; (mutex-lock! *triggers-mutex*)
              (handle-exceptions
               exn
               (begin
                 (debug:print-error 0 *default-log-port* " Exception in mt:process-triggers for run-id="run-id" test-id="test-id" newstate="newstate" newstatus="newstatus
                                    "\n   error: " ((condition-property-accessor 'exn 'message) exn)
                                    "\n   test-rundir="test-rundir
                                    "\n   test-name="test-name
                                    "\n   item-path="item-path
                                    "\n   state="state
                                    "\n   status="status
                                    "\n")
                 (print-call-chain (current-error-port))







|







188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
		   (state         (if newstate  newstate  (db:test-get-state  test-dat)))
		   (status        (if newstatus newstatus (db:test-get-status test-dat))))
	      ;; (mutex-lock! *triggers-mutex*)
              (handle-exceptions
               exn
               (begin
                 (debug:print-error 0 *default-log-port* " Exception in mt:process-triggers for run-id="run-id" test-id="test-id" newstate="newstate" newstatus="newstatus
                                    "\n   error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn
                                    "\n   test-rundir="test-rundir
                                    "\n   test-name="test-name
                                    "\n   item-path="item-path
                                    "\n   state="state
                                    "\n   status="status
                                    "\n")
                 (print-call-chain (current-error-port))

Modified process.scm from [b12ad1bbce] to [f9dfbe5500].

194
195
196
197
198
199
200
201


202
203
204
205
206
207
208
209
   (let-values (((rpid exit-type exit-signal)(process-wait pid #t)))
       (and (number? rpid)
	    (equal? rpid pid)))))

(define (process:alive-on-host? host pid)
  (let ((cmd (conc "ssh " host " ps -o pid= -p " pid)))
    (handle-exceptions
     exn


     #f ;; anything goes wrong - assume the process in NOT running.
     (with-input-from-pipe 
      cmd
      (lambda ()
	(let loop ((inl (read-line)))
	  (if (eof-object? inl)
	      #f
	      (let* ((clean-str (string-substitute "^[^\\d]*([0-9]+)[^\\d]*$" "\\1" inl))







|
>
>
|







194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
   (let-values (((rpid exit-type exit-signal)(process-wait pid #t)))
       (and (number? rpid)
	    (equal? rpid pid)))))

(define (process:alive-on-host? host pid)
  (let ((cmd (conc "ssh " host " ps -o pid= -p " pid)))
    (handle-exceptions
	exn
      (begin
	(debug:print 0 *default-log-port* "failed to identify if process " pid ", on host " host " is alive. exn=" exn)
	#f) ;; anything goes wrong - assume the process in NOT running.
     (with-input-from-pipe 
      cmd
      (lambda ()
	(let loop ((inl (read-line)))
	  (if (eof-object? inl)
	      #f
	      (let* ((clean-str (string-substitute "^[^\\d]*([0-9]+)[^\\d]*$" "\\1" inl))

Modified rmt.scm from [f016ee8609] to [f699e4c73f].

413
414
415
416
417
418
419
420


421
422
423
424
425
426
427
428
429
/		(set! *db-last-access* start-time)  ;; THIS IS PROBABLY USELESS? (we are on a client)
                (mutex-unlock! *db-multi-sync-mutex*)))))
    res))

;; Send cmd/params over the http transport using the supplied connection-info.
;; A false run-id is sent as 0.
;;
;; Returns slot 1 of the reply vector when a reply arrived and its slot 0 is
;; true; returns #f otherwise, including when the transport call raised.
;; Returning slot 1 (not the whole vector) is intentional - rmt:send-receive
;; is meant to be kept in step with this.
;;
(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
  (let* ((effective-run-id (or run-id 0))
         (reply            (handle-exceptions
                               exn
                               #f ;; any transport failure is treated as "no reply"
                             (http-transport:client-api-send-receive effective-run-id connection-info cmd params))))
    (and reply
         (vector-ref reply 0)
         (vector-ref reply 1))))

;; ;; Wrap json library for strings (why the ports crap in the first place?)
;; (define (rmt:dat->json-str dat)
;;   (with-output-to-string 







|
>
>
|
|







413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
/		(set! *db-last-access* start-time)  ;; THIS IS PROBABLY USELESS? (we are on a client)
                (mutex-unlock! *db-multi-sync-mutex*)))))
    res))

;; Send cmd/params over the http transport using the supplied connection-info.
;; A false run-id is sent as 0.
;;
;; Returns slot 1 of the reply vector when a reply arrived and its slot 0 is
;; true; returns #f otherwise, including when the transport call raised (the
;; condition is printed in that case).
;; Returning slot 1 (not the whole vector) is intentional - rmt:send-receive
;; is meant to be kept in step with this.
;;
(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
  (let* ((effective-run-id (or run-id 0))
         (reply            (handle-exceptions
                               exn
                               (begin
                                 (print "transport failed. exn=" exn)
                                 #f) ;; any transport failure is treated as "no reply"
                             (http-transport:client-api-send-receive effective-run-id connection-info cmd params))))
    (and reply
         (vector-ref reply 0)
         (vector-ref reply 1))))

;; ;; Wrap json library for strings (why the ports crap in the first place?)
;; (define (rmt:dat->json-str dat)
;;   (with-output-to-string 

Modified server.scm from [3e4e26900f] to [bafb54f0c9].

166
167
168
169
170
171
172


173
174
175
176
177
178
179
180

;; given a path to a server log return: host port startseconds
;;
(define (server:logf-get-start-info logf)
  (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs
    (handle-exceptions
	exn


	(list #f #f #f) ;; no idea what went wrong, call it a bad server
      (with-input-from-file
	  logf
	(lambda ()
	  (let loop ((inl  (read-line))
		     (lnum 0))
	    (if (not (eof-object? inl))
		(let ((mlst (string-match rx inl)))







>
>
|







166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182

;; given a path to a server log return: host port startseconds
;;
(define (server:logf-get-start-info logf)
  (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs
    (handle-exceptions
	exn
      (begin
	(print "failed to get server info from " logf ", exn=" exn)
	(list #f #f #f)) ;; no idea what went wrong, call it a bad server
      (with-input-from-file
	  logf
	(lambda ()
	  (let loop ((inl  (read-line))
		     (lnum 0))
	    (if (not (eof-object? inl))
		(let ((mlst (string-match rx inl)))
212
213
214
215
216
217
218


219
220
221
222
223
224
225
226
	  (if (null? server-logs)
	      '()
	      (let loop ((hed  (car server-logs))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				      exn


				      (current-seconds) ;; 0
				    (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
		       (down-time (- (current-seconds) mod-time))
		       (serv-dat  (if (or (< num-serv-logs 10)
				  	  (< down-time 900)) ;; day-seconds))
				      (server:logf-get-start-info hed)
				      '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
		       (serv-rec (cons mod-time serv-dat))







>
>
|







214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
	  (if (null? server-logs)
	      '()
	      (let loop ((hed  (car server-logs))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				      exn
				    (begin
				      (print "failed to get modification time on " hed ", exn=" exn)
				      (current-seconds)) ;; 0
				    (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
		       (down-time (- (current-seconds) mod-time))
		       (serv-dat  (if (or (< num-serv-logs 10)
				  	  (< down-time 900)) ;; day-seconds))
				      (server:logf-get-start-info hed)
				      '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
		       (serv-rec (cons mod-time serv-dat))

Modified tasks.scm from [dcc03dba77] to [b621e9649f].

71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
;; Return the directory used for the task db, creating it if needed.
;;
;; The directory is the first of these that yields a value:
;;   1. [setup] monitordir from *configdat*
;;   2. [setup] dbdir from *configdat*
;;   3. <linktree>/.db
;;
;; If the directory check or creation raises, an error is logged and the
;; process exits with status 1.
;;
(define (tasks:get-task-db-path)
  (let ((task-db-dir (or (configf:lookup *configdat* "setup" "monitordir")
                         (configf:lookup *configdat* "setup" "dbdir")
                         (conc (common:get-linktree) "/.db"))))
    (handle-exceptions
        exn
        (begin
          (debug:print-error 0 *default-log-port* "Couldn't create path to " task-db-dir)
          (exit 1))
      (unless (directory? task-db-dir)
        (create-directory task-db-dir #t)))
    task-db-dir))

;; If file exists AND
;;    file readable
;;         ==> open it







|







71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
;; Return the directory used for the task db, creating it if needed.
;;
;; The directory is the first of these that yields a value:
;;   1. [setup] monitordir from *configdat*
;;   2. [setup] dbdir from *configdat*
;;   3. <linktree>/.db
;;
;; If the directory check or creation raises, an error (including the
;; condition object) is logged and the process exits with status 1.
;;
(define (tasks:get-task-db-path)
  (let ((task-db-dir (or (configf:lookup *configdat* "setup" "monitordir")
                         (configf:lookup *configdat* "setup" "dbdir")
                         (conc (common:get-linktree) "/.db"))))
    (handle-exceptions
        exn
        (begin
          (debug:print-error 0 *default-log-port* "Couldn't create path to " task-db-dir ", exn=" exn)
          (exit 1))
      (unless (directory? task-db-dir)
        (create-directory task-db-dir #t)))
    task-db-dir))

;; If file exists AND
;;    file readable
;;         ==> open it

Modified tests.scm from [947715acf0] to [0094b671e6].

550
551
552
553
554
555
556


557
558
559
560
561
562
563
564
		  (change-directory orig-dir)
		  ;; NB// tests:test-set-toplog! is remote internal...
		  (tests:test-set-toplog! run-id test-name outputfilename))
		;; didn't get the lock, check to see if current update started later than this 
		;; update, if so we can exit without doing any work
		(if (> my-start-time (handle-exceptions
					 exn


					 0
				       (file-modification-time lockf)))
		    ;; we started since current re-gen in flight, delay a little and try again
		    (begin
		      (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it")
		      (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds
		      (loop (common:simple-file-lock lockf))))))))))








>
>
|







550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
		  (change-directory orig-dir)
		  ;; NB// tests:test-set-toplog! is remote internal...
		  (tests:test-set-toplog! run-id test-name outputfilename))
		;; didn't get the lock, check to see if current update started later than this 
		;; update, if so we can exit without doing any work
		(if (> my-start-time (handle-exceptions
					 exn
				       (begin
					 (print "failed to get mod time on " lockf ", exn=" exn)
					 0)
				       (file-modification-time lockf)))
		    ;; we started since current re-gen in flight, delay a little and try again
		    (begin
		      (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it")
		      (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds
		      (loop (common:simple-file-lock lockf))))))))))

1502
1503
1504
1505
1506
1507
1508


1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
    (if fnamepatt
	(apply append 
	       (map (lambda (p)
		      (if (directory-exists? p)
			  (let ((glob-query (conc p "/" fnamepatt)))
			    (handle-exceptions
				exn


				(with-input-from-pipe
				    (conc "echo " glob-query)
				  read-lines)  ;; we aren't going to try too hard. If glob breaks it is likely because someone tried to do */*/*.log or similar
			      (glob glob-query)))
			  '()))
		    paths-from-db))
	paths-from-db)))

			      
;;======================================================================







>
>

|
|







1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
    (if fnamepatt
	(apply append 
	       (map (lambda (p)
		      (if (directory-exists? p)
			  (let ((glob-query (conc p "/" fnamepatt)))
			    (handle-exceptions
				exn
			      (begin
				(print "built-in glob on " glob-query ", failed, try using the shell. exn=" exn)
				(with-input-from-pipe
				 (conc "echo " glob-query)
				 read-lines))  ;; we aren't going to try too hard. If glob breaks it is likely because someone tried to do */*/*.log or similar
			      (glob glob-query)))
			  '()))
		    paths-from-db))
	paths-from-db)))

			      
;;======================================================================
1557
1558
1559
1560
1561
1562
1563
1564


1565
1566
1567
1568
1569
1570
1571
1572
1573
	 (cache-exists (and cache-file
			    (not force-create)  ;; if force-create then pretend there is no cache to read
			    (common:file-exists? cache-file)))
	 (cached-dat   (if (and (not force-create)
				cache-exists
				use-cache)
			   (handle-exceptions
			    exn


			    #f ;; any issues, just give up with the cached version and re-read
			    (configf:read-alist cache-file))
			   #f))
         (test-full-name (if (and item-path (not (string-null? item-path)))
                             (conc test-name "/" item-path)
                             test-name)))
    (if cached-dat
	cached-dat
	(let ((dat (hash-table-ref/default *testconfigs* test-full-name #f)))







|
>
>
|
|







1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
	 (cache-exists (and cache-file
			    (not force-create)  ;; if force-create then pretend there is no cache to read
			    (common:file-exists? cache-file)))
	 (cached-dat   (if (and (not force-create)
				cache-exists
				use-cache)
			   (handle-exceptions
			       exn
			     (begin
			       (debug:print 0 *default-log-port* "failed to read " cache-file ", exn=" exn)
			       #f) ;; any issues, just give up with the cached version and re-read
			     (configf:read-alist cache-file))
			   #f))
         (test-full-name (if (and item-path (not (string-null? item-path)))
                             (conc test-name "/" item-path)
                             test-name)))
    (if cached-dat
	cached-dat
	(let ((dat (hash-table-ref/default *testconfigs* test-full-name #f)))