Megatest

Diff
Login

Differences From Artifact [af38f0fd65]:

To Artifact [bc77148d79]:


24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
     matchable regex posix (srfi 18) extras ;; tcp 
     (prefix nanomsg nmsg:)
     (prefix sqlite3 sqlite3:)
     pkts (prefix dbi dbi:)
     )

(declare (unit common))
(declare (uses commonmod))
(import commonmod)

(include "common_records.scm")


;; (require-library margs)
;; (include "margs.scm")








|
|







24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
     matchable regex posix (srfi 18) extras ;; tcp 
     (prefix nanomsg nmsg:)
     (prefix sqlite3 sqlite3:)
     pkts (prefix dbi dbi:)
     )

(declare (unit common))
;; (declare (uses commonmod))
;; (import commonmod)

(include "common_records.scm")


;; (require-library margs)
;; (include "margs.scm")

47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

66

67
68
69
70
71
72
73
74
75
76
77
78
79

;; execute thunk, return value.  If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* .
;; arguments - thunk, message
(define (common:fail-safe thunk warning-message-on-exception)
  (handle-exceptions
   exn
   (begin
     (debug:print-info 0 *default-log-port* "notable but nonfatal condition - "warning-message-on-exception)
     (debug:print-info 0 *default-log-port*
                       (string-substitute "\n?Error:" "nonfatal condition:"
                                          (with-output-to-string
                                            (lambda ()
                                              (print-error-message exn) ))))
     (debug:print-info 0 *default-log-port* "    -- continuing after nonfatal condition...")
     #f)
   (thunk)))

(define getenv get-environment-variable)
(define (safe-setenv key val)

  (if (or (substring-index "!" key) (substring-index ":" key)) ;; variables containing : are for internal use and cannot be environment variables.

      (debug:print-error 4 *default-log-port* "skip setting internal use only variables containing \":\" or starting with \"!\"")
      (if (and (string? val)
	       (string? key))
	  (handle-exceptions
	      exn
	      (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)
	    (setenv key val))
	  (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val))))

(define home (getenv "HOME"))
(define user (getenv "USER"))









|











>
|
>





|







47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81

;; execute thunk, return value.  If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* .
;; arguments - thunk, message
(define (common:fail-safe thunk warning-message-on-exception)
  (handle-exceptions
   exn
   (begin
     (debug:print-info 0 *default-log-port* "notable but nonfatal condition - "warning-message-on-exception", exn=" exn)
     (debug:print-info 0 *default-log-port*
                       (string-substitute "\n?Error:" "nonfatal condition:"
                                          (with-output-to-string
                                            (lambda ()
                                              (print-error-message exn) ))))
     (debug:print-info 0 *default-log-port* "    -- continuing after nonfatal condition...")
     #f)
   (thunk)))

(define getenv get-environment-variable)
(define (safe-setenv key val)
  (if (or (substring-index "!" key)
	  (substring-index ":" key)  ;; variables containing : are for internal use and cannot be environment variables.
	  (substring-index "." key)) ;; periods are not allowed in environment variables
      (debug:print-error 4 *default-log-port* "skip setting internal use only variables containing \":\" or starting with \"!\"")
      (if (and (string? val)
	       (string? key))
	  (handle-exceptions
	      exn
	      (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val ", exn=" exn)
	    (setenv key val))
	  (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val))))

(define home (getenv "HOME"))
(define user (getenv "USER"))


224
225
226
227
228
229
230






















231
232
233
234
235
236
237
            (caddr argv))
           (else (car argv))))
         (fullpath (realpath this-script)))
    fullpath))
(define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
(define *common:this-exe-dir*      (pathname-directory *common:this-exe-fullpath*))
(define *common:this-exe-name*     (pathname-strip-directory *common:this-exe-fullpath*))























(define *common:logpro-exit-code->status-sym-alist*
  '( ( 0 . pass )
     ( 1 . fail )
     ( 2 . warn )
     ( 3 . check )
     ( 4 . waived )







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
            (caddr argv))
           (else (car argv))))
         (fullpath (realpath this-script)))
    fullpath))
(define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
(define *common:this-exe-dir*      (pathname-directory *common:this-exe-fullpath*))
(define *common:this-exe-name*     (pathname-strip-directory *common:this-exe-fullpath*))

;; when called from a wrapper I need sometimes to find the calling
;; wrapper, this is for dashboard to find the correct megatest.
;;
(define (common:find-local-megatest #!optional (progname "megatest"))
  (let ((res (filter file-exists?
		     (map (lambda (updir)
			    (let* ((lm  (car (argv)))
				   (dir (pathname-directory lm))
				   (exe (pathname-strip-directory lm)))
			      (conc (if dir (conc dir "/") "")
				    (case (string->symbol exe)
				      ((dboard)    (conc updir progname))
				      ((mtest)     (conc updir progname))
				      ((dashboard) progname)
				      (else exe)))))
			  '("../../" "../")))))
    (if (null? res)
	(begin
	  (debug:print 0 *current-log-port* "Failed to find this executable! Using what can be found on the path")
	  progname)
	(car res))))

(define *common:logpro-exit-code->status-sym-alist*
  '( ( 0 . pass )
     ( 1 . fail )
     ( 2 . warn )
     ( 3 . check )
     ( 4 . waived )
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    (directory-fold 
     (lambda (file rem)
       (handle-exceptions
	exn
	(begin
	  (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.")
	  (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	  ;; (print-call-chain (current-error-port)) ;; 
	  )
	(let* ((fullname  (conc "logs/" file))
	       (mod-time  (file-modification-time fullname))
	       (file-age  (- (current-seconds) mod-time))
	       (file-old  (> file-age (* 48 60 60)))







|







506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    (directory-fold 
     (lambda (file rem)
       (handle-exceptions
	exn
	(begin
	  (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn)
	  (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	  ;; (print-call-chain (current-error-port)) ;; 
	  )
	(let* ((fullname  (conc "logs/" file))
	       (mod-time  (file-modification-time fullname))
	       (file-age  (- (current-seconds) mod-time))
	       (file-old  (> file-age (* 48 60 60)))
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
	    (for-each
	     (lambda (file)
	       (let* ((fullname (conc "logs/" file)))
		 (if (directory? fullname)
		     (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
		     (handle-exceptions
		      exn
		      (debug:print-error 0 *default-log-port* "failed to remove " fullname)
		      (delete-file* fullname)))))
	     files)
	    (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))
  
;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
;; Do NOT check if not on homehost!
;;







|







568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
	    (for-each
	     (lambda (file)
	       (let* ((fullname (conc "logs/" file)))
		 (if (directory? fullname)
		     (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
		     (handle-exceptions
		      exn
		      (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
		      (delete-file* fullname)))))
	     files)
	    (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))
  
;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
;; Do NOT check if not on homehost!
;;
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
             ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t)
             ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only)
                   (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db
              (debug:print 0 *default-log-port* "   I see you are the owner of megatest.config, attempting to cleanup and reset to new version")
              (handle-exceptions
               exn
               (begin
                 (debug:print 0 *default-log-port* "Failed to switch versions.")
                 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
                 (print-call-chain (current-error-port))
                 (exit 1))
               (common:cleanup-db dbstruct)))
             ((not (common:file-exists? mtconf))
              (debug:print 0 *default-log-port* "   megatest.config does not exist in this area.  Cannot proceed with megatest version migration.")
              (exit 1))







|







595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
             ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t)
             ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only)
                   (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db
              (debug:print 0 *default-log-port* "   I see you are the owner of megatest.config, attempting to cleanup and reset to new version")
              (handle-exceptions
               exn
               (begin
                 (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn)
                 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
                 (print-call-chain (current-error-port))
                 (exit 1))
               (common:cleanup-db dbstruct)))
             ((not (common:file-exists? mtconf))
              (debug:print 0 *default-log-port* "   megatest.config does not exist in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696

(define (common:read-encoded-string instr)
  (handle-exceptions
   exn
   (handle-exceptions
    exn
    (begin
      (debug:print-error 0 *default-log-port* "received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn))
      (print-call-chain (current-error-port))
      #f)
    (read (open-input-string (base64:base64-decode instr))))
   (read (open-input-string (z3:decode-buffer (base64:base64-decode instr))))))

;; dot-locking egg seems not to work, using this for now
;; if lock is older than expire-time then remove it and try again







|







706
707
708
709
710
711
712
713
714
715
716
717
718
719
720

(define (common:read-encoded-string instr)
  (handle-exceptions
   exn
   (handle-exceptions
    exn
    (begin
      (debug:print-error 0 *default-log-port* "received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
      (print-call-chain (current-error-port))
      #f)
    (read (open-input-string (base64:base64-decode instr))))
   (read (open-input-string (z3:decode-buffer (base64:base64-decode instr))))))

;; dot-locking egg seems not to work, using this for now
;; if lock is older than expire-time then remove it and try again
882
883
884
885
886
887
888
889
890


891
892
893
894
895
896
897
898
  (let ((res (assoc key lst)))
    (if res (cadr res)(if (null? default) #f (car default)))))

(define (common:get-testsuite-name)
  (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description.
      (configf:lookup *configdat* "setup" "testsuite" )
      (getenv "MT_TESTSUITE_NAME")
      (if (string? *toppath* )
          (pathname-file *toppath*)


          #f))) ;; (pathname-file (current-directory)))))

;; safe getting of toppath
(define (common:get-toppath areapath)
  (or *toppath*
      (if areapath
	  (begin
	    (set! *toppath* areapath)







|
|
>
>
|







906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
  (let ((res (assoc key lst)))
    (if res (cadr res)(if (null? default) #f (car default)))))

(define (common:get-testsuite-name)
  (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description.
      (configf:lookup *configdat* "setup" "testsuite" )
      (getenv "MT_TESTSUITE_NAME")
      (pathname-file (or (if (string? *toppath* )
			     (pathname-file *toppath*)
			     #f)
			 (common:get-topath #f)))
      "please-set-setup-area-name")) ;; (pathname-file (current-directory)))))

;; safe getting of toppath
(define (common:get-toppath areapath)
  (or *toppath*
      (if areapath
	  (begin
	    (set! *toppath* areapath)
920
921
922
923
924
925
926
927
928

929
930
931




932
933

934
935
936
937
938
939
940
941
942
(define (common:get-db-tmp-area . junk)
  (if *db-cache-path*
      *db-cache-path*
      (if *toppath* ;; common:get-create-writeable-dir
	  (handle-exceptions
	      exn
	      (begin
		(debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path*)
		(exit 1))

	    (let ((dbpath (common:get-create-writeable-dir
			   (list (conc "/tmp/" (current-user-name)
				       "/megatest_localdb/"




				       (common:get-testsuite-name) "/"
				       (string-translate *toppath* "/" ".")))))) ;;  #t))))

	      (set! *db-cache-path* dbpath)
	      dbpath))
	  #f)))

(define (common:get-area-path-signature)
  (message-digest-string (md5-primitive) *toppath*))

(define (common:get-signature str)
  (message-digest-string (md5-primitive) str))







|

>
|
|
|
>
>
>
>
|
|
>
|
|







946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
(define (common:get-db-tmp-area . junk)
  (if *db-cache-path*
      *db-cache-path*
      (if *toppath* ;; common:get-create-writeable-dir
	  (handle-exceptions
	      exn
	      (begin
		(debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn)
		(exit 1))
	      (let* ((tsname (common:get-testsuite-name))
		     (dbpath (common:get-create-writeable-dir
			      (list (conc "/tmp/" (current-user-name)
					  "/megatest_localdb/"
					  tsname "/"
					  (string-translate *toppath* "/" "."))
				    (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name
					  "/megatest_localdb/"
					  tsname
					  (string-translate *toppath* "/" "."))
				    ))))
		(set! *db-cache-path* dbpath)
		dbpath))
	  #f)))

(define (common:get-area-path-signature)
  (message-digest-string (md5-primitive) *toppath*))

(define (common:get-signature str)
  (message-digest-string (md5-primitive) str))
1165
1166
1167
1168
1169
1170
1171
1172

1173
1174
1175
1176
1177
1178
1179
		 (tal (cdr dirs)))
	(let ((res (or (and (directory? hed)
			    (file-write-access? hed)
			    hed)
		       (handle-exceptions
			   exn
			   (begin
			     (debug:print-info 0 *default-log-port* "could not create " hed ", this might cause problems down the road.")

			     #f)
			(create-directory hed #t)))))
	  (if (and (string? res)
		   (directory? res))
	      res
	      (if (null? tal)
		  #f







|
>







1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
		 (tal (cdr dirs)))
	(let ((res (or (and (directory? hed)
			    (file-write-access? hed)
			    hed)
		       (handle-exceptions
			   exn
			   (begin
			     (debug:print-info 0 *default-log-port* "could not create " hed
					       ", this might cause problems down the road. exn=" exn)
			     #f)
			(create-directory hed #t)))))
	  (if (and (string? res)
		   (directory? res))
	      res
	      (if (null? tal)
		  #f
1309
1310
1311
1312
1313
1314
1315
1316


1317
1318
1319
1320
1321
1322
1323
1324

;; does the directory exist and do we have write access?
;;
;;    returns the directory or #f
;;
(define (common:directory-writable? path-string)
  (handle-exceptions
   exn


   #f
   (if (and (directory-exists? path-string)
            (file-write-access? path-string))
       path-string
       #f)))

(define (common:get-linktree)
  (or (getenv "MT_LINKTREE")







|
>
>
|







1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359

;; does the directory exist and do we have write access?
;;
;;    returns the directory or #f
;;
(define (common:directory-writable? path-string)
  (handle-exceptions
      exn
    (begin
      (debug:print 0 *default-log-port* "Failed to identify access to " path-string ", exn=" exn)
      #f)
   (if (and (directory-exists? path-string)
            (file-write-access? path-string))
       path-string
       #f)))

(define (common:get-linktree)
  (or (getenv "MT_LINKTREE")
1403
1404
1405
1406
1407
1408
1409
1410


1411
1412
1413
1414
1415


1416
1417
1418
1419
1420
1421
1422
	   ;; first look in config, then look in file .homehost, create it if not found
	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
			 (handle-exceptions
			     exn
			     (if (> trynum 0)
				 (let ((delay-time (* (- 5 trynum) 5)))
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " delay-time " seconds and trying again, message: "  ((condition-property-accessor 'exn 'message) exn))


				   (thread-sleep! delay-time)
				   (common:get-homehost trynum: (- trynum 1)))
				 (begin
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file after trying five times. Giving up and exiting, message: "  ((condition-property-accessor 'exn 'message) exn))


				   (exit 1)))
			   (let ((hhf (conc *toppath* "/.homehost")))
			     (if (common:file-exists? hhf)
				 (with-input-from-file hhf read-line)
				 (if (file-write-access? *toppath*)
				     (begin
				       (with-output-to-file hhf







|
>
>




|
>
>







1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
	   ;; first look in config, then look in file .homehost, create it if not found
	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
			 (handle-exceptions
			     exn
			     (if (> trynum 0)
				 (let ((delay-time (* (- 5 trynum) 5)))
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying "
						delay-time " seconds and trying again, message: "  ((condition-property-accessor 'exn 'message) exn)
						", exn=" exn)
				   (thread-sleep! delay-time)
				   (common:get-homehost trynum: (- trynum 1)))
				 (begin
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)
						"] Failed to read .homehost file after trying five times. Giving up and exiting, message: "
						((condition-property-accessor 'exn 'message) exn))
				   (exit 1)))
			   (let ((hhf (conc *toppath* "/.homehost")))
			     (if (common:file-exists? hhf)
				 (with-input-from-file hhf read-line)
				 (if (file-write-access? *toppath*)
				     (begin
				       (with-output-to-file hhf
1663
1664
1665
1666
1667
1668
1669


1670

1671

1672
1673
1674
1675
1676


1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
;;======================================================================

;; lazy-safe get file mod time. on any error (file not existing etc.) return 0
;;
(define (common:lazy-modification-time fpath)
  (handle-exceptions
      exn


      0

    (file-modification-time fpath)))


;; find timestamp of newest file associated with a sqlite db file
(define (common:lazy-sqlite-db-modification-time fpath)
  (let* ((glob-list (handle-exceptions
			exn


			`(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn)))
		      (glob (conc fpath "*"))))
         (file-list (if (eq? 0 (length glob-list))
			'("/no/such/file")
			glob-list)))
  (apply max
   (map
    common:lazy-modification-time 
    file-list))))

;; return a nice clean pathname made absolute
(define (common:nice-path dir)
  (let ((match (string-match "^(~[^\\/]*)(\\/.*|)$" dir)))
    (if match ;; using ~ for home?
	(common:nice-path (conc (common:read-link-f (cadr match)) "/" (caddr match)))
	(normalize-pathname (if (absolute-pathname? dir)
				dir
				(conc (current-directory) "/" dir))))))

;; make "nice-path" available in config files and the repl
(define nice-path common:nice-path)

(define (common:read-link-f path)
  (handle-exceptions
      exn
      (begin
	(debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed.")
	path) ;; just give up
    (with-input-from-pipe
	(conc "/bin/readlink -f " path)
      (lambda ()
	(read-line)))))

(define (get-cpu-load #!key (remote-host #f))







>
>
|
>
|
>





>
>
|





|
|
|

















|







1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
;;======================================================================

;; lazy-safe get file mod time. on any error (file not existing etc.) return 0
;;
(define (common:lazy-modification-time fpath)
  (handle-exceptions
      exn
    (begin
      (debug:print 0 *default-log-port* "Failed to get modifcation time for " fpath ", treating it as zero. exn=" exn)
      0)
    (if (file-exists? fpath)
	(file-modification-time fpath)
	0)))

;; find timestamp of newest file associated with a sqlite db file
(define (common:lazy-sqlite-db-modification-time fpath)
  (let* ((glob-list (handle-exceptions
			exn
		      (begin
			(debug:print 0 *default-log-port* "Failed to glob " fpath "*, exn=" exn)
			`(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn))))
		      (glob (conc fpath "*"))))
         (file-list (if (eq? 0 (length glob-list))
			'("/no/such/file")
			glob-list)))
  (apply max
	 (map
	  common:lazy-modification-time 
	  file-list))))

;; return a nice clean pathname made absolute
(define (common:nice-path dir)
  (let ((match (string-match "^(~[^\\/]*)(\\/.*|)$" dir)))
    (if match ;; using ~ for home?
	(common:nice-path (conc (common:read-link-f (cadr match)) "/" (caddr match)))
	(normalize-pathname (if (absolute-pathname? dir)
				dir
				(conc (current-directory) "/" dir))))))

;; make "nice-path" available in config files and the repl
(define nice-path common:nice-path)

(define (common:read-link-f path)
  (handle-exceptions
      exn
      (begin
	(debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed. exn=" exn)
	path) ;; just give up
    (with-input-from-pipe
	(conc "/bin/readlink -f " path)
      (lambda ()
	(read-line)))))

(define (get-cpu-load #!key (remote-host #f))
1726
1727
1728
1729
1730
1731
1732
1733


1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759


1760
1761
1762
1763










1764
1765
1766
1767
1768


1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779







1780
1781


1782
1783
1784
1785
1786
1787
1788
;;
(define (common:get-cached-info key dtype #!key (age 10))
  (if *toppath*
      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
	(if (and (file-exists? fullpath)
		 (file-read-access? fullpath))
	    (handle-exceptions
	     exn


	     #f
	     (debug:print 2 *default-log-port* "reading file " fullpath)
	     (let ((real-age (- (current-seconds)(file-change-time fullpath)))) 
	       (if (< real-age age)
		   (handle-exceptions
		       exn
		     (begin
		       (debug:print-info 1 *default-log-port* " removing bad file " fullpath)
		       (delete-file* fullpath)
		       #f)
		     (with-input-from-file fullpath read))
		   (begin
		     (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
		     #f))))
	    (begin
	      (debug:print 2 *default-log-port* "not reading file " fullpath)
	      #f)))
      #f))
 
(define (common:write-cached-info key dtype dat)
  (if *toppath*
      (let* ((fulldir  (conc *toppath* "/.sysdata"))
	     (fullpath (conc fulldir "/" key "-" dtype ".log")))
	(if (not (file-exists? fulldir))(create-directory fulldir #t))
	(handle-exceptions
	 exn


	 #f
	 (with-output-to-file fullpath (lambda ()(pp dat)))))
      #f))
  










;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (handle-exceptions
   exn


   '(99 99 99)
   (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
     (or (common:get-cached-info actual-hostname "cpu-load")
	 (let ((result (if remote-host
			   (map (lambda (res)
				  (if (eof-object? res) 9e99 res))
			        (with-input-from-pipe 
				 (conc "ssh " remote-host " cat /proc/loadavg")
				 (lambda ()(list (read)(read)(read)))))
			   (with-input-from-file "/proc/loadavg" 
			     (lambda ()(list (read)(read)(read)))))))







	   (common:write-cached-info actual-hostname "cpu-load" result)
	                     result)))))



;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res (common:get-normalized-cpu-load-raw remote-host))







|
>
>
|






|

















|
>
>
|
|

|
>
>
>
>
>
>
>
>
>
>




|
>
>
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
|
|
>
>







1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
;;
(define (common:get-cached-info key dtype #!key (age 10))
  (if *toppath*
      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
	(if (and (file-exists? fullpath)
		 (file-read-access? fullpath))
	    (handle-exceptions
		exn
	      (begin
		(debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn)
		#f)
	     (debug:print 2 *default-log-port* "reading file " fullpath)
	     (let ((real-age (- (current-seconds)(file-change-time fullpath)))) 
	       (if (< real-age age)
		   (handle-exceptions
		       exn
		     (begin
		       (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
		       (delete-file* fullpath)
		       #f)
		     (with-input-from-file fullpath read))
		   (begin
		     (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
		     #f))))
	    (begin
	      (debug:print 2 *default-log-port* "not reading file " fullpath)
	      #f)))
      #f))
 
(define (common:write-cached-info key dtype dat)
  (if *toppath*
      (let* ((fulldir  (conc *toppath* "/.sysdata"))
	     (fullpath (conc fulldir "/" key "-" dtype ".log")))
	(if (not (file-exists? fulldir))(create-directory fulldir #t))
	(handle-exceptions
	    exn
	  (begin
	    (debug:print 0 *default-log-path* "failed to write file " fullpath ", exn=" exn)
	    #f)
	  (with-output-to-file fullpath (lambda ()(pp dat)))))
      #f))

(define (common:raw-get-remote-host-load remote-host)
  (handle-exceptions
      exn
    (begin
      (debug:print 0 *default-log-port* "failed to ssh to " remote-host " and get loadavg. exn=" exn)
      #f) ;; more specific handling of errors needed
    (with-input-from-pipe 
     (conc "ssh " remote-host " cat /proc/loadavg")
     (lambda ()(list (read)(read)(read))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (handle-exceptions
      exn
    (begin
      (debug:print 0 *default-log-port* "failed to ssh or read loadavg from host " remote-host ", exn=" exn)
      '(-99 -99 -99))
    (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
      (or (common:get-cached-info actual-hostname "cpu-load")
	  (let ((result (if remote-host
			    (map (lambda (res)
				   (if (eof-object? res) 9e99 res))
			         (with-input-from-pipe 
				  (conc "ssh " remote-host " cat /proc/loadavg")
				  (lambda ()(list (read)(read)(read)))))
			    (with-input-from-file "/proc/loadavg" 
			      (lambda ()(list (read)(read)(read)))))))
	    (match
		result
	      ((l1 l2 l3)
	       (if (and (number? l1)
		      (number? l2)
		      (number? l3))
		   (begin
		     (common:write-cached-info actual-hostname "cpu-load" result)
		     result)
		   '(-1 -1 -1))) ;; -1 is bad result
	      (else '(-2 -2 -2))))))))

;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res (common:get-normalized-cpu-load-raw remote-host))
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043




2044





2045



2046

2047
2048
2049
2050






2051
2052
2053





2054
2055
2056



2057
2058

2059
2060





2061
2062
2063
2064
2065
2066
2067
	  (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
	  (if (> rem-tries 0)
	      (common:wait-for-normalized-load maxload msg remote-host (- rem-tries 1))
	      #f)))))

;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (if (number? maxload-in)
			  (max maxload-in 0.5)
			  0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1




	 (loadjmp (- first next))





         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously



    ;; let's let the user know once in a long while that load checking is happening but not constantly report it

    (if (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp))
    (cond






     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))





      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)



	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))

      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))






(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"
	(caar uname-res))))







|











|
>
>
>
>
|
>
>
>
>
>
|
>
>
>
|
>
|

|

>
>
>
>
>
>
|

|
>
>
>
>
>


|
>
>
>

|
>

|
>
>
>
>
>







2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
	  (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
	  (if (> rem-tries 0)
	      (common:wait-for-normalized-load maxload msg remote-host (- rem-tries 1))
	      #f)))))

;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (if (number? maxload-in)
			  (max maxload-in 0.5)
			  0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
					       ;; numcpus (or could be
					       ;; maxload) is zero,
					       ;; crude fallback is to
					       ;; at least use 1
	 (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
			       0
			       next))) ;; we will force a conservative calculation any time next is large.
	 (first-next-avg    (/ (+ first next) 2))
	 ;; add some randomness to the time to break any alignment
	 ;; where netbatch dumps many jobs to machines simultaneously
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
						      (/ (- 1000 count) 10)
						      waitdelay)
						   (- first adjmaxload) ))  )))
    ;; let's let the user know once in a long while that load checking
    ;; is happening but not constantly report it
    (if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
    (cond
     ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
	   (> num-tries 0))
      (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
      (thread-sleep! 10)
      (common:wait-for-cpuload maxload-in numcpus-in waitdelay
			       count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
     ((and (> first adjmaxload)
	   (> count 0))
      (debug:print-info 0 *default-log-port*
			"Delaying " adjwait
			" seconds due to load " first
			" exceeding max of " adjmaxload
			" on server " (or remote-host (get-host-name))
			" (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp (cond
		       ((> numcpus 8)(/ numcpus 4))
		       ((> numcpus 4)(/ numcpus 2))
		       (else 0.5)))
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
			(if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     (else
      (if (> num-tries 0)
	  (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
	      (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))

(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"
	(caar uname-res))))
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
	  (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f)
			     (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt))
				    (pktsdir   (car pktsdirs))) ;; assume it is there
			       (hash-table-set! *pkts-info* 'pkts-dir pktsdir)
			       pktsdir))))
            (handle-exceptions
             exn
             (debug:print-info 0 "failed to write out packet to " pktsdir) ;; don't care if this failed for now but MUST FIX - BUG!!
             (if (not (file-exists? pktsdir))
                 (create-directory pktsdir #t))
             (with-output-to-file
                 (conc pktsdir "/" uuid ".pkt")
               (lambda ()
                 (print pkt)))))))))
	







|







3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
	  (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f)
			     (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt))
				    (pktsdir   (car pktsdirs))) ;; assume it is there
			       (hash-table-set! *pkts-info* 'pkts-dir pktsdir)
			       pktsdir))))
            (handle-exceptions
             exn
             (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!!
             (if (not (file-exists? pktsdir))
                 (create-directory pktsdir #t))
             (with-output-to-file
                 (conc pktsdir "/" uuid ".pkt")
               (lambda ()
                 (print pkt)))))))))
	
3276
3277
3278
3279
3280
3281
3282
3283


3284
3285
3286
3287
3288
3289
3290
3291
3292
(define (common:join-backgrounded-threads)
  ;; may need to trap and ignore exceptions -- dunno how atomic threads are...
  (for-each
   (lambda (thread-name)
     (let* ((thread (hash-table-ref/default *common:thread-punchlist* thread-name #f)))
       (if thread
           (handle-exceptions
           exn


           #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception
           (thread-join! thread))
           )))
   (hash-table-keys *common:thread-punchlist*)))

(define *common:telemetry-log-state* 'startup)
(define *common:telemetry-log-socket* #f)

(define (common:telemetry-log-open)







|
>
>
|
|







3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
(define (common:join-backgrounded-threads)
  ;; may need to trap and ignore exceptions -- dunno how atomic threads are...
  (for-each
   (lambda (thread-name)
     (let* ((thread (hash-table-ref/default *common:thread-punchlist* thread-name #f)))
       (if thread
           (handle-exceptions
               exn
	     (begin
	       (debug:print 0 *default-log-port* "joining threads failed. exn=" exn)
               #t) ;; just ignore it, it might have died in the meantime so joining it will throw an exception
             (thread-join! thread))
           )))
   (hash-table-keys *common:thread-punchlist*)))

(define *common:telemetry-log-state* 'startup)
(define *common:telemetry-log-socket* #f)

(define (common:telemetry-log-open)