Megatest

Check-in [4dcb84418f]
Login
Overview
Comment:Some more tweaks and output reduction. Still get crashes due to db lock but system seems to keep going pretty well. This is with 300 tests running on one machine.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.70-nohomehost
Files: files | file ages | folders
SHA1: 4dcb84418f9518463e8fce6e7143d904529bc4ca
User & Date: matt on 2022-11-22 08:59:16
Other Links: branch diff | manifest | tags
Context
2022-11-22
09:06
Turn the handler for opening server info files back on since those files can disappear without warning. Closed-Leaf check-in: abb42df5ef user: matt tags: v1.70-nohomehost
08:59
Some more tweaks and output reduction. Still get crashes due to db lock but system seems to keep going pretty well. This is with 300 tests running on one machine. check-in: 4dcb84418f user: matt tags: v1.70-nohomehost
07:49
Keep more servers around but let them also expire quickly if not used check-in: c0ef1c5bfa user: matt tags: v1.70-nohomehost
Changes

Modified client.scm from [6f5dc4a64f] to [2a6738b25e].

42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

;; Not currently used! But, I think it *should* be used!!!
#;(define (client:logout serverdat)
  ;; NOTE: the leading `#;` datum comment disables this entire definition,
  ;; so client:logout is not actually defined while it remains.
  ;;
  ;; As written: returns #f when serverdat is not a socket; otherwise returns
  ;; whatever cdb:logout returns when called with serverdat, *toppath* and
  ;; the client signature.
  (let ((ok (and (socket? serverdat)
		 (cdb:logout serverdat *toppath* (client:get-signature)))))
    ok))

#;(define (client:connect iface port)
  ;; NOTE: the leading `#;` datum comment disables this entire definition.
  ;;
  ;; As written: connects by calling http-transport:client-connect with
  ;; iface and port and returns its result.
  (http-transport:client-connect iface port)
  ;; The nested `#;` below disables an older dispatch on (server:get-transport)
  ;; that chose between the rpc, http and zmq connect routines, using rpc as
  ;; the fallback.
  #;(case (server:get-transport)
    ((rpc)  (rpc:client-connect  iface port))
    ((http) (http:client-connect iface port))
    ((zmq)  (zmq:client-connect  iface port))
    (else   (rpc:client-connect  iface port))))

(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0))
  ;; Thin wrapper: forwards areapath and both keyword arguments unchanged to
  ;; client:setup-http and returns its result.
  ;;   remaining-tries - defaults to 100
  ;;   failed-connects - defaults to 0
  (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)
  ;; The `#;` datum comment below disables an older dispatch on
  ;; (server:get-transport); only the http path above is live.
  #;(case (server:get-transport)
    ((rpc) (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id))
    ((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects))
    (else  (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id))))

;; Do all the connection work, look up the transport type and set up the
;; connection if required.
;;
;; There are two scenarios. 
;;   1. We are a test manager and we received *transport-type* and *runremote* via cmdline
;;   2. We are a run tests, list runs or other interactive process and we must figure out







<
<
<
<
<
<
<
<

|
<
<
<
<







42
43
44
45
46
47
48








49
50




51
52
53
54
55
56
57

;; Not currently used! But, I think it *should* be used!!!
#;(define (client:logout serverdat)
  ;; NOTE: the leading `#;` datum comment disables this entire definition,
  ;; so client:logout is not actually defined while it remains.
  ;;
  ;; As written: returns #f when serverdat is not a socket; otherwise returns
  ;; whatever cdb:logout returns when called with serverdat, *toppath* and
  ;; the client signature.
  (let ((ok (and (socket? serverdat)
		 (cdb:logout serverdat *toppath* (client:get-signature)))))
    ok))









(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0))
  ;; Thin wrapper around client:setup-http: areapath and both keyword
  ;; arguments are passed through unchanged and its result is returned.
  ;;   remaining-tries - defaults to 100
  ;;   failed-connects - defaults to 0
  (client:setup-http
   areapath
   remaining-tries: remaining-tries
   failed-connects: failed-connects))





;; Do all the connection work, look up the transport type and set up the
;; connection if required.
;;
;; There are two scenarios. 
;;   1. We are a test manager and we received *transport-type* and *runremote* via cmdline
;;   2. We are a run tests, list runs or other interactive process and we must figure out

Modified dbfile.scm from [e95b97d328] to [d24edd08a7].

1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
    (if (file-exists? jfile)
	(begin
	  (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load")
	  (thread-sleep! 0.2)))
    (if (and use-mutex
	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
	(dbfile:print-err *api-process-request-count* " parallel api requests being processed in process "
			  (current-process-id) ", throttling access"))
    (condition-case
	(begin
	  (if use-mutex (mutex-lock! *db-with-db-mutex*))
	  (let ((res (apply proc dbdat db params))) ;; the actual call is here.
	    (if use-mutex (mutex-unlock! *db-with-db-mutex*))
	    ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
	    (if dbdat







|







1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
    (if (file-exists? jfile)
	(begin
	  (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load")
	  (thread-sleep! 0.2)))
    (if (and use-mutex
	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
	(dbfile:print-err *api-process-request-count* " parallel api requests being processed in process "
			  (current-process-id))) ;;  ", throttling access"))
    (condition-case
	(begin
	  (if use-mutex (mutex-lock! *db-with-db-mutex*))
	  (let ((res (apply proc dbdat db params))) ;; the actual call is here.
	    (if use-mutex (mutex-unlock! *db-with-db-mutex*))
	    ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
	    (if dbdat

Modified http-transport.scm from [c23198333f] to [84bebe0a7c].

498
499
500
501
502
503
504
505
506

507
508
509
510
511
512
513
514
      (if (not server-going) ;; *dbstruct-dbs* 
	  (begin
	    (debug:print 0 *default-log-port* "SERVER: dbprep")
	    (set! *dbstruct-dbs*  (db:setup #t)) ;;  run-id)) FIXME!!!
	    (set! server-going #t)
	    (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine.
	  (if (and no-sync-db
		   (common:low-noise-print 5 "sync-all")) ;; cheesy way to reduce frequency of running sync :)
              (begin

                (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))
		(db:all-db-sync *dbstruct-dbs*))))
      
      ;; when things go wrong we don't want to be doing the various queries too often
      ;; so we strive to run this stuff only every four seconds or so.
      (let* ((sync-time (- (current-milliseconds) start-time))
	    (rem-time  (quotient (- 4000 sync-time) 1000)))
	(if (and (<= rem-time 4)







|

>
|







498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
      (if (not server-going) ;; *dbstruct-dbs* 
	  (begin
	    (debug:print 0 *default-log-port* "SERVER: dbprep")
	    (set! *dbstruct-dbs*  (db:setup #t)) ;;  run-id)) FIXME!!!
	    (set! server-going #t)
	    (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine.
	  (if (and no-sync-db
		   (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :)
              (begin
		(if (common:low-noise-print 120 "sync-all-print")
                    (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S")))
		(db:all-db-sync *dbstruct-dbs*))))
      
      ;; when things go wrong we don't want to be doing the various queries too often
      ;; so we strive to run this stuff only every four seconds or so.
      (let* ((sync-time (- (current-milliseconds) start-time))
	    (rem-time  (quotient (- 4000 sync-time) 1000)))
	(if (and (<= rem-time 4)
556
557
558
559
560
561
562
563
564
565

566
567
568
569
570
571




572
573
574
575
576
577
578
	       (> (+ last-access server-timeout)
		  (current-seconds)))
          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions
		    exn
		    (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk? exn=" exn)
		    (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter
			     (not *server-overloaded*))

			(change-file-times servinfofile curr-time curr-time)))
		(if (or (common:low-noise-print 120 "start new server")
			(> *api-process-request-count* 50)) ;; if this server is kind of busy start up another
		    (begin
		      (debug:print-info 0 *default-log-port* "Server is busy, start another if possible...")
		      (server:kind-run *toppath*)))))




          (loop 0 server-state bad-sync-count (current-milliseconds)))
         (else
          (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
          (http-transport:server-shutdown port)))))))

(define (http-transport:server-shutdown port)
  (begin







|

|
>




|
|
>
>
>
>







557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
	       (> (+ last-access server-timeout)
		  (current-seconds)))
          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions
		    exn
		    (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn)
		    (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter
			     (not *server-overloaded*)
			     (file-exists? servinfofile))
			(change-file-times servinfofile curr-time curr-time)))
		(if (or (common:low-noise-print 120 "start new server")
			(> *api-process-request-count* 50)) ;; if this server is kind of busy start up another
		    (begin
		      (debug:print-info 0 *default-log-port* "Server is busy, parallel-api-count "*api-process-request-count*", start another if possible...")
		      (server:kind-run *toppath*)
		      (if (> *api-process-request-count* 100)
			  (begin
			    (debug:print-info 0 *default-log-port* "Server is overloaded at parallel-api-count="*api-process-request-count*", removing "servinfofile) 
			    (delete-file* servinfofile)))))))
          (loop 0 server-state bad-sync-count (current-milliseconds)))
         (else
          (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
          (http-transport:server-shutdown port)))))))

(define (http-transport:server-shutdown port)
  (begin

Modified server.scm from [69e932d5ac] to [90ee47bed7].

666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
;; Default is 1200 seconds (20 minutes).
;;
(define (server:expiration-timeout)
  ;; Returns the server expiration timeout (presumably in seconds - confirm
  ;; against callers).
  ;;
  ;; Looks up "server" / "timeout" in *configdat*. When the value is a string
  ;; that parses as a number it is multiplied by 3600, i.e. the setting is
  ;; treated as a count of hours. In every other case (no setting, a
  ;; non-string value, or text that is not a plain number) the fallback of
  ;; 1200 is returned.
  (let* ((tmo   (configf:lookup *configdat* "server" "timeout"))
	 (hours (and (string? tmo)
		     ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below
		     (common:hms-string->seconds tmo)
		     ;; string->number yields #f for text such as "1h"; guarding
		     ;; on it here avoids calling (* 3600 #f), which would raise
		     ;; an error instead of falling back to the default.
		     (string->number tmo))))
    (if hours
	(* 3600 hours)
	1200)))

(define (server:get-best-guess-address hostname)
  (let ((res #f))
    (for-each 
     (lambda (adr)
       (if (not (eq? (u8vector-ref adr 0) 127))
	   (set! res adr)))







|







666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
;; Default is 60 seconds.
;;
(define (server:expiration-timeout)
  ;; Returns the server expiration timeout in seconds.
  ;;
  ;; Looks up "server" / "timeout" in *configdat*. When the value is a string
  ;; that parses as a number it is multiplied by 3600, i.e. the setting is
  ;; treated as a count of hours. In every other case (no setting, a
  ;; non-string value, or text that is not a plain number) the fallback of
  ;; 60 is returned.
  (let* ((tmo   (configf:lookup *configdat* "server" "timeout"))
	 (hours (and (string? tmo)
		     ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below
		     (common:hms-string->seconds tmo)
		     ;; string->number yields #f for text such as "1h"; guarding
		     ;; on it here avoids calling (* 3600 #f), which would raise
		     ;; an error instead of falling back to the default.
		     (string->number tmo))))
    (if hours
	(* 3600 hours)
	60)))

(define (server:get-best-guess-address hostname)
  (let ((res #f))
    (for-each 
     (lambda (adr)
       (if (not (eq? (u8vector-ref adr 0) 127))
	   (set! res adr)))