Megatest

Changes On Branch 60706141c17744b6
Login

Changes In Branch v1.81-fixes Excluding Merge-Ins

This is equivalent to a diff from 98f3441b4f to 60706141c1

2024-08-19
11:42
CI/CD: Automated commit after successful test, build, and deploy for v1.81-fix-extract-scripts check-in: 29155bc147 user: fdiskadm tags: v1.81
2024-08-16
13:34
Patched forward the adjutant code that got lost in v1.65 Leaf check-in: 13060ce126 user: matt tags: v1.81-adjutant
2024-08-13
12:55
removed extra copy of launch:extract-scripts-logpro and corrected it to add .logpro to the logpro filenames check-in: e829926867 user: mmgraham tags: v1.81-fix-extract-scripts
2024-07-18
05:41
Minor cleanup Leaf check-in: 60706141c1 user: mrwellan tags: v1.81-fixes
2024-07-17
19:21
Changed Megatest version to v1.8181 check-in: a748f29739 user: icfadm tags: v1.81
19:17
Lower gating on test launch to 0.05 journal load. Add exception handler for file-modification-time on .servinfo files check-in: 7c315bd32d user: mrwellan tags: v1.81-fixes
17:13
Move sync transaction in an attempt to free up bound time in .mtdb/*.db files Leaf check-in: fab9bf9c5c user: mrwellan tags: v1.81-better sync
2024-07-15
15:47
Changed Megatest version to 1.8102 check-in: 98f3441b4f user: icfadm tags: v1.81
15:12
CI/CD: Automated commit after successful test, build, and deploy for v1.81-bump-server-load check-in: 1fff14fbea user: fdiskadm tags: v1.81

Modified db.scm from [346b188c56] to [9fe915e0e1].

2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477

2478
2479
2480
2481
2482
2483
2484
2449
2450
2451
2452
2453
2454
2455














2456
2457
2458
2459
2460



2461
2462
2463
2464
2465
2466
2467
2468







-
-
-
-
-
-
-
-
-
-
-
-
-
-





-
-
-
+







     dbstruct
     run-id
     #f
     (lambda (dbdat db)
       (let* ((stmth (db:get-cache-stmth dbdat db qry)))
	 (sqlite3:first-result stmth))))))

;; NEW BEHAVIOR: count the active tests of one run only.
;;
;; Queries the tests table for rows belonging to run-id whose state is
;; RUNNING, REMOTEHOSTSTART or LAUNCHED and hands the count(id) result of
;; that query back through db:with-db.
;;
;; WARNING BUG EDIT ME - merged from v1.55 - not sure what is right here.
;; The earlier, cross-run form of the query is kept for reference:
;;   "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id NOT IN (SELECT id FROM runs WHERE state='deleted') AND NOT (uname = 'n/a' AND item_path = '');"
;;
(define (db:get-count-tests-actually-running dbstruct run-id)
  (let ((count-qry "SELECT count(id) FROM tests WHERE state in ('RUNNING','REMOTEHOSTSTART','LAUNCHED') AND run_id=?;"))
    (db:with-db
     dbstruct
     run-id
     #f
     (lambda (dbdat db)
       ;; the query is executed directly on db (no cached statement handle),
       ;; with run-id bound to the single `?` placeholder
       (sqlite3:first-result db count-qry run-id)))))

;; NEW BEHAVIOR: Look only at single run with run-id
;;
;; Runs a count(id) query over the tests table for the given run-id,
;; restricted to the states RUNNING, LAUNCHED and REMOTEHOSTSTART. The query
;; is executed through a statement handle obtained from db:get-cache-stmth,
;; with run-id bound to the `?` placeholder.
;;
;; NOTE(review): this listing is a diff rendering. The three-line `qry`
;; binding (carrying the commented-out fastmode variant) and the one-line
;; `qry` binding that follows it appear to be the old and the new form of the
;; same `let*` line -- confirm against db.scm before reusing this text as code.
;;
;; (define (db:get-running-stats dbstruct run-id)
(define (db:get-count-tests-running-for-run-id dbstruct run-id) ;; fastmode)
  (let* ((qry ;; (if fastmode
		 ;;  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;"
		  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) ;; )
  (let* ((qry "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;"))
    (db:with-db
     dbstruct
     run-id
     #f
     (lambda (dbdat db)
       ;; fetch the statement handle for qry, then evaluate it for run-id
       (let* ((stmth (db:get-cache-stmth dbdat db qry)))
	 (sqlite3:first-result stmth run-id))))))

Modified runs.scm from [2d4118afd2] to [d02f63b65d].

1150
1151
1152
1153
1154
1155
1156
1157

1158
1159
1160
1161
1162
1163
1164
1150
1151
1152
1153
1154
1155
1156

1157
1158
1159
1160
1161
1162
1163
1164







-
+







	 (run-limits-info        (runs:dat-can-run-more-tests runsdat))
	 ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
	 (have-resources         (and (if *journal-stats*
					  (let* ((dbfname (conc
							   (dbfile:run-id->dbnum run-id)
							   ".db"))
						 (load (tt:get-journal-stats dbfname)))
					    (if (> load 0.1) ;; dbs too busy to start more tests
					    (if (> load 0.05) ;; dbs too busy to start more tests
						(begin
						  (debug:print-info 0 *default-log-port* "Gating launch due to db load "load" based on journal file observations for "dbfname)
						 #f)
						#t))
					  (begin
					    (debug:print-info 0 *default-log-port* "Journal gating not started for "run-id)
					    #t)) ;; if journal monitoring not started do not gate

Modified tcp-transportmod.scm from [b9c6fed28d] to [cc2742f5f6].

638
639
640
641
642
643
644



645

646
647
648
649
650
651
652
653
654
655


656
657
658
659
660
661
662
638
639
640
641
642
643
644
645
646
647

648
649
650
651
652
653
654
655
656


657
658
659
660
661
662
663
664
665







+
+
+
-
+








-
-
+
+







				     (same-host (or (not prime-host) ;; i.e. this is the first host
						    (equal? prime-host host)))
				     (keep-srv  (and good-ping same-host)))
				(if keep-srv	
				    (loop (cdr servrs)
					  host
					  (cons servdat result))
				    (let* ((modtime (handle-exceptions
						     exn
						     9999 ;; file probably disappeared
				    (let* ((modtime (file-modification-time servinfofile)))
						     (file-modification-time servinfofile))))
				      ;; if the .servinfo hasn't been touched in six min (360 s)
				      ;; we can be pretty sure the server is truly dead
				      (if (> (- (current-seconds) modtime) 360)
					  (handle-exceptions
					   exn
					   (debug:print-info 0 *default-log-port*
							     "Error removing server info file: "servinfofile", "
							     (condition->list exn))
					   (delete-file* servinfofile))
					  (loop (cdr servrs) prime-host result))))))
					   (delete-file* servinfofile)))
				      (loop (cdr servrs) prime-host result)))))
			     (else
			      ;; can't delete it as we don't have a filename. NOTE: Should never get here.
			      (debug:print-info 0 *default-log-port* "ERROR: bad servinfo record \""servdat"\"")
			      (loop (cdr servrs) prime-host result)) ;; drop 
			     )))))
	       (home-host (if (null? good-srvrs)
			      #f