Megatest

Check-in [95d7411602]
Login
Overview
Comment:Speculative fix for bad commit e3040.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | v1.65-test-rundat-try1
Files: files | file ages | folders
SHA1: 95d7411602fc3911d39ee53a1f5254f16135deff
User & Date: matt on 2020-10-03 12:57:50
Other Links: branch diff | manifest | tags
Context
2020-10-03
12:57
Speculative fix for bad commit e3040. Leaf check-in: 95d7411602 user: matt tags: v1.65-test-rundat-try1
2020-09-19
04:21
Start moving test_rundat to no-sync db. ==/20/2/WARN/1203/mars/== check-in: abfabdb839 user: matt tags: v1.65-test-rundat
Changes

Modified TODO from [0ecebc45ef] to [0885dee1e5].

15
16
17
18
19
20
21
22


23
24
25

26
27
28
29
30
31
32
15
16
17
18
19
20
21

22
23
24
25

26
27
28
29
30
31
32
33







-
+
+


-
+







#     You should have received a copy of the GNU General Public License
#     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

TODO
====

WW38
. Add test_rundat to no-sync
. Add test_rundat to no-sync ==> correction, put in <testdir>/.meta/test-run.dat
. Add STATE/STATUS transitions to .meta/test-run.dat or similar
. Swizzle update-test-rundat to operate on no-sync
. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync
. On test completion copy some of the data from no-sync to test_rundat
. On state/status change update tests table with duration

WW15
. fill newview matrix with data, filter pipeline gui elements
. improve [script], especially indent handling

WW16
. split db into megatest.db (runs etc.) db/<something>.db

Modified db.scm from [fb3a18f52f] to [d7c987aadc].

3465
3466
3467
3468
3469
3470
3471
3472

3473
3474
3475
3476
3477
3478
3479
3465
3466
3467
3468
3469
3470
3471

3472
3473
3474
3475
3476
3477
3478
3479







-
+







  (let* ((run-ids (db:get-all-run-ids mtdb)))
    (for-each 
     (lambda (run-id)
       (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
	 (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
     run-ids)))

;; Get test data using test_id, run-id is not used
;; Get test data using test_id, run-id is not used - but it will be!
;; 
(define (db:get-test-info-by-id dbstruct run-id test-id)
  (db:with-db
   dbstruct
   #f ;; run-id
   #f
   (lambda (db)

Modified launch.scm from [d0067277fa] to [98c9d63eeb].

203
204
205
206
207
208
209
210

211
212
213
214
215
216



217
218
219
220
221
222
223
203
204
205
206
207
208
209

210
211
212
213
214
215

216
217
218
219
220
221
222
223
224
225







-
+





-
+
+
+







			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
      ;; top of loop encountered at "(current-seconds)" with
      ;; last-sync="last-sync))
      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
231
232
233
234
235
236
237
238

239
240
241
242
243
244
245
246
247

248
249
250
251
252
253
254

255
256
257
258

259
260
261
262
263
264
265
266
267
233
234
235
236
237
238
239

240
241
242
243
244
245
246
247
248

249
250
251
252
253
254
255

256




257


258
259
260
261
262
263
264







-
+








-
+






-
+
-
-
-
-
+
-
-







             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-info-by-id run-id test-id))
             (state       (db:test-get-state test-info))
             (status      (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
        (if do-sync ;; save meta data about the running of this test
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
	    (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
	      ;;       between tries?
310
311
312
313
314
315
316
317

318
319
320
321
322
323
324
307
308
309
310
311
312
313

314
315
316
317
318
319
320
321







-
+







	    (begin
	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
                        (or new-disk-free disk-free)
                        (if do-sync (current-seconds) last-sync)))))))
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional


(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
    ;;(bb-check-path msg: "launch:execute incoming")
463
464
465
466
467
468
469



470
471
472
473
474
475
476
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476







+
+
+







				(db:test-get-host        test-info)
				(begin
				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
				  (exit))))
		 (test-pid  (db:test-get-process_id  test-info)))
	    (cond
             ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
             ;; ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	     ;;	  (and (equal? (db:test-get-state test-info) "COMPLETED")                           ;; completed/abort => rerun if asked
             ;;       (member (db:test-get-status test-info) '("ABORT"))))
	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")

              (rmt:general-call 'set-test-start-time #f test-id)
              (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      ) ;; prime it for running

Modified rmt.scm from [39d97c528a] to [29d7593e43].

20
21
22
23
24
25
26


27
28
29
30
31
32
33
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35







+
+








(use format typed-records) ;; RADT => purpose of json format??

(declare (unit rmt))
(declare (uses api))
(declare (uses http-transport))
(include "common_records.scm")
(include "db_records.scm")

;; (declare (uses rmtmod))

;; (import rmtmod)

;;
;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
;;
523
524
525
526
527
528
529
530

531
532
533
534










535
536
537
538
539
540
541
525
526
527
528
529
530
531

532
533
534
535

536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552







-
+



-
+
+
+
+
+
+
+
+
+
+







;; Just some syntatic sugar
(define (rmt:register-test run-id test-name item-path)
  (rmt:general-call 'register-test run-id run-id test-name item-path))

(define (rmt:get-test-id run-id testname item-path)
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))

;; run-id is NOT used
;; run-id is NOT used - but it will be! 
;;
(define (rmt:get-test-info-by-id run-id test-id)
  (if (number? test-id)
      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
      (let* ((testdat  (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)))
	     (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat")))
	;; now we can update a couple fields from the filesystem
	(if (and (db:test-get-rundir testdat)
		 (file-exists? trundatf))
	    (let* ((duration   (db:test-get-run_duration testdat))
		   (event-time (db:test-get-event_time   testdat))
		   (last-touch (file-modification-time trundatf)))
	      (db:test-set-run_duration! testdat (max duration (- last-touch event-time)))))
	testdat)
      (begin
	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain (current-error-port))
	#f)))

(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))

Modified tests.scm from [0094b671e6] to [86ca5b3688].

1942
1943
1944
1945
1946
1947
1948

1949
1950
1951
1952
1953
1954
1955
1956























1957
1958
1959

1960
1961
1962
1963
1964
1965
1966

1967
1968
1969
1970
1971
1972
1973
1942
1943
1944
1945
1946
1947
1948
1949








1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974

1975
1976
1977
1978
1979
1980
1981

1982
1983
1984
1985
1986
1987
1988
1989







+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


-
+






-
+







	 (lambda (count)
	   (set! res count))
	 tdb
	 "SELECT count(id) FROM test_rundat;")
	res))
  0)

;; 
(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)
  (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))
  (if (and cpuload diskfree)
      (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
  (if minutes 
      (rmt:general-call 'update-run-duration run-id minutes test-id))
  (if (and uname hostname)
      (rmt:general-call 'update-uname-host run-id uname hostname test-id)))
(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f))
  (if (get-environment-variable "MT_TEST_RUN_DIR")
      (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data"))
	     (or-dash  (lambda (instr)(if instr instr "-"))))
	(if (not (directory-exists? dest-dir))(create-directory dest-dir #t))
	(let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append)))
	  (with-output-to-port outp
	    (lambda ()
	      (print (current-seconds) " " (or-dash run-id)  " " (or-dash test-id)  " "
		     (or-dash cpuload) " " (or-dash diskfree) " "
		     (or-dash minutes) " " (or-dash hostname) " "
		     (or-dash uname)))) ;; put uname last as it has spaces in it
	  (close-output-port outp)))
      (begin
	(rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))))
  (if update-db
      (begin
	(if (and cpuload diskfree)
	    (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
	(if minutes 
	    (rmt:general-call 'update-run-duration run-id minutes test-id))
	(if (and uname hostname)
	    (rmt:general-call 'update-uname-host run-id uname hostname test-id)))))
  
;; This one is for running with no db access (i.e. via rmt: internally)
(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f))
;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
;;  (let ((remtries 10))
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))
	 (uname    (get-uname "-srvpio"))
	 (hostname (get-host-name)))
    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db)))
    
;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))
	 (remtries 10))
    (handle-exceptions

Modified ulex/ulex.scm from [42b648b50c] to [5cd79bd4a1].

334
335
336
337
338
339
340
341


342
343
344
345
346
347
348
334
335
336
337
338
339
340

341
342
343
344
345
346
347
348
349







-
+
+







		(sqlite3:execute db "UPDATE dbs SET host_port=?,lease_thru=? WHERE dbname=?"
				 (conc host ":" port)
				 (+ (current-seconds) lease)
				 dbfname)
		#t)
	      #f))
	 (#f  (sqlite3:execute db "INSERT INTO dbs (dbname,dbfile,dbtype,host_port,lease_thru) VALUES (?,?,?,?,?)"
			       "captain" dbfname "captain" host-port (+ (current-seconds) lease)))
			       "captain" dbfname "captain" host-port (+ (current-seconds) lease))
	      #t)
	 (else (print "ERROR: Unrecognised result from fold-row")
	       (exit 1)))))))
							    
;;======================================================================
;; network utilities
;;======================================================================