Index: Makefile
==================================================================
--- Makefile
+++ Makefile
@@ -24,10 +24,11 @@
fossil timeline -n 350 -t ci -F "%h,%a,%b,%t,\"%c\"" > recent-commits.csv
SHELL=/bin/bash
PREFIX=$(PWD)
+# CSCOPTS=-lfa2 -specialize -inline-global
CSCOPTS=
INSTALL=install
SRCFILES = common.scm items.scm launch.scm ods.scm runconfig.scm \
server.scm configf.scm db.scm keys.scm \
process.scm runs.scm tasks.scm tests.scm genexample.scm \
@@ -35,12 +36,11 @@
ezsteps.scm rmt.scm api.scm \
subrun.scm archive.scm env.scm \
diff-report.scm cgisetup/models/pgdb.scm
# module source files
-MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm \
- tcp-transportmod.scm rmtmod.scm portlogger.scm
+MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm adjutant.scm mutils.scm mttop.scm tcp-transportmod.scm rmtmod.scm portlogger.scm
transport-mode.scm : transport-mode.scm.template
cp transport-mode.scm.template transport-mode.scm
dashboard-transport-mode.scm : dashboard-transport-mode.scm.template
@@ -339,10 +339,13 @@
$(PREFIX)/bin/mt-new-to-old.sh : utils/mt-new-to-old.sh
$(INSTALL) $< $@
chmod a+x $@
+$(PREFIX)/bin/convert-db.sh : utils/convert-db.sh
+ $(INSTALL) $< $@
+ chmod a+x $@
deploytarg/nbfake : utils/nbfake
$(INSTALL) $< $@
chmod a+x $@
@@ -385,10 +388,11 @@
install : $(PREFIX)/bin/.$(ARCHSTR) $(PREFIX)/bin/.$(ARCHSTR)/mtest $(PREFIX)/bin/megatest \
$(PREFIX)/bin/.$(ARCHSTR)/dboard $(PREFIX)/bin/dashboard $(HELPERS) $(PREFIX)/bin/nbfake \
$(PREFIX)/bin/nbfind $(PREFIX)/bin/mtrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \
$(PREFIX)/bin/mt-old-to-new.sh $(PREFIX)/bin/mt-new-to-old.sh \
+ $(PREFIX)/bin/convert-db.sh $(PREFIX)/bin/convert-db.sh \
$(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun \
$(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/mtutil \
$(PREFIX)/share/db/mt-pg.sql \
$(PREFIX)/share/js/jquery-3.1.0.slim.min.js \
$(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so \
Index: adjutant.scm
==================================================================
--- adjutant.scm
+++ adjutant.scm
@@ -22,12 +22,23 @@
(module adjutant *
(import scheme chicken data-structures extras files)
(import (prefix sqlite3 sqlite3:) posix typed-records srfi-18 srfi-69
- md5 message-digest
+ md5 message-digest matchable
regex srfi-1)
-(define (adjutant-run)
- (print "Running the adjutant!"))
+(define (adjutant-run host-type rmt:no-sync-take-job)
+ (print "Running the adjutant!")
+ (let loop ((wait-count 0))
+ (if (< wait-count 10) ;; 6 x 10 seconds = one minute
+ (let* ((dat (rmt:no-sync-take-job host-type)))
+ (match dat
+ ((id ht vars exekey cmdline state event-time last-update)
+ (system cmdline)
+ (loop 0))
+ (else
+ (thread-sleep! 10)
+ (loop (+ wait-count 1)))))
+ (print "I'm bored. Exiting."))))
)
Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -155,12 +155,18 @@
(define *db-write-mutexes* (make-hash-table))
(define *server-signature* #f)
(define *api-threads* '())
-(define (api:register-thread th-in)
- (set! *api-threads* (cons (cons th-in (current-seconds)) *api-threads*)))
+(define (api:register-thread th-in command)
+ (set! *api-threads* (cons (list th-in (current-seconds) command) *api-threads*)))
+
+(define (api:get-thread-command th-in)
+ (let ((thread-data (assoc th-in *api-threads*)))
+ (if thread-data
+ (third thread-data) ; Assuming the command is the third element in the list
+ #f))) ; Return #f if the thread is not found
(define (api:unregister-thread th-in)
(set! *api-threads* (filter (lambda (thdat)
(not (eq? th-in (car thdat))))
*api-threads*)))
@@ -170,10 +176,19 @@
(not (member (thread-state (car thdat)) '(terminated dead))))
*api-threads*)))
(define (api:get-count-threads-alive)
(length *api-threads*))
+
+(define (api:get-threads)
+ (map (lambda (thdat)
+ (let ((thread (first thdat))
+ (timestamp (second thdat))
+ (command (third thdat)))
+ (format "\nThread: ~a, age: ~a, Command: ~a" thread (- (current-seconds) timestamp) command)))
+ *api-threads*))
+
(define *api:last-stats-print* 0)
(define *api-print-db-stats-mutex* (make-mutex))
(define (api:print-db-stats)
(debug:print-info 0 *default-log-port* "Started periodic db stats printer")
@@ -196,11 +211,11 @@
(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params)
(assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
(if (not *server-signature*)
(set! *server-signature* (tt:mk-signature *toppath*)))
(lambda (indat)
- (api:register-thread (current-thread))
+ (api:register-thread (current-thread) (car indat))
(let* ((result
(let* ((numthreads (api:get-count-threads-alive))
(delay-wait (if (> numthreads 10)
(- numthreads 10)
0))
@@ -228,17 +243,24 @@
(assert ok "FATAL: database file and run-id not aligned.")))))
(ttdat *server-info*)
(server-state (tt-state ttdat))
(maxthreads 20) ;; make this a parameter?
(status (cond
- ((and (> numthreads maxthreads)
- (> (random 100) 70)) ;; allow a 30% probability to go through so we can figure out what is going wrong in main.db server.
- 'busy)
- ;; ((> numthreads 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down.
+ ((> numthreads maxthreads)
+ (let* ((testsuite (common:get-testsuite-name))
+ (mtexe (common:find-local-megatest))
+ (proc (lambda ()
+ ;; we are overloaded, try to start another server
+ (debug:print 0 *default-log-port* "Too many threads running, starting another server")
+ (tt:server-process-run *toppath* testsuite mtexe run-id))))
+ (set! *server-start-requests* (cons proc *server-start-requests*)))
+ ;; 'busy
+ 'loaded ;; not ideal since the client will not backoff
+ )
(else 'ok)))
(errmsg (case status
- ((busy) (conc "Server overloaded, "numthreads" threads in flight"))
+ ((busy) (conc "Server overloaded, "numthreads" threads in flight, current cmd: " cmd "\n current threads: " (api:get-threads)))
((loaded) (conc "Server loaded, "numthreads" threads in flight"))
(else #f)))
(result (case status
((busy)
(if (eq? cmd 'ping)
@@ -251,11 +273,11 @@
;; (thread-sleep! 0.5))
(normal-proc cmd run-id params))
(else
(normal-proc cmd run-id params))))
(meta (case cmd
- ((ping) `((sstate . ,server-state)))
+ ((ping) `((sstate . ,server-state)(sload . ,numthreads)))
(else `((wait . ,delay-wait)))))
(payload (list status errmsg result meta)))
;; (cmd run-id params meta)
(db:add-stats cmd run-id params (- (current-milliseconds) start-t))
payload))
@@ -264,12 +286,10 @@
;; (set! *api-process-request-count* (- *api-process-request-count* 1))
;; (serialize payload)
(api:unregister-thread (current-thread))
result)))
-
-
(define *api-halt-writes* #f)
(define (api:dispatch-request dbstruct cmd run-id params)
(if (not *no-sync-db*)
@@ -370,10 +390,13 @@
;; NO SYNC DB
((no-sync-set) (apply db:no-sync-set *no-sync-db* params))
((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params))
((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params))
((no-sync-get-lock) (apply db:no-sync-get-lock *no-sync-db* params))
+ ((no-sync-add-job) (apply db:no-sync-add-job *no-sync-db* params))
+ ((no-sync-take-job) (apply db:no-sync-take-job *no-sync-db* params))
+ ((no-sync-job-records-clean) (apply db:no-sync-job-records-clean *no-sync-db* params))
;; NO SYNC DB PROCESSES
((register-process) (apply dbfile:register-process *no-sync-db* params))
((set-process-done) (apply dbfile:set-process-done *no-sync-db* params))
((set-process-status) (apply dbfile:set-process-status *no-sync-db* params))
@@ -382,11 +405,11 @@
;; ARCHIVES
;; ((archive-get-allocations)
((archive-register-disk) (apply db:archive-register-disk dbstruct params))
((archive-register-block-name)(apply db:archive-register-block-name dbstruct params))
;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey))
-
+
;;======================================================================
;; READ ONLY QUERIES
;;======================================================================
;; KEYS
Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -38,13 +38,15 @@
(import commonmod
debugprint
rmtmod
(prefix mtargs args:))
-
+
+(define (remove-server-files directory-path)
+ (let ((files (glob (string-append directory-path "/server*"))))
+ (for-each delete-file* files)))
(include "common_records.scm")
-
(define (remove-files filespec)
(let ((files (glob filespec)))
(for-each delete-file* files)))
(define (stop-the-train)
@@ -53,10 +55,11 @@
(if (and *toppath*
(file-exists? (conc *toppath*"/stop-the-train")))
(let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately")))
;; yes, print to current-output-port AND *default-log-port*, annoying but necessary I think
(print msg)
+ (remove-server-files (conc *toppath* "/logs"))
(debug:print 0 *default-log-port* msg)
(remove-files (conc *toppath* "/logs/server*"))
(remove-files (conc *toppath* "/.servinfo/*"))
(remove-files (conc *toppath* "/.mtdb/*lock"))
(exit 1)))
@@ -397,21 +400,19 @@
(define (common:version-changed?)
(not (equal? (common:get-last-run-version)
(common:version-signature))))
-;; From 1.70 to 1.80, db's are compatible.
-
+;; From 1.70 to 1.81, db's are compatible.
+;;
+;; BUG: This logic is almost certainly not quite correct.
+;;
(define (common:api-changed?)
- (let* (
- (megatest-major-version (substring (->string megatest-version) 0 4))
- (run-major-version (substring (conc (common:get-last-run-version)) 0 4))
- )
- (and (not (equal? megatest-major-version "1.80"))
- (not (equal? megatest-major-version megatest-run-version)))
- )
-)
+ (let* ((megatest-major-version (substring (->string megatest-version) 0 4))
+ (run-major-version (substring (conc (common:get-last-run-version)) 0 4)))
+ (and (not (member megatest-major-version '("1.81" "1.80")))
+ (not (equal? megatest-major-version run-major-version)))))
;;======================================================================
;; Move me elsewhere ...
;; RADT => Why do we meed the version check here, this is called only if version misma
;;
@@ -2754,31 +2755,39 @@
;; arm cubie01 cubie02
;; x86_64 zeus xena myth01
;; allhosts #{g hosts arm} #{g hosts x86_64}
;;
;; [host-types]
+;; C/M/A lets megatest know this launcher provides C cores, M bytes memory for architecture A
+;; 2/2G/arm smart -cores 2 -memory 2G -arch arm
;; general #MTLOWESTLOAD #{g hosts allhosts}
;; arm #MTLOWESTLOAD #{g hosts arm}
;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
+;;
+;; NOTE: host-rules is ONLY used for MTLOWESTLOAD
;;
;; [host-rules]
;; # maxnload => max normalized load
;; # maxnjobs => max jobs per cpu
;; # maxjobrate => max jobs per second
;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1
;;
;; [launchers]
;; envsetup general
-;; xor/%/n 4C16G
+;; xor/%/n 2/2G/arm
;; % nbgeneral
;;
;; [jobtools]
;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match.
;; flexi-launcher yes
;; launcher nbfake
+;; mode adjutant|normal (default is normal)
+;;
+;;
+;; mode is 'normal (i.e. directly use launcher) or 'adjutant (i.e. use adjutant)
;;
-(define (common:get-launcher configdat testname itempath)
+(define (common:get-launcher configdat testname itempath mode)
(let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher")))
(if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher
(not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no")))
(let* ((launchers (hash-table-ref/default configdat "launchers" '())))
(if (null? launchers)
@@ -2785,32 +2794,36 @@
fallback-launcher
(let loop ((hed (car launchers))
(tal (cdr launchers)))
(let ((patt (car hed))
(host-type (cadr hed)))
- (if (tests:match patt testname itempath)
+ (if (tests:match patt testname itempath) ;; have a launcher match for this test
(begin
(debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type)
- (let ((launcher (configf:lookup configdat "host-types" host-type)))
+ (let ((launcher (configf:lookup configdat "host-types" host-type))) ;; find the actual launcher from the host-types table
+ ;; if we are in adjutant mode then we want to return both host-type and launcher
(if launcher
(let* ((launcher-parts (string-split launcher))
(launcher-exe (car launcher-parts)))
(if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline
(let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat))
(count 100))
(if targ-host
(conc "remrun " targ-host)
(if (> count 0)
+
(begin
(debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type)
(thread-sleep! (- 101 count))
(host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)
(- count 1)))
(begin
(debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type)
(exit)))))
- launcher))
+ (case mode
+ ((adjutant) (list host-type launcher))
+ (else launcher))))
(begin
(debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type)
(if (null? tal)
fallback-launcher
(loop (car tal)(cdr tal)))))))
Index: dashboard-tests.scm
==================================================================
--- dashboard-tests.scm
+++ dashboard-tests.scm
@@ -95,11 +95,12 @@
"Current state: "
"Current status: "
"Test comment: "
"Test id: "
"Test date: "))
- (list (iup:label "" #:expand "VERTICAL"))))
+ (list (iup:label "" #:expand "VERTICAL"
+ ))))
(apply iup:vbox ; #:expand "YES"
(list
(store-label "testname"
(iup:label (db:test-get-testname testdat) #:expand "HORIZONTAL")
(lambda (testdat)(db:test-get-testname testdat)))
@@ -163,11 +164,12 @@
(list "Author: "
"Owner: "
"Reviewed: "
"Tags: "
"Description: "))
- (list (iup:label "" #:expand "VERTICAL"))))
+ (list (iup:label "" #:expand "VERTICAL"
+ ))))
(apply iup:vbox ; #:expand "YES"
(list
(store-meta "author"
(iup:label (db:testmeta-get-author testmeta) #:expand "HORIZONTAL")
(lambda (testmeta)(db:testmeta-get-author testmeta)))
@@ -179,11 +181,12 @@
(lambda (testmeta)(db:testmeta-get-reviewed testmeta)))
(store-meta "tags"
(iup:label (db:testmeta-get-tags testmeta) #:expand "HORIZONTAL")
(lambda (testmeta)(db:testmeta-get-tags testmeta)))
(store-meta "description"
- (iup:label (test-meta-panel-get-description testmeta) #:size "x50"); #:expand "HORIZONTAL")
+ (iup:label (test-meta-panel-get-description testmeta) ;; #:wordwrap "YES" ;; #:size "x50"
+ ) ;; #:expand "HORIZONTAL")
(lambda (testmeta)
(test-meta-panel-get-description testmeta)))
)))))
@@ -207,16 +210,21 @@
(list (iup:label "runname ")
(iup:label "run-id")
(iup:label "run-date"))))
(apply iup:vbox
(append (map (lambda (keyval)
- (iup:label (cadr keyval) #:expand "HORIZONTAL"))
+ (iup:vbox
+ (iup:label (cadr keyval) #:expand "HORIZONTAL")
+ ;; (iup:label "" #:expand "BOTH")
+ )
+ )
keydat)
(list (iup:label runname)
(iup:label (conc run-id))
(iup:label (seconds->year-work-week/day-time event_time))
- (iup:label "" #:expand "VERTICAL"))))))))
+ (iup:label "" ;;#:expand "VERTICAL"
+ ))))))))
;;======================================================================
;; Host info panel
;;======================================================================
(define (host-info-panel testdat store-label)
@@ -232,11 +240,12 @@
"CPU Load: "
"Run duration: "
"Logfile: "
"Top process id: "
"Uname -a: "))
- (iup:label "" #:expand "VERTICAL")))
+ (iup:label "" ;; #:expand "VERTICAL"
+ )))
(apply iup:vbox ; #:expand "YES"
(list
;; NOTE: Yes, the host can change!
(store-label "HostName"
(iup:label ;; (sdb:qry 'getstr
@@ -268,18 +277,20 @@
;;
(define (submegatest-panel dbstruct keydat testdat runname testconfig)
(let* ((test-run-dir (db:test-get-rundir testdat))
(subarea (subrun:get-runarea test-run-dir))
(area-exists (and subarea (common:file-exists? subarea silent: #t))))
- (if subarea
- (iup:frame
- #:title "Megatest Run Info" ; #:expand "YES"
+ (iup:frame
+ #:title "Megatest Run Info" ;; #:expand "HORIZONTAL"
+ (if subarea
(iup:button
"Launch Dashboard"
#:action (lambda (obj)
- (subrun:launch-dashboard test-run-dir))))
- (iup:vbox))))
+ (subrun:launch-dashboard test-run-dir)))
+ (iup:vbox
+ (iup:label "Not a subrun..." #:expand "HORIZONTAL")
+ )))))
;; use a global for setting the buttons colors
;; state status teststeps
(define *state-status* (vector #f #f #f))
(define (update-state-status-buttons testdat)
@@ -671,10 +682,19 @@
(conc "megatest -target " keystring " -runname " runname
" -run -testpatt " (conc testname "/" (if (equal? item-path "")
"%"
item-path))
" -clean-cache"
+ ))))
+ (rerun-clean (lambda (x)
+ (iup:attribute-set!
+ command-text-box "VALUE"
+ (conc "megatest -target " keystring " -runname " runname
+ " -rerun-clean -testpatt " (conc testname "/" (if (equal? item-path "")
+ "%"
+ item-path))
+ " -clean-cache"
))))
(remove-test (lambda (x)
(iup:attribute-set!
command-text-box "VALUE"
(conc "megatest -remove-runs -target " keystring " -runname " runname
@@ -719,31 +739,45 @@
(else
;; (test-set-status! db run-id test-name state status itemdat)
(set! self ;
(iup:dialog #:close_cb (lambda (a)(exit)) ; #:expand "YES"
#:title testfullname
- (iup:vbox ; #:expand "YES"
- ;; The run and test info
- (iup:hbox ; #:expand "YES"
- (run-info-panel dbstruct keydat testdat runname)
- (test-info-panel testdat store-label widgets)
- (test-meta-panel testmeta store-meta))
+ (iup:vbox
(iup:hbox
- (host-info-panel testdat store-label)
- (submegatest-panel dbstruct keydat testdat runname testconfig))
+ (iup:vbox ; #:expand "YES"
+ ;; The run and test info
+ (iup:hbox ; #:expand "YES"
+ (run-info-panel dbstruct keydat testdat runname)
+ (test-info-panel testdat store-label widgets))
+ (host-info-panel testdat store-label))
+ (iup:vbox
+ (test-meta-panel testmeta store-meta)
+ (submegatest-panel dbstruct keydat testdat runname testconfig)))
;; The controls
- (iup:frame #:title "Actions"
+ (iup:hbox ;; frame #:title "Actions"
(iup:vbox
- (iup:hbox
- (iup:button "View Log" #:action viewlog #:size "80x")
- (iup:button "Start Xterm" #:action xterm #:size "80x")
- (iup:button "Run Test" #:action run-test #:size "80x")
- (iup:button "Clean Test" #:action remove-test #:size "80x")
- (iup:button "CleanRunExecute!" #:action clean-run-execute #:size "80x")
- (iup:button "Kill All Jobs" #:action kill-jobs #:size "80x")
- (iup:button "Archive Test" #:action archive-test #:size "80x")
- (iup:button "Close" #:action (lambda (x)(exit)) #:size "80x"))
+ (iup:hbox
+ (iup:frame
+ #:title "Immediate"
+ (iup:hbox
+ (iup:button "Start Xterm" #:action xterm #:size "80x")
+ (iup:button "CleanRunExecute!" #:action clean-run-execute #:size "80x")
+ (iup:button "View Log" #:action viewlog #:size "80x")))
+ (iup:frame
+ #:title "Command line"
+ (iup:hbox
+ (iup:button "Run Test" #:action run-test #:size "80x")
+ (iup:button "Rerun-clean" #:action rerun-clean #:size "80x")
+ (iup:button "Clean Test" #:action remove-test #:size "80x")
+ (iup:button "Kill All Jobs" #:action kill-jobs #:size "80x")))
+ (iup:label "" #:expand "HORIZONTAL")
+ (iup:frame
+ #:title "Other"
+ (iup:hbox
+ ;; (iup:button "Archive Test" #:action archive-test #:size "80x")
+ (iup:button "Close" #:action (lambda (x)(exit)) #:size "80x")
+ )))
(apply
iup:hbox
(list command-text-box command-launch-button))))
(set-fields-panel dbstruct run-id test-id testdat)
(let ((tabs
Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -168,10 +168,12 @@
;; updaters: (make-hash-table)
;; updating: #f
;; hide-not-hide-tabs: #f
;; target: ""
;; ))
+
+(set! *journal-stats-enable* #f)
;;======================================================================
;; buttons color using image
;;======================================================================
Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1295,17 +1295,98 @@
dead-runs))
;;======================================================================
;; no-sync.db - small bits of data to be shared between servers
;;======================================================================
+
+;; if we are not a server create a db handle. this is not finalized
+;; so watch for problems. I'm still not clear if it is needed to manually
+;; finalize sqlite3 dbs with the sqlite3 egg.
+;;
+(define (db:no-sync-db db-in)
+ (mutex-lock! *db-access-mutex*)
+ (let ((res (if db-in
+ db-in
+ (let ((db (db:open-no-sync-db)))
+ (set! *no-sync-db* db)
+ db))))
+ (mutex-unlock! *db-access-mutex*)
+ res))
+
(define (db:get-dbsync-path)
(case (rmt:transport-mode)
((http)(common:make-tmpdir-name *toppath* ""))
((tcp) (conc *toppath*"/.mtdb"))
((nfs) (conc *toppath*"/.mtdb"))
(else "/tmp/dunno-this-gonna-exist")))
+
+(define (db:no-sync-add-job db-in host-type vars-list exekey cmdline)
+ (sqlite3:execute (db:no-sync-db db-in) "INSERT INTO jobs_queue (host_type,vars,exekey,cmdline,state,event_time,last_update) VALUES (?,?,?,?,?,?,?);"
+ host-type
+ (with-output-to-string
+ (lambda ()
+ (write vars-list)))
+ exekey cmdline "waiting" (current-seconds)(current-seconds)))
+
+;; find next job (waiting longest) that matches host-type - future, we'll find jobs that fit if no exact match
+(define (db:no-sync-take-job db-in host-type)
+ (let* ((db (db:no-sync-db db-in))
+ (stmt1 "SELECT id,host_type,vars,exekey,cmdline,state,event_time,last_update FROM jobs_queue WHERE host_type=? AND state != 'taken' ORDER BY event_time ASC;")
+ (stmt1h (sqlite3:prepare db stmt1))
+ (stmt2 "UPDATE jobs_queue SET state='taken',last_update=? WHERE id=?;")
+ (stmt2h (sqlite3:prepare db stmt2))
+ (res (sqlite3:with-transaction
+ db
+ (lambda ()
+ (let* ((matching-jobs (sqlite3:fold-row
+ (lambda (res . row) ;; id host-type vars exekey state event-time last-update)
+ (cons row res))
+ '()
+ stmt1h
+ host-type)))
+ (if (null? matching-jobs)
+ #f
+ (let ((choosen-one (let loop ((tal matching-jobs)
+ (res #f)) ;; put bestest one in here
+ (if (null? tal)
+ res
+ (let ((curr (car tal))
+ (rem (cdr tal)))
+ curr) ;; here we will compare with res, if better candidate the loop with curr else loop with res
+ ))))
+ (if choosen-one ;; we need to mark it as taken
+ (sqlite3:execute stmt2h (current-seconds) (car choosen-one)))
+ choosen-one)))))))
+ (sqlite3:finalize! stmt1h) ;; it'd be nice to cache these and finalize on exit.
+ (sqlite3:finalize! stmt2h)
+ res))
+
+;; clean out old jobs in queue, i.e. taken and event_time > 24 hrs ago
+;;
+(define (db:no-sync-job-records-clean db)
+ (sqlite3:execute (db:no-sync-db db) "DELETE FROM jobs_queue WHERE state='taken' AND event_time < ?;" (- (current-seconds)(* 24 3600))))
+
+
+(define (db:no-sync-get/default db-in var default)
+ (let ((db (db:no-sync-db db-in))
+ (res default))
+ (sqlite3:for-each-row
+ (lambda (val)
+ (set! res val))
+ (db:no-sync-db db)
+ "SELECT val FROM no_sync_metadat WHERE var=?;"
+ var)
+ (if res
+ (let ((newres (if (string? res)
+ (string->number res)
+ #f)))
+ (if newres
+ newres
+ res))
+ res)))
+
;; This is needed for api.scm
(define (db:open-no-sync-db)
(dbfile:open-no-sync-db (db:get-dbsync-path)))
Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -550,10 +550,26 @@
;; on file existance.
(for-each
(lambda (stmt)
(sqlite3:execute db stmt))
(list
+ "CREATE TABLE IF NOT EXISTS jobs_queue
+ (id INTEGER PRIMARY KEY,
+ host_type TEXT,
+ cores INTEGER,
+ memory TEXT,
+ vars TEXT,
+ exekey TEXT,
+ cmdline TEXT,
+ state TEXT,
+ event_time INTEGER,
+ last_update INTEGER);"
+ "CREATE TABLE IF NOT EXISTS test_extra_data
+ (id INTEGER PRIMARY KEY,
+ run_id INTEGER,
+ test_id INTEGER,
+ last_seen_running INTEGER);"
"CREATE TABLE IF NOT EXISTS no_sync_metadat
(var TEXT,
val TEXT,
CONSTRAINT no_sync_metadat_constraint UNIQUE (var));"
"CREATE TABLE IF NOT EXISTS no_sync_locks
Index: docs/Makefile
==================================================================
--- docs/Makefile
+++ docs/Makefile
@@ -33,5 +33,8 @@
megatest.pdf : megatest.lyx
lyx -e pdf2 megatest.lyx
pkts.pdf : pkts.dot
dot -Tpdf pkts.dot -o pkts.pdf
+
+stepwise.pdf : stepwise-rpc-via-direct-and-tcp-or-http.dot
+ dot stepwise-rpc-via-direct-and-tcp-or-http.dot -Tpdf -o stepwise.pdf
ADDED docs/csirc
Index: docs/csirc
==================================================================
--- /dev/null
+++ docs/csirc
@@ -0,0 +1,33 @@
+(cond-expand
+ (chicken-4
+ ;; chicken 4 stuff here
+ (use readline)
+ (current-input-port (make-readline-port))
+ (install-history-file #f "/.csi.history")
+ )
+ (chicken-5
+ (import (chicken load))
+ (import (chicken format))
+ (import (chicken process-context))
+ (import (chicken process signal))
+ (load-verbose #f)
+ (let ()
+ (unless (get-environment-variable "INSIDE_EMACS")
+ (import breadline)
+ (import breadline-scheme-completion)
+ (history-file (format "~a/.csi_history" (get-environment-variable "HOME")))
+ (stifle-history! 10000)
+ (completer-word-break-characters-set! "\"\'`;|(")
+ (completer-set! scheme-completer)
+ (basic-quote-characters-set! "\"|")
+ (variable-bind! "blink-matching-paren" "on")
+ (paren-blink-timeout-set! 200000)
+ (let ((handler (signal-handler signal/int)))
+ (set-signal-handler! signal/int
+ (lambda (s)
+ (cleanup-after-signal!)
+ (reset-after-signal!)
+ (handler s))))
+ (on-exit reset-terminal!)
+ (current-input-port (make-readline-port))))
+ ))
ADDED docs/stepwise-rpc-via-direct-and-tcp-or-http.dot
Index: docs/stepwise-rpc-via-direct-and-tcp-or-http.dot
==================================================================
--- /dev/null
+++ docs/stepwise-rpc-via-direct-and-tcp-or-http.dot
@@ -0,0 +1,98 @@
+// Copyright 2006-2017, Matthew Welland.
+//
+// This file is part of Megatest.
+//
+// Megatest is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Megatest is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Megatest. If not, see .
+//
+
+digraph megatest_state_status {
+ ranksep=0.05;
+ // rankdir=LR
+
+ node [shape=box,style=filled];
+
+ "START" -> "have_server";
+ "DONE" [label="ALL DONE"];
+ "send_request" -> "receive_response";
+ "receive_response" -> "DONE";
+ "have_server" -> "start_http_server" [label="no"];
+
+ subgraph cluster_start_server {
+ label="Start Server";
+ "start_http_server" -> "create_servinfo_file";
+ "start_http_server" -> "enable_direct_mode";
+ "create_servinfo_file" -> "delay_60_sec";
+ "delay_60_sec" -> "set_up_tmp_cache";
+ "set_up_tmp_cache" -> "switch_mode_to_tmp";
+ }
+
+ subgraph cluster_direct_access {
+ label="Direct Access";
+
+ "direct_access" -> "touch_access_file";
+ "touch_access_file" -> "touch_host_pid_file";
+ "touch_host_pid_file" -> "count_host_pid_files";
+ "count_host_pid_files" -> "call_query_proc" [label="count < 5"];
+ "wait_for_low_count_host_pid_files" [label="sleep 1"];
+ "count_host_pid_files" -> "wait_for_low_count_host_pid_files"[label="count >= 5"];
+ "wait_for_low_count_host_pid_files" -> "count_host_pid_files";
+ "call_query_proc" -> "DONE";
+ }
+
+ "have_server" -> "ping_server" [label="yes"];
+ "have_server" -> "direct_access" [label="no" ];
+ "ping_server" -> "send_request" [label="server alive"];
+ "ping_server" -> "remove_server_file" [label="server not alive"];
+ "remove_server_file" -> "START";
+
+
+}
+// subgraph cluster_notstarted {
+// label="Not started";
+
+// "NOT_STARTED FAILS" [
+// label = "{ NOT_STARTED/FAILS |{ NO_ITEMS | FAIL_PREREQ | FAIL_TIMEOUT }}";
+// shape= "record";
+// ]
+//
+// "NOT_STARTED n/a" -> "LAUNCHED n/a" [label=" launch"];
+// "NOT_STARTED WAIT" -> "LAUNCHED n/a"
+//
+// "NOT_STARTED n/a";
+// "NOT_STARTED WAIT" [
+// label = "{NOT_STARTED WAIT|{ NO_SLOTS | WAIT_PREREQ}}";
+// shape = "record";
+// ]
+//
+// // struct3 [shape=record,label="hello\nworld |{ b |{c| d|e}| f}| g | h"];
+//
+// "NOT_STARTED n/a" -> "NOT_STARTED FAILS";
+// "NOT_STARTED n/a" -> "NOT_STARTED WAIT";
+//
+// "RUNNING" [
+// shape="record";
+// label="{RUNNING|{n/a| PASS | FAIL}}";
+// ]
+//
+// "COMPLETED" [
+// shape="record";
+// label = "{COMPLETED|{PASS | SKIP | WAIVED | FAIL | CHECK| ABORT}}";
+// ]
+//
+//
+// "RUNNING" -> "COMPLETED";
+// "RUNNING" -> "INCOMPLETE" [label="test dead for > 24hrs"];
+//
+//
+// "LAUNCHED n/a" -> "REMOTEHOSTSTART n/a" -> "RUNNING";
Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -371,12 +371,12 @@
;; extract logpro from testconfig and write them to files in test run dir
(for-each
(lambda (logprodat)
(match logprodat
((name content)
- (debug:print-info 2 *default-log-port* "Creating logpro file "(current-directory)"/"name".logpro")
- (with-output-to-file (conc name".logpro")
+ (debug:print-info 2 *default-log-port* "Creating logpro file "(current-directory)"/"name ".logpro")
+ (with-output-to-file (conc name".logpro")
(lambda ()
(print content)
;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))
)))
(else
@@ -733,16 +733,19 @@
(th1 (make-thread monitorjob "monitor job"))
(th2 (make-thread runit "run job"))
(tconfig (tests:get-testconfig test-name item-path tconfigreg #t))
(propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code"))
(propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED"))
- (test-status "not set")
+ (test-status "not set")
+ (test-state "not set")
(precmd (configf:lookup tconfig "setup" "precmd"))
(postcmd (configf:lookup tconfig "setup" "postcmd")))
;; first, if set, run the precmd
(if precmd ;; (file-exists? precmd)(file-execute-access? precmd))
- (system precmd)) ;; up to test author to put nbfake if desired.
+ (begin
+ ;; (save-environment-as-files "precmd-envt")
+ (system precmd))) ;; up to test author to put nbfake if desired.
(set! job-thread th2)
(thread-start! th1)
(thread-start! th2)
(thread-join! th2)
(debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...")
@@ -801,25 +804,30 @@
(mutex-unlock! m)
(launch:end-of-run-check run-id )
(debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area "
work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n")
-
- (set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id)))
+ (let* ((testrec (rmt:get-testinfo-state-status run-id test-id)))
+ (set! test-status (db:test-get-status testrec))
+ (set! test-state (db:test-get-state testrec)))
;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1.
- (if postcmd
- (system postcmd))
(if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list))
(begin
(debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status)
(set! *globalexitstatus* 1)
)
)
+ (if postcmd
+ (begin
+ (setenv "MT_TEST_STATE" test-state)
+ (setenv "MT_TEST_STATUS" test-status)
+ ;; (save-environment-as-files "postcmd-envt")
+ (system postcmd)))
(if (not (launch:einf-exit-status exit-info))
(exit 4))))
)))
;; Spec for End of test
@@ -1500,12 +1508,20 @@
(debug:print 1 *default-log-port* "INFO: search and mark zombie tests")
(rmt:set-var key (current-seconds))
(rmt:find-and-mark-incomplete run-id #f))))
-
+(defstruct launch:ajt
+ (vars '())
+ (exekey #f)
+ (host-type #f)
+ (test-sig #f)
+ (cmdline #f))
+;; append vars
+(define (launch:ajt-add-vars dat vars)
+ (launch:ajt-vars-set! dat (append (launch:ajt-vars dat) vars)))
;; 1. look though disks list for disk with most space
;; 2. create run dir on disk, path name is meaningful
;; 3. create link from run dir to megatest runs area
;; 4. remotely run the test on allocated host
@@ -1513,44 +1529,41 @@
;; - could be netbatch
;; (launch-test db (cadr status) test-conf))
(define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
(assert runname "FATAL: launch-test called with no runname")
(mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
- (let* ( ;; (lock-key (conc "test-" test-id))
- ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key))
- ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
- ;; (if (car lock)
- ;; #t
- ;; (if (> (current-seconds) expire-time)
- ;; (begin
- ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path)
- ;; (rmt:no-sync-del! lock-key) ;; destroy the lock
- ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;;
- ;; (begin
- ;; (thread-sleep! 1)
- ;; (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
+ (let* (;; locking code removed from here commented out and pasted at end of file
(item-path (item-list->path itemdat))
- (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
+ (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour")))
+ ;; launcher-mode will be 'adjutant or 'normal
+ (launcher-mode (string->symbol (or (configf:lookup *configdat* "jobtools" "mode") "normal")))
+ (ajtdat (make-launch:ajt)))
(let loop ((delta (- (current-seconds) *last-launch*))
(launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0)))
(if (> launch-delay delta)
(begin
;; (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
;; (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))
(thread-sleep! (- launch-delay delta))
(loop (- (current-seconds) *last-launch*) launch-delay))))
(change-directory *toppath*)
- (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars)
- (append
- (list
- (list "MT_RUN_AREA_HOME" *toppath*)
- (list "MT_TEST_NAME" test-name)
- (list "MT_RUNNAME" runname)
- (list "MT_ITEMPATH" item-path)
- (list "MT_CONTOUR" contour)
- )
- itemdat))
+ (let ((var-list (append
+ (list
+ (list "MT_RUN_AREA_HOME" *toppath*)
+ (list "MT_TEST_NAME" test-name)
+ (list "MT_RUNNAME" runname)
+ (list "MT_ITEMPATH" item-path)
+ (list "MT_CONTOUR" contour)
+ )
+ itemdat)))
+ ;; consolidate this code with the code in megatest.scm for
+ ;; "-execute", *maybe* - the longer they are set the longer
+ ;; each launch takes (must be non-overlapping with the vars)
+ (alist->env-vars var-list)
+ ;; the var-list into the ajtdat adjutant record whether it is needed or not.
+ (launch:ajt-add-vars ajtdat var-list))
+
(let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed
;; for tconfig, why do we allow fallback to test-conf?
(tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t)
(begin
(debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.")
@@ -1568,26 +1581,17 @@
;; (memory (configf:lookup tconfig "requirements" "memory"))
;; (hosts (configf:lookup *configdat* "jobtools" "workhosts")) ;; I'm pretty sure this was never completed
(remote-megatest (configf:lookup *configdat* "setup" "executable"))
(run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim")
(configf:lookup *configdat* "setup" "runtimelim")))
- ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to
- ;; allow running from dashboard. Extract the path
- ;; from the called megatest and convert dashboard
- ;; or dboard to megatest
(local-megatest (common:find-local-megatest))
- #;(local-megatest (let* ((lm (car (argv)))
- (dir (pathname-directory lm))
- (exe (pathname-strip-directory lm)))
- (conc (if dir (conc dir "/") "")
- (case (string->symbol exe)
- ((dboard) "../megatest")
- ((mtest) "../megatest")
- ((dashboard) "megatest")
- (else exe)))))
- (launcher (common:get-launcher *configdat* test-name item-path)) ;; (configf:lookup *configdat* "jobtools" "launcher"))
- (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path
+ (launcher (let ((l (common:get-launcher *configdat* test-name item-path launcher-mode)))
+ (if (string? l)
+ (string-split l)
+ l))) ;; some nonhomogenuity here. '(cmd param1 param2 ...) OR '(host-type launcher)
+ ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path
+ (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path))
(work-area #f)
(toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all
(diskpath #f)
(cmdparms #f)
(fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x))))
@@ -1597,15 +1601,24 @@
(debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '())
(if (args:get-arg "-logging")(list "-logging") '())
(if (configf:lookup *configdat* "misc" "profilesw")
(list (configf:lookup *configdat* "misc" "profilesw"))
'()))))
+ ;; save the test-sig in the ajtdat record
+ (launch:ajt-test-sig-set! ajtdat test-sig)
+ ;; go ahead and figure out if we have a host-type from the
+ ;; launcher call above and save it in the ajtdat record
+ (if (and (eq? launcher-mode 'adjutant)
+ (list? launcher)
+ (> (length launcher) 1))
+ (launch:ajt-host-type-set! ajtdat (car launcher)))
+
;; (if hosts (set! hosts (string-split hosts)))
;; set the megatest to be called on the remote host
(if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest"))
(set! mt-bindir-path (pathname-directory remote-megatest))
- (if launcher (set! launcher (string-split launcher)))
+ ;; (if launcher (set! launcher (string-split launcher))) ;; yuk!
;; set up the run work area for this test
(if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run
(not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir
(begin
(debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path)
@@ -1659,66 +1672,94 @@
(list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
(list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '()))
(list 'set-vars (if params (hash-table-ref/default params "-setvars" #f)))
(list 'runname runname)
(list 'mt-bindir-path mt-bindir-path))))))))
+ ;; save the cmdparms in the ajtdat
+ (launch:ajt-exekey-set! ajtdat cmdparms)
(setenv "MT_CMDINFO" cmdparms) ;; setting this for use in nblauncher
;; clean out step records from previous run if they exist
;; (rmt:delete-test-step-records run-id test-id)
;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway
(if (common:file-exists? work-area)
(change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir
- (cond
- ;; ((and launcher hosts) ;; must be using ssh hostname
- ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
- ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms))))
+
+ ;; save the command line for adjutant mode (might never be needed but best to assemble it here)
+ (launch:ajt-cmdline-set! ajtdat (string-intersperse
+ (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
+ (cond
(launcher
(set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
- ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms))))
(else
(if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section"))
(set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" ""))))))
- ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" "")))))
+
(if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm"))))
(debug:print 1 *default-log-port* "Launching " work-area)
;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done
(debug:print 4 *default-log-port* "fullcmd: " fullcmd)
(set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible.
- (let* ((commonprevvals (alist->env-vars
- (hash-table-ref/default *configdat* "env-override" '())))
- (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
- (append (list (list "MT_TEST_RUN_DIR" work-area)
- (list "MT_TEST_NAME" test-name)
- (list "MT_ITEM_INFO" (conc itemdat))
- (list "MT_RUNNAME" runname)
- (list "MT_TARGET" mt_target)
- (list "MT_ITEMPATH" item-path)
- )
- itemdat)))
- (testprevvals (alist->env-vars
- (hash-table-ref/default tconfig "pre-launch-env-overrides" '())))
+ (let* ((env-override-vars (hash-table-ref/default *configdat* "env-override" '()))
+ (commonprevvals (alist->env-vars env-override-vars))
+ (misc-vars (append (list (list "MT_TEST_RUN_DIR" work-area)
+ (list "MT_TEST_NAME" test-name)
+ (list "MT_ITEM_INFO" (conc itemdat))
+ (list "MT_RUNNAME" runname)
+ (list "MT_TARGET" mt_target)
+ (list "MT_ITEMPATH" item-path))
+ itemdat))
+ (miscprevvals (alist->env-vars misc-vars));; consolidate this code with the code in megatest.scm for "-execute"
+ (test-vars (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))
+ (testprevvals (alist->env-vars test-vars))
+
;; Launchwait defaults to true, must override it to turn off wait
(launchwait (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t))
- (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed.
- process:cmd-run-with-stderr-and-exitcode->list
- process-run)
- (if useshell
- (let ((cmdstr (string-intersperse fullcmd " ")))
- (if launchwait
- cmdstr
- (conc cmdstr " >> mt_launch.log 2>&1 &")))
- (car fullcmd))
- (if useshell
- '()
- (cdr fullcmd))))
+ ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed.
+ (launch-results-prev (if (eq? launcher-mode 'adjutant)
+ '(#t 0) ;; just some fake data to fool downstream but non-applicable code
+ (apply (if launchwait
+ process:cmd-run-with-stderr-and-exitcode->list
+ process-run)
+ (if useshell
+ (let ((cmdstr (string-intersperse fullcmd " ")))
+ (if launchwait
+ cmdstr
+ (conc cmdstr " >> mt_launch.log 2>&1 &")))
+ (car fullcmd))
+ (if useshell
+ '()
+ (cdr fullcmd)))))
(success (if launchwait (equal? 0 (cadr launch-results-prev)) #t))
(launch-results (if launchwait (car launch-results-prev) launch-results-prev)))
- (if (not success)
+
+ (launch:ajt-add-vars ajtdat env-override-vars)
+ (launch:ajt-add-vars ajtdat misc-vars)
+ (launch:ajt-add-vars ajtdat test-vars)
+
+ ;; if in adjutant mode we register the job in the jobs_queue
+ ;; then fire off an adjutant runner
+ ;;
+ (if (eq? launcher-mode 'adjutant)
+ (let* ((adjutant-runner-cmd (append (cdr launcher)
+ (list remote-megatest "-adjutant"
+ (launch:ajt-host-type ajtdat)
+ "-start-dir" *toppath*)))
+ (adj-cmd (conc (string-intersperse (map conc adjutant-runner-cmd) " ")
+ "&")))
+ (rmt:no-sync-add-job
+ (launch:ajt-host-type ajtdat)
+ (launch:ajt-vars ajtdat)
+ (launch:ajt-exekey ajtdat)
+ (launch:ajt-cmdline ajtdat))
+ (print "adj-cmd: " adj-cmd)
+ (system adj-cmd)
+ ))
+
+ (if (not success)
(tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED"))
- (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.
;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test
(if (not launchwait) ;; give the OS a little time to allow the process to start
(thread-sleep! 0.01))
(with-output-to-file "mt_launch.log"
(lambda ()
@@ -1740,10 +1781,14 @@
(process-signal (current-process-id) signal/kill)
))
(alist->env-vars miscprevvals)
(alist->env-vars testprevvals)
(alist->env-vars commonprevvals)
+ ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.
+ ;; the unlock previously was further up. This seemed wrong as we should not proceed until the
+ ;; vars have been reset.
+ (mutex-unlock! *launch-setup-mutex*)
launch-results))
(change-directory *toppath*)
(thread-sleep! (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.0))))
;; recover a test where the top controlling mtest may have died
@@ -1769,5 +1814,21 @@
;; now wait on that process if all is correct
;; periodically update the db with runtime
;; when the process exits look at the db, if still RUNNING after 10 seconds set
;; state/status appropriately
(process-wait pid)))
+
+
+ ;; (lock-key (conc "test-" test-id))
+ ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key))
+ ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
+ ;; (if (car lock)
+ ;; #t
+ ;; (if (> (current-seconds) expire-time)
+ ;; (begin
+ ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path)
+ ;; (rmt:no-sync-del! lock-key) ;; destroy the lock
+ ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;;
+ ;; (begin
+ ;; (thread-sleep! 1)
+ ;; (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
+
ADDED matt/buttontest.scm
Index: matt/buttontest.scm
==================================================================
--- /dev/null
+++ matt/buttontest.scm
@@ -0,0 +1,53 @@
+(use iup srfi-4)
+
+(module buttontest
+ *
+
+(import iup)
+
+(import scheme
+ srfi-4
+ (prefix iup iup:))
+
+(define (make-image name tcolor bgcolor)
+ (let* ((img-bits1 (u8vector->blob (u8vector
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ )))
+ ;; w h
+ (img1 (iup:image/palette 22 24 img-bits1)))
+ (iup:handle-name-set! img1 name)
+ (iup:attribute-set! img1 "1" "10 10 10")
+ (iup:attribute-set! img1 "2" bgcolor) ;; "BGCOLOR")
+ (iup:attribute-set! img1 "3" tcolor)
+ name))
+
+(iup:show
+ (iup:dialog
+ (iup:vbox
+ (iup:button " " image: (make-image "GreenBlack" "0 255 0" "0 0 0")))))
+
+(iup:main-loop)
+)
Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
;; (declare (unit megatest-version))
-(define megatest-version 1.8031)
+(define megatest-version 1.8102)
Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -53,10 +53,17 @@
(declare (uses dbfile.import))
(declare (uses dbmod))
(declare (uses dbmod.import))
(declare (uses portlogger))
(declare (uses portlogger.import))
+
+(declare (uses adjutant))
+(import adjutant)
+
+(declare (uses mttop))
+(import mttop)
+
(declare (uses tcp-transportmod))
(declare (uses tcp-transportmod.import))
(declare (uses rmtmod))
(declare (uses rmtmod.import))
@@ -83,12 +90,25 @@
(include "db_records.scm")
(include "run_records.scm")
(include "megatest-fossil-hash.scm")
(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:))
-(use readline apropos json http-client directory-utils typed-records)
-(use http-client srfi-18 extras format tcp-server tcp)
+(use apropos
+ call-with-environment-variables
+ directory-utils
+ extras
+ format
+ http-client
+ json
+ matchable
+ readline
+ srfi-18
+ tcp
+ tcp-server
+ typed-records
+
+ )
;; Added for csv stuff - will be removed
;;
(use sparse-vectors)
@@ -113,11 +133,13 @@
;;
(let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc")))
(if (common:file-exists? debugcontrolf)
(begin
;; for some reason, debug:print does not work here. Had to use print.
- (print (conc "WARNING: loading " debugcontrolf))
+ (with-output-to-port (current-error-port)
+ (lambda ()
+ (print (conc "WARNING: loading " debugcontrolf))))
(load debugcontrolf)
)
)
)
@@ -151,10 +173,11 @@
Usage: megatest [options]
-h : this help
-manual : show the Megatest user manual
-version : print megatest version (currently " megatest-version ")
+ help : help for the new Megatest interface
Launching and managing runs
-run : run all tests or as specified by -testpatt
-remove-runs : remove the data for a run, requires -runname and -testpatt
Optionally use :state and :status, use -keep-records to remove only
@@ -248,11 +271,11 @@
-update-meta : update the tests metadata for all tests
-setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are
overwritten by values set in config files.
-server -|hostname : start the server (reduces contention on megatest.db), use
- to automatically figure out hostname
- -adjutant C,M : start the server/adjutant with allocated cores C and Mem M (Gig),
+ -adjutant host-type : start the server/adjutant with given host-type
use 0,0 to auto use full machine
-transport http|rpc : use http or rpc for transport (default is http)
-log logfile : send stdout and stderr to logfile
-list-servers : list the servers
-kill-servers : kill all servers
@@ -263,11 +286,10 @@
-debug N|N,M,O... : enable debug 0-N or N and M and O ...
-debug-noprop N|M,M,O...: enable debug but do not propagate to subprocesses via MT_DEBUG
-config fname : override the megatest.config file with fname
-append-config fname : append fname to the megatest.config file
-import-sexpr fname : import a sexpr file (use -list-runs % -dumpmode sexpr to create)
- -remove-dbs all : remove Megatest DBs before importing sexpr. (Use only with -import-sexpr)
-regen-testfiles : regenerate scripts and logpro files from testconfig, run in test context
Utilities
-env2file fname : write the environment to fname.csh and fname.sh
-envcap a : save current variables labeled as context 'a' in file envdat.db
@@ -321,10 +343,14 @@
Version " megatest-version ", built from " megatest-fossil-hash ))
;; -gui : start a gui interface
;; -config fname : override the runconfigs file with fname
+
+(mttop-run (command-line-arguments)
+ '("help"))
+
;; process args
(define remargs (args:get-args
(argv)
(list "-runtests" ;; run a specific test
"-config" ;; override the config file name
@@ -989,16 +1015,47 @@
(debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.")
(exit 1)))))
(else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode))))
(set! *didsomething* #t)))
+
+(define (naylist->alist inlst)
+ (map (lambda (dat)
+ (cons (car dat)
+ (or (if (list? (cdr dat))
+ (if (null? (cdr dat)) ""
+ (cadr dat))
+ (cdr dat))
+ ""))) ;; we need a string for call-with-environment-variables
+ inlst))
+
;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to
;; a specific Megatest area. Detail are being hashed out and this may change.
;;
(if (args:get-arg "-adjutant")
- (begin
- (adjutant-run)
+ (let* ((host-type (args:get-arg "-adjutant")))
+ (launch:setup) ;; dang it, wish this wasn't needed
+ (print "Running the adjutant!")
+ (let loop ((wait-count 0))
+ (if (< wait-count 10) ;; 6 x 10 seconds = one minute
+ (let* ((dat (rmt:no-sync-take-job host-type)))
+ (match dat
+ ((id ht vars exekey cmdline state event-time last-update)
+ (let ((vars-alist (with-input-from-string vars read)
+ ))
+ (print "Vars:")
+ (pp vars-alist)
+ (call-with-environment-variables
+ (naylist->alist vars-alist)
+ (lambda ()
+ (system cmdline))))
+ (loop 0))
+ (else
+ (thread-sleep! 10)
+ (loop (+ wait-count 1)))))
+ (print "I'm bored. Exiting.")))
+ ;; (adjutant-run (args:get-arg "-ajutant") rmt:no-sync-take-job)
(set! *didsomething* #t)))
(if (args:get-arg "-list-servers")
(let* ((tl (launch:setup)) ;; need this to initialize *toppath*
(servdir (tt:get-servinfo-dir *toppath*))
@@ -2130,10 +2187,25 @@
(paths (tests:test-get-paths-matching keys target (args:get-arg "-test-files"))))
(for-each (lambda (path)
(print path))
paths))))))
+;;======================================================================
+;; Utils for test areas
+;;======================================================================
+
+(if (args:get-arg "-regen-testfiles")
+ (if (getenv "MT_TEST_RUN_DIR")
+ (begin
+ (launch:setup)
+ (change-directory (getenv "MT_TEST_RUN_DIR"))
+ (let* ((testname (getenv "MT_TEST_NAME"))
+ (itempath (getenv "MT_ITEMPATH")))
+ (launch:extract-scripts-logpro (getenv "MT_TEST_RUN_DIR") testname itempath #f))
+ (set! *didsomething* #t))
+ (debug:print 0 *default-log-port* "ERROR: Must run -regen-testfiles in a test environment (i.e. test xterm from dashboard)")))
+
;;======================================================================
;; Utils for test areas
;;======================================================================
(if (args:get-arg "-regen-testfiles")
ADDED mttop.scm
Index: mttop.scm
==================================================================
--- /dev/null
+++ mttop.scm
@@ -0,0 +1,55 @@
+;; Copyright 2006-2011, Matthew Welland.
+;;
+;; This program is made available under the GNU GPL version 2.0 or
+;; greater. See the accompanying file COPYING for details.
+;;
+;; This program is distributed WITHOUT ANY WARRANTY; without even the
+;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+;; PURPOSE.
+
+;; This is from the perl world, a hash of hashes is a super easy way to keep a handle on
+;; lots of disparate data
+;;
+
+(declare (unit mttop))
+
+(module mttop
+ *
+
+(import chicken scheme
+ ;; data-structures posix
+ srfi-1
+ ;; srfi-13
+ srfi-69
+ ports
+ extras
+ regex
+ posix
+ data-structures
+ matchable
+ )
+
+(define (str-is-cmd cmd all-cmds)
+ (let* ((rx (regexp (conc "^" cmd ".*")))
+ (mx (filter string? (map (lambda (x)
+ (let ((res (string-match rx x)))
+ (if res (car res) #f)))
+ all-cmds))))
+ (if (eq? (length mx) 1) ;; have a command
+ (car mx)
+ #f)))
+
+(define (mttop-run args all-cmds)
+ ;; any path through this call must end in exit if it is NOT an old Megatest call
+ (if (null? args)
+ #f ;; continue on and do the old Megatest stuff
+ (let ((cmd (str-is-cmd (car args) all-cmds)))
+ (if cmd
+ (begin
+ (case (string->symbol cmd)
+ ((help)(print "New help"))
+ (else (print "Command " cmd " is not implemented yet.")))
+ (exit)) ;; always exit here
+ #f)))) ;; or continue on to Megatest old stuff here
+
+)
Index: portlogger.scm
==================================================================
--- portlogger.scm
+++ portlogger.scm
@@ -15,10 +15,11 @@
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see .
;;
+(require-extension (srfi 18) extras tcp s11n)
(declare (unit portlogger))
(declare (uses debugprint))
(declare (uses dbmod))
Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -92,17 +92,30 @@
(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f))
(assert (or (not run-id) (number? run-id)) "FATAL: run-id is required to be a number or #f")
(assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
(let* ((areapath *toppath*) ;; TODO - resolve from dbstruct to be compatible with multiple areas
(readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*))
- (testsuite (common:get-testsuite-name)))
+ (testsuite (common:get-testsuite-name))
+ (dbfname (conc (dbfile:run-id->dbnum run-id)".db"))
+ (dbdir (conc areapath "/.mtdb")))
+ (if (and (not *journal-stats*)
+ (file-exists? dbdir))
+ (tt:start-stats dbdir)) ;; fixme - find the right call to get the db directory
+
+ ;; check the load on dbfname and add some delay using a droop curve of sorts
+ (if *journal-stats*
+ (let* ((load (tt:get-journal-stats dbfname)))
+ (if (> load 0.1) ;; start activating delay at 10% journal load time
+ (let ((dely (* 50 (* load load)))) ;; 100% journal time=50sec delay
+ (debug:print 0 *default-log-port* "Journal load "load" on "dbfname" delaying queries "dely"s.")
+ (thread-sleep! dely)))))
+
(case (rmt:transport-mode)
((tcp)
(let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
(attemptnum (+ 1 attemptnum))
(mtexe (common:find-local-megatest))
- (dbfname (conc (dbfile:run-id->dbnum run-id)".db"))
(ttdat (rmt:set-ttdat areapath ttdat))
(conn (tt:get-conn ttdat dbfname))
(is-main (equal? dbfname "main.db")) ;; why not (not run-id) ?
(server-start-proc (if is-main
#f
@@ -716,10 +729,19 @@
(rmt:send-receive 'no-sync-del! #f `(,var)))
(define (rmt:no-sync-get-lock keyname)
(rmt:send-receive 'no-sync-get-lock #f `(,keyname)))
+(define (rmt:no-sync-add-job host-type vars-list exekey cmdline)
+ (rmt:send-receive 'no-sync-add-job #f `(,host-type ,vars-list ,exekey ,cmdline)))
+
+(define (rmt:no-sync-take-job host-type)
+ (rmt:send-receive 'no-sync-take-job #f `(,host-type)))
+
+(define (rmt:no-sync-job-records-clean)
+ (rmt:set-receive 'no-sync-job-records-clean #f '()))
+
;; process registration
(define (rmt:register-process host port pid starttime status purpose dbname mtversion)
(rmt:send-receive 'register-process #f (list host port pid starttime status purpose dbname mtversion)))
@@ -790,5 +812,8 @@
(let* ((cfg-deadtime (configf:lookup-number *configdat* "setup" "deadtime"))
(test-stats-update-period (configf:lookup-number *configdat* "setup" "test-stats-update-period")))
(rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period)
;;call end of eud of run detection for posthook
(launch:end-of-run-check run-id)))
+
+;; orphaned from cherrypick merge
+;; (debug:print 0 *default-log-port* "Inserting " (length tests-data) " tests in run " runname)
Index: rmtmod.scm
==================================================================
--- rmtmod.scm
+++ rmtmod.scm
@@ -18,12 +18,17 @@
;;======================================================================
(declare (unit rmtmod))
(declare (uses debugprint))
+;; (declare (uses debugprint.import))
(declare (uses commonmod))
+;; (declare (uses commonmod.import))
(declare (uses dbfile)) ;; needed for records
+(declare (uses dbmod))
+;; (declare (uses tcp-transportmod))
+;; (declare (uses tcp-transportmod.import))
;; (declare (uses apimod))
;; (declare (uses apimod.import))
;; (declare (uses ulex))
@@ -33,10 +38,14 @@
*
(import scheme chicken data-structures extras matchable srfi-69)
(import (prefix sqlite3 sqlite3:) posix typed-records srfi-18)
(import commonmod dbfile debugprint) ;; (prefix commonmod cmod:))
+(import dbmod
+ ;; tcp-transportmod
+ )
+
;; (import apimod)
;; (import (prefix ulex ulex:))
(include "db_records.scm")
@@ -305,7 +314,25 @@
;; call end of eud of run detection for posthook - from merge, is it needed?
;; (launch:end-of-run-check run-id)
all-ids)
)))))
+;;======================================================================
+;; Misc
+;;======================================================================
+
+;; (define (rmtmod:wait-on-server-load run-id ttdat)
+;; (let* ((dbfname (dbmod:run-id->dbfname run-id))
+;; (get-lowest-thread-load
+;; (lambda ()
+;; (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)))
+;; (car (map tt:get-server-threads sdats))))))
+;; (if ttdat
+;; (let loop ()
+;; (if (> (get-lowest-thread-load) 5) ;; load is pretty high
+;; (begin
+;; (debug:print 0 *default-log-port* "Servers appear overloaded, waiting...")
+;; (thread-sleep! 1)
+;; (loop))))
+;; (debug:print 0 *default-log-port* "Can't wait on server load, *ttdat* not set"))))
)
Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -29,10 +29,11 @@
(declare (uses mt))
(declare (uses archive))
(declare (uses mtargs))
(declare (uses rmtmod))
(declare (uses dbfile))
+(declare (uses tcp-transportmod))
(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18)
posix-extras directory-utils pathname-expand typed-records format sxml-serializer
sxml-modifications matchable)
@@ -48,10 +49,11 @@
(import commonmod
debugprint
rmtmod
dbfile
+ tcp-transportmod
(prefix mtargs args:))
;; use this struct to facilitate refactoring
;;
@@ -811,31 +813,21 @@
(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
;; NOTE: these are all parent tests, items are not expanded yet.
(debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
(let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
(if (> (length (hash-table-keys test-records)) 0)
- (let* ((keep-going #t)
- (run-queue-retries 5)
- (run-ids (rmt:get-all-run-ids)))
- #;(for-each (lambda (run-id)
- (if keep-going
- (handle-exceptions
- exn
- (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
- (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
- run-ids)
+ (let* ()
(runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
(any->number reglen) all-tests-registry)
- (set! keep-going #f)
(if (> run-count 0) ;; handle reruns
(begin
(if (not (hash-table-ref/default flags "-preclean" #f))
(hash-table-set! flags "-preclean" #t))
(if (not (hash-table-ref/default flags "-rerun" #f))
(hash-table-set! flags "-rerun" "ABORT,STUCK/DEAD,n/a,ZERO_ITEMS"))
- (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
- (launch:end-of-run-check run-id)))
+ (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
+ (launch:end-of-run-check run-id)))
(debug:print-info 0 *default-log-port* "No tests to run")))
(debug:print-info 4 *default-log-port* "All done by here")
;; TODO: try putting post hook call here
; (debug:print-info 2 *default-log-port* " run-count " run-count)
@@ -994,11 +986,11 @@
(if (null? items-list)
(let ((test-id (rmt:get-test-id run-id test-name ""))
(num-items (rmt:test-toplevel-num-items run-id test-name)))
(if (and test-id
(not (> num-items 0)))
- (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" "Failed to run due to failed prerequisites"))))
+ (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails))))))
(tests:testqueue-set-items! test-record items-list)
(list hed tal reg reruns))
(begin
(debug:print-error 0 *default-log-port* "The proc from reading the items table did not yield a list - please report this")
(exit 1))))))
@@ -1030,14 +1022,14 @@
(if (and give-up
(not (and (null? tal)(null? reg))))
(let ((trimmed-tal (mt:discard-blocked-tests run-id hed tal test-records))
(trimmed-reg (mt:discard-blocked-tests run-id hed reg test-records)))
- (debug:print 1 *default-log-port* "WARNING: test " hed " has discarded prerequisites, removing it from the queue")
+ (debug:print 1 *default-log-port* "WARNING: test " hed " has discarded prerequisites: " prereqstrs ", removing it from the queue")
(let ((test-id (rmt:get-test-id run-id hed "")))
- (if test-id (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "PREQ_DISCARDED" "Failed to run due to discarded prerequisites")))
+ (if test-id (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "PREQ_DISCARDED" (conc "Failed to run due to discarded prerequisites: " prereqstrs))))
(if (and (null? trimmed-tal)
(null? trimmed-reg))
#f
(runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns)
@@ -1049,11 +1041,11 @@
(null? non-completed))
(debug:print-info 4 *default-log-port* "cond branch - " "ei-4")
(if (runs:can-keep-running? hed 20)
(begin
(runs:inc-cant-run-tests hed)
- (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;;
+ (debug:print-info 0 *default-log-port* "no fails in prerequisites (" (runs:pretty-string prereqs-not-met) ") for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;;
;; getting here likely means the system is way overloaded, kill a full minute before continuing
;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!!
;; No runsdat, can't do this yet
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
;;
@@ -1060,29 +1052,29 @@
(thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
;; num-retries code was here
;; we use this opportunity to move contents of reg to tal
(list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
(begin
- (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
+ (debug:print-info 1 *default-log-port* "no fails in prerequisites (" (runs:pretty-string prereqs-not-met) ") for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
(let ((test-id (rmt:get-test-id run-id hed "")))
- (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
+ (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" (conc "Prerequisites (" (runs:pretty-string prereqs-not-met) ") not seen running in a while."))))
(runs:loop-values tal reg reglen regfull reruns)
)))
((and
(or (not (null? fails))
(not (null? prereq-fails)))
(member 'normal testmode))
(debug:print-info 4 *default-log-port* "cond branch - " "ei-5")
- (debug:print-info 1 *default-log-port* "test " hed " (mode=" testmode ") has failed prerequisite(s); "
+ (debug:print-info 1 *default-log-port* "test " hed " (mode=" testmode ") has failed prerequisite(s): "
(string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ")
", removing it from to-do list")
(let ((test-id (rmt:get-test-id run-id hed "")))
(if test-id
(if (not (null? prereq-fails))
- (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites")
- (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))))
+ (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_DISCARDED" (conc "Failed to run due to prior failed prerequisites: "(runs:pretty-string prereq-fails)))
+ (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails ))))))
;; (debug:print 4 *default-log-port*"BB> set PREQ_FAIL on "hed)
;; (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))) ;; BB: this works, btu equivalent for itemwait mode does not work.
(if (or (not (null? reg))(not (null? tal)))
(begin
(hash-table-set! test-registry hed 'CANNOTRUN)
@@ -1155,17 +1147,31 @@
(keyvals (runs:dat-keyvals runsdat))
(run-info (runs:dat-run-info runsdat))
(all-tests-registry (runs:dat-all-tests-registry runsdat))
(run-limits-info (runs:dat-can-run-more-tests runsdat))
;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
- (have-resources (car run-limits-info))
+ (have-resources (and (if *journal-stats*
+ (let* ((dbfname (conc
+ (dbfile:run-id->dbnum run-id)
+ ".db"))
+ (load (tt:get-journal-stats dbfname)))
+ (if (> load 0.1) ;; dbs too busy to start more tests
+ (begin
+ (debug:print-info 0 *default-log-port* "Gating launch due to db load "load" based on journal file observations for "dbfname)
+ #f)
+ #t))
+ (begin
+ (debug:print-info 0 *default-log-port* "Journal gating not started for "run-id)
+ #t)) ;; if journal monitoring not started do not gate
+ (car run-limits-info)))
(num-running (list-ref run-limits-info 1))
(num-running-in-jobgroup(list-ref run-limits-info 2))
(max-concurrent-jobs (list-ref run-limits-info 3))
(job-group-limit (list-ref run-limits-info 4))
;; (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
+ (prereqs-running (runs:calc-prereqs-running prereqs-not-met))
(fails (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs
(runs:calc-fails prereqs-not-met)
(begin
(debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met)
'())))
@@ -1201,13 +1207,17 @@
(common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
(if maxhomehostload
(common:wait-for-homehost-load maxhomehostload
- (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))
+ (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
+
+ ;; lastly lets check the servers are not overloaded by looking at threads
+ (tt:wait-on-server-load run-id *ttdat*)
+
+ )))
-
(if (and (not (null? prereqs-not-met))
(runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))
@@ -1335,45 +1345,44 @@
(thread-sleep! 5)
(list (car newtal)(cdr newtal) reg reruns))
;; the waiton is FAIL so no point in trying to run hed ever again
(begin
(let ((my-test-id (rmt:get-test-id run-id test-name item-path)))
- (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))
+ (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails))))
(if (or (not (null? reg))(not (null? tal)))
(if (vector? hed)
(begin
(debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path
- " from the launch list as it has prerequistes that are FAIL")
+ " from the launch list as it has prerequistes that are FAIL: " (runs:pretty-string fails))
(let ((test-id (rmt:get-test-id run-id hed "")))
- (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))
+ (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails)) )))
(runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
;; (thread-sleep! *global-delta*)
;; This next is for the items
(if (not (null? fails))
- ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f)
- (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f)
- ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f)
+ (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" (runs:pretty-string fails))
(rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) )
(hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed)
(runs:loop-values tal reg reglen regfull reruns))
(let ((nth-try (hash-table-ref/default test-registry hed 0))) ;; hed not a vector...
(debug:print 2 *default-log-port* "nth-try("hed")="nth-try)
(cond
- ((member "RUNNING" (map db:test-get-state prereqs-not-met))
- (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60)
- (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet."))
+ ;;((member "RUNNING" (map db:test-get-state prereqs-not-met))
+ ((> 0 (length prereqs-running))
+ (if (runs:lownoise (conc "possible RUNNING prerequisites " hed) 60)
+ (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites: " prereqs-running ", don't give up on it yet."))
(thread-sleep! 0.1)
(runs:loop-values tal reg reglen regfull reruns))
((or (not nth-try) ;; BB: condition on subsequent tries, condition below fires on first try
(and (number? nth-try)
(< nth-try 2)))
(hash-table-set! test-registry hed (if (number? nth-try)
(+ nth-try 1)
0))
(if (runs:lownoise (conc "not removing test " hed) 60)
- (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites"))
+ (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites: "(runs:pretty-string fails)))
;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
(runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
(runs:loop-values newtal reg reglen regfull reruns))
((symbol? nth-try) ;; BB: 'done matches here in one case where prereq itemwait failed. This is first "try"
(if (eq? nth-try 'removed) ;; removed is removed - drop it NOW
@@ -1380,20 +1389,24 @@
(if (null? tal)
#f ;; yes, really
(list (car tal)(cdr tal) reg reruns))
(begin
(if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60)
- (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))
- ;; was: (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)
- (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f)
+ (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites (" (runs:pretty-string fails)") or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))
+ (let* ((test-id (rmt:get-test-id run-id hed item-path))
+ (test-info (rmt:get-testinfo-state-status run-id test-id)) ;; we need *current* info
+ (status (db:test-get-status test-info)))
+ (if (equal? status "KEEP_TRYING")
+ (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f)
+ (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)))
(hash-table-set! test-registry hed 'removed) ;; was 0
(if (not (and (null? reg) (null? tal)))
(runs:loop-values tal reg reglen regfull reruns)
#f))))
(else
(if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60)
- (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now."))
+ (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests: " (runs:pretty-string fails) " and we've tried at least 10 times to run it. Giving up now."))
;; (debug:print 0 *default-log-port* " prereqs: " prereqs-not-met)
(hash-table-set! test-registry hed 'removed)
(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f)
;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug.
(rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "FAIL" #f) ;; treat as FAIL
@@ -2003,10 +2016,22 @@
(and (equal? "NOT_STARTED" (db:test-get-state t))
(member (db:test-get-status t)
'("n/a" "KEEP_TRYING")))
(and (equal? "RUNNING" (db:test-get-state t))))) ;; account for a test that is running
prereqs-not-met))
+
+(define (runs:calc-prereqs-running prereqs-not-met)
+ (if (list? prereqs-not-met)
+ (filter
+ (lambda (t)
+ (or (not (vector? t))
+ (member (db:test-get-state t) '("RUNNING" "LAUNCHED" "REMOTE_HOST_START"))
+ ))
+ prereqs-not-met)
+ '()
+ )
+)
(define (runs:pretty-string lst)
(map (lambda (t)
(if (not (vector? t))
(conc t)
Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -393,11 +393,11 @@
(set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))
(equal? server-key new-server-key)))
#t
;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively.
(begin
- (debug:print-info 0 *default-log-port* "Gating server start, last start: "
+ (debug:print-info 2 *default-log-port* "Gating server start, last start: "
(seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
(thread-sleep! ( + 1 idletime))
(server:wait-for-server-start-last-flag areapath)))))))
Index: tcp-transportmod.scm
==================================================================
--- tcp-transportmod.scm
+++ tcp-transportmod.scm
@@ -41,10 +41,11 @@
ports
posix
files
data-structures
+ directory-utils
tcp
))
(chicken-5
(import chicken.base
chicken.condition
@@ -159,14 +160,40 @@
(define (tt:get-conn ttdat dbfname)
(hash-table-ref/default (tt-conns ttdat) dbfname #f))
;; do all the busy work of finding and setting up conn for
;; connecting to a server
+;; This function, `tt:client-connect-to-server`, is designed to manage connections between a client and a server within a testing framework.
+;; The function takes four arguments:
+;; 1. `ttdat`: a data structure that holds information about the testing environment or connections.
+;; 2. `dbfname`: The name of the database file that the client wants to connect to.
+;; 3. `run-id`: An identifier for the current run of the test suite.
+;; 4. `testsuite`: The test suite that is being run.
+;;
+;; Here's a step-by-step explanation of what the function does:
+;;
+;; 1. It first asserts that the `run-id` is valid for the given `dbfname` using the `tt:valid-run-id` function. If the `run-id` is not valid, it raises a fatal error.
+;; 2. It prints debug information indicating that the function `tt:client-connect-to-server` has been called with the given `dbfname`.
+;; 3. It attempts to retrieve an existing connection to the server from a hash table (`tt-conns`) using the `dbfname` as the key. If a connection already exists, it prints debug information and returns the existing connection.
+;; 4. If no existing connection is found, it retrieves the current server information from the servinfo file, using the `tt:get-current-server-info` function.
+;; 5. It uses pattern matching to destructure the server information into variables (`host`, `port`, `start-time`, `server-id`, `pid`, `dbfname2`, `servinffile`). It then asserts that the `dbfname` from the server info matches the one provided to the function.
+;; 6. It constructs a connection object (`conn`) with the server information.
+;; 7. It attempts to ping the server using `tt:timed-ping` to verify that the server is running and can be communicated with.
+;; 8. Depending on the result of the ping:
+;; - If the server is running (`running`), it prints debug information, saves the connection in the hash table, and returns the connection.
+;; - If the server is starting (`starting`), it sleeps for 2 seconds and then recursively calls itself to retry the connection.
+;; - If the server is neither running nor starting, it checks if it's been more than 10 seconds since the last server start attempt. If so, it attempts to start the server using `server-start-proc` and then sleeps for 1 second before retrying the connection.
+;; 9. If no server information is found (`else` case), it checks if it's been more than 3 seconds since the last server start attempt. If so, it starts a new server using `server-start-proc`, updates the last server start time, and sleeps for 4 seconds.
+;; 10. It then sleeps for 1 second and prints debug information before recursively calling itself to retry the connection.
+;;
+;; The function uses recursion to keep trying to connect to the server, with various sleep intervals to prevent overwhelming the system with connection attempts or server starts.
+;; It also uses a hash table to cache connections and avoid reconnecting to a server if a connection already exists.
+;; The function is designed to handle different server states and ensure that a server is running and available before returning a valid connection to the caller.
;;
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)
(assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
- (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
+ (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname)
(let* ((conn (tt:get-conn ttdat dbfname))
(server-start-proc (or server-start-proc
(lambda ()
(assert (equal? dbfname "main.db") ;; only main.db is started here
"FATAL: called server-start-proc for db other than main.db")
@@ -175,20 +202,31 @@
testsuite ;; (dbfile:testsuite-name)
(common:find-local-megatest)
run-id)))))
(if conn
(begin
- (debug:print-info 2 *default-log-port* "already connected to a server")
+ (debug:print-info 2 *default-log-port* "already connected to a server for " dbfname)
conn) ;; we are already connected to the server
;; no conn
+
+ ;; find server with lowest number of threads running (i.e. lowest load)
+ ;;
(let* ((sdats (tt:get-server-info-sorted ttdat dbfname))
(sdat (if (null? sdats)
#f
- (car sdats))))
- (debug:print-info 2 *default-log-port* "found sdat " sdat)
- (match sdat
+ ;; choose server with lowest threads count
+ (car (sort sdats
+ (lambda (a b)
+ (let* ((load-a (tt:get-server-threads a))
+ (load-b (tt:get-server-threads b)))
+ (< load-a load-b))))))))
+
+ ;; (let ((indx (max (random (- (length sdats) 1)) 0)))
+ ;; (list-ref sdats indx)))))
+ ;; (debug:print-info 1 *default-log-port* "found sdat " sdat" from sdats: "sdats)
+ (match sdat
((host port start-time server-id pid dbfname2 servinffile)
(assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
(debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
(let* ((host-port (conc host":"port))
(conn (make-tt-conn
@@ -248,11 +286,57 @@
;; returns ( result . ping_time )
(define (tt:timed-ping host port server-id)
(let* ((start-time (current-milliseconds))
(result (tt:ping host port server-id)))
(cons result (- (current-milliseconds) start-time))))
-
+
+;; host:port => ( meta . when-updated)
+(define *server-load* (make-hash-table))
+
+(define (tt:save-server-meta host port meta)
+ (hash-table-set! *server-load* (conc host":"port) (cons meta (current-seconds))))
+
+(define (tt:get-server-threads dat)
+ (let* ((host (car dat))
+ (port (cadr dat))
+ (dat (tt:get-server-meta host port #t)))
+ ;; (debug:print 0 *default-log-port* "host: "host" port: "port" dat: "dat)
+ (if (list? dat)
+ (or (alist-ref 'sload dat) 99998)
+ 99999))) ;; absurd number means don't use this one
+
+;; lazy get, does not auto-refresh meta, this might be a problem
+;;
+(define (tt:get-server-meta host port #!optional (do-ping #f))
+ (let* ((get-meta (lambda ()
+ (let* ((dat (hash-table-ref/default *server-load* (conc host":"port) #f)))
+ (if dat (car dat) #f))))
+ (meta (get-meta)))
+ (if (and (not meta)
+ do-ping)
+ (begin
+ (tt:timed-ping host port #f)
+ (get-meta))
+ meta)))
+
+(define (tt:wait-on-server-load run-id ttdat)
+ (if ttdat ;; if no server yet just pass on through
+ (let* ((dbfname (dbmod:run-id->dbfname run-id))
+ (get-lowest-thread-load
+ (lambda ()
+ (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)))
+ (car (map tt:get-server-threads sdats))))))
+ (if ttdat
+ (let loop ((count 0))
+ (let* ((lowestload (get-lowest-thread-load)))
+ (if (> lowestload 5) ;; load is pretty high
+ (begin
+ (debug:print 0 *default-log-port* "Servers appear overloaded with "lowestload" threads, waiting...")
+ (thread-sleep! 1)
+ (if (< count 10)
+ (loop (+ count 1)))))))
+ (debug:print 0 *default-log-port* "Can't wait on server load, *ttdat* not set")))))
(define (tt:ping host port server-id #!optional (tries-left 5))
(let* ((res (tt:send-receive-direct host port `(ping #f #f #f) ping-mode: #t)) ;; please send me your server-id
(try-again (lambda ()
(if (> tries-left 0)
@@ -263,25 +347,27 @@
;;
;; need two threads, one a 5 second timer
;;
(match res
((status errmsg result meta)
+ (tt:save-server-meta host port meta)
(if (equal? result server-id)
(let* ((server-state (alist-ref 'sstate meta)))
;; (debug:print 0 *default-log-port* "Ping to "host":"port" successful.")
(or server-state 'unk)) ;; then we are good
(begin
- (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result)
+ (if server-id
+ (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result))
#f)))
(else
;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
(try-again)))))
;; client side handler
;;
;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
-;;
+;;g
(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)
;; connect-to-server will start a server if needed.
(let* ((areapath (tt-areapath ttdat))
(conn (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname
(if conn
@@ -300,15 +386,19 @@
(case status
((busy) ;; result will be how long the server wants you to delay
(let* ((raw-dly (if (number? result) result 0.1))
(dly (+ raw-dly (/ attemptnum 10)))) ;; (* raw-dly (/ attemptnum 2))))
(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1))
+ (debug:print 0 *default-log-port* errmsg)
(thread-sleep! dly)
(tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)))
((loaded)
(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
+
+ ;; this would be a good place to force reconnection and connect to a different server
+
result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe))
(else
result)))
(else ;; did not receive properly formated result
(if (not res) ;; tt:send-receive telling us that communication failed
@@ -355,11 +445,13 @@
(begin
(thread-sleep! 1) ;; no conn yet set up, give it a rest and try again
(tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)))))
;; gets server info and appends path to server file
-;; sorts by age, oldest first
+;; sorts by age, --oldest-- now newest first
+;;
+;; move the ping here?
;;
;; returns list of (host port startseconds server-id servinfofile)
;;
(define (tt:get-server-info-sorted ttdat dbfname)
(let* ((areapath (tt-areapath ttdat))
@@ -368,11 +460,11 @@
(sorted (sort sdats (lambda (a b)
(let* ((starta (list-ref a 2))
(startb (list-ref b 2)))
(if (eq? starta startb)
(string>? (list-ref a 3)(list-ref b 3)) ;; if servers started at same time look at server-id
- (< starta startb))))))
+ (> starta startb))))))
(count 0))
(for-each
(lambda (rec)
(if (or (> (length sorted) 1)
(common:low-noise-print 120 "server info sorted"))
@@ -491,10 +583,12 @@
;; server
;;======================================================================
(define (tt:sync-dbs ttdat)
#f)
+
+(define *server-start-requests* '())
;; start the listener and start responding to requests
;;
;; NOTE: organise by dbfname, not run-id so we don't need
;; to pull in more modules
@@ -546,32 +640,36 @@
(keep-srv (and good-ping same-host)))
(if keep-srv
(loop (cdr servrs)
host
(cons servdat result))
- (begin
- ;; (debug:print-info 0 *default-log-port* "good-ping: " good-ping " same-host: " same-host "keep-srv: " keep-srv)
- (handle-exceptions
- exn
- (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile", "
- (condition->list exn))
- (delete-file* servinfofile))
- (loop (cdr servrs) prime-host result)))))
+ (let* ((modtime (file-modification-time servinfofile)))
+ ;; if the .servinfo hasn't been touched in five min
+ ;; we can be pretty sure the server is truly dead
+ (if (> (- (current-seconds) modtime) 360)
+ (handle-exceptions
+ exn
+ (debug:print-info 0 *default-log-port*
+ "Error removing server info file: "servinfofile", "
+ (condition->list exn))
+ (delete-file* servinfofile))
+ (loop (cdr servrs) prime-host result))))))
(else
- ;; can't delete it as we don't have a filename. NOTE: Should really never get here.
+ ;; can't delete it as we don't have a filename. NOTE: Should never get here.
(debug:print-info 0 *default-log-port* "ERROR: bad servinfo record \""servdat"\"")
(loop (cdr servrs) prime-host result)) ;; drop
)))))
(home-host (if (null? good-srvrs)
#f
(caar good-srvrs))))
;; by here we have a trustworthy list of servers and we have removed the .servinfo file for any unresponsive servers
;; and the list is in good-srvrs
+ ;;
(cond
((not home-host) ;; no servers yet, go ahead and start
(debug:print-info 0 *default-log-port* "No servers yet, starting on "(get-host-name)))
- ((> (length good-srvrs) 2) ;; don't need more, just exit
+ ((> (length good-srvrs) 3) ;; don't need more, just exit
(debug:print-info 0 *default-log-port* "Have "(length good-srvrs)", no need for more, exiting.")
(exit))
((not (equal? home-host (get-host-name))) ;; there is a home-host and we are not on it
(debug:print-info 0 *default-log-port* "Prime main server is on host "home-host", but we are on host "(get-host-name)", exiting.")
(exit))
@@ -595,10 +693,12 @@
(begin
(debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set! Exiting.")
(exit)))))
;; create a servinfo file start keep-running
+ ;; On WSL there seems to be a race condition where the .servinfo file
+ ;; is not created fast enough
(debug:print 0 *default-log-port* "Creating servinfo file for " dbfname)
(tt:create-server-registration-file ttdat dbfname)
(procinf-status-set! *procinf* "running")
(tt-state-set! ttdat 'running)
(dbfile:with-no-sync-db
@@ -621,10 +721,12 @@
(dbfile:insert-or-update-process nsdb *procinf*)))
(debug:print 0 *default-log-port* "Exiting now.")
(exit))))))
(define (tt:keep-running ttdat dbfname dbstruct)
+
+ (thread-sleep! 1)
;; at this point the server is running and responding to calls, we just monitor
;; for db calls and exit if there are none.
;; if I am not in the first 3 servers, exit
@@ -637,18 +739,21 @@
(my-index (list-index (lambda (x)
(equal? (list-ref x 6)
(tt-servinf-file ttdat)))
servers))
(ok (cond
+ ((not my-index)
+ (debug:print 0 *default-log-port* "WARNING: Apparently I don't exist.")
+ #f) ;; keep trying or give up?
((not *server-run*)
(debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.")
#f)
((null? servers)
(debug:print 0 *default-log-port* "WARNING: no servinfo files found, this cannot be.")
#f) ;; not ok
- ((> my-index 2)
- (debug:print 0 *default-log-port* "WARNING: there are more than two servers ahead of me, I'm not needed, exiting.")
+ ((> my-index 3)
+ (debug:print 0 *default-log-port* "WARNING: there are more than three servers ahead of me, I'm not needed, exiting.")
#f) ;; not ok to not be in first three
((eq? (tt-state ttdat) 'running) #t) ;; we are good to keep going
((> (- (current-seconds) start-time) 30)
(debug:print 0 *default-log-port* "WARNING: over 30 seconds and not yet in runnning mode. Exiting.")
#f)
@@ -669,11 +774,19 @@
(set! (file-modification-time sinfo-file) (current-seconds))
((dbr:dbstruct-sync-proc dbstruct) last-update)
(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
- (begin
+ ;; process any requests to start a new server due to load on this one
+ (let* ((requests *server-start-requests*))
+ (set! *server-start-requests* '())
+ (if (> (length requests) 0)
+ (debug:print-info 0 *default-log-port* "Processing "(length requests)" server start requests"))
+ (for-each (lambda (proc)
+ (proc)
+ (thread-sleep! 1))
+ requests)
(thread-sleep! 5)
(loop)))))
(tt:shutdown-server ttdat)
(debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running.")))
@@ -701,16 +814,26 @@
(assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname)
(tt-servinf-file-set! ttdat servinf)
(with-output-to-file servinf
(lambda ()
(print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname)))
- serv-id))
+ (let loop ((count 0))
+ (if (not (file-exists? servinf))
+ (begin
+ (debug:print 0 *default-log-port* "WARNING: file "servinf" was created but it doesn't show up on disk! We'll try again.")
+ (thread-sleep! 1)
+ (if (> count 10)
+ (debug:print 0 *default-log-port* "WARNING: file "servinf" was not created.")
+ (loop (+ count 1))))))
+ serv-id))
;; find valid server
;; get servers listed, last part of name must match :
;; if more than one, wait one second and look again
-;; future: ping oldest, if alive remove other : files
+;;
+;; NOTE: this only gets the servinfo data, no network activity here
+;; i.e. no ping etc.
;;
(define (tt:find-server areapath dbfname)
(let* ((servdir (tt:get-servinfo-dir areapath))
(sfiles (glob (conc servdir"/*:"dbfname)))
(goodfiles '()))
@@ -775,10 +898,12 @@
(define (tt:too-recent-server-start dbfname)
(let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f)))
(and last-run-time
(< (- (current-seconds) last-run-time) 5))))
+
+(define *last-server-start-request-time* 0)
;; Given an area path, start a server process ### NOTE ### > file 2>&1
;; if the target-host is set
;; try running on that host
;; incidental: rotate logs in logs/ dir.
@@ -787,26 +912,28 @@
(assert areapath "FATAL: tt:server-process-run called without areapath defined.")
(assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
(assert mtexe "FATAL: tt:server-process-run called without mtexe defined.")
;; mtest -server - -m testsuite:ext-tests -db 6.db
(let* ((dbfname (dbmod:run-id->dbfname run-id)))
- (if (tt:too-recent-server-start dbfname)
+ (if (or (< (- (current-seconds) *last-server-start-request-time*) 5) ;; attempted start less than 5 sec ago
+ (tt:too-recent-server-start dbfname))
#f
(let* ((load (get-normalized-cpu-load))
(srvrs (tt:find-server areapath dbfname))
(trying (length srvrs))
(nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
+ (set! *last-server-start-request-time* (current-seconds))
(cond
- ((> load 2.0)
- (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
- (thread-sleep! 1)
+ ((> load 10.0)
+ (debug:print 0 *default-log-port* "Normalized load " load " over 10, (load: " (commonmod:get-cpu-load) " cores: " (get-current-host-cores) " exiting...")
+ (thread-sleep! 1) ;; I'm not convinced that a delay here is helpful. -mrw-
#f)
((> nrun 100)
(debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
(thread-sleep! 1)
#f)
- ((> trying 2)
+ ((> trying 3)
(debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
(thread-sleep! 1)
#f)
(else
(if (not (file-exists? (conc areapath"/logs")))
@@ -1002,7 +1129,125 @@
(define (get-all-ips)
(map address-info-host
(filter (lambda (x)
(equal? (address-info-type x) "tcp"))
(address-infos (get-host-name)))))
+
+;;======================================================================
+;; Other Utils
+;;======================================================================
+
+(defstruct jstats
+ (count 0)
+ (jcount (make-hash-table)) ;; 1.db => journal_count
+ )
+
+;; timeblk => jstats
+(define *journal-stats* #f) ;; (make-hash-table))
+(define *journal-stats-enable* #t) ;; change to #f to turn off
+
+;; monte-carlo-esque random sampling of journal files
+;; for all the files:
+;; if .journal
+;; update stats +1 +1
+;; update stats +1 0
+;;
+(define (tt:write-load-tracking dbdir)
+ (if *journal-stats-enable*
+ (let* ((cs (current-seconds))
+ (key (inexact->exact (quotient cs 10)))
+ (old (- key 5)) ;; 4 x 10 seconds ago
+ (jstat (if (hash-table-exists? *journal-stats* key)
+ (hash-table-ref *journal-stats* key )
+ (let ((new (make-jstats)))
+ (hash-table-set! *journal-stats* key new)
+ new))))
+ ;; clear out old records
+ (for-each
+ (lambda (key)
+ (if (< key old)
+ (hash-table-delete! *journal-stats* key)))
+ (hash-table-keys *journal-stats*))
+
+ ;; increment our count of observations
+ (jstats-count-set! jstat (+ (jstats-count jstat) 1))
+
+ ;; now find and increment journal file counts
+ (directory-fold
+ (lambda (fname res)
+ ;; is it a journal file?
+ (let ((parts (string-match "^(.*\\.db)-journal.*" fname)))
+ (match parts
+ ((_ dbfname)
+ (hash-table-set! (jstats-jcount jstat) dbfname
+ (+ (hash-table-ref/default (jstats-jcount jstat) dbfname 0) 1.0)
+ ))
+ (else #f)
+ )))
+ '()
+ dbdir
+ ))))
+
+(define *journal-stats-mutex* (make-mutex))
+
+(define (tt:journal-stats-run dbdir)
+ (if (not *journal-stats*)(set! *journal-stats* (make-hash-table)))
+ (let loop ()
+ (mutex-lock! *journal-stats-mutex*)
+ (tt:write-load-tracking dbdir)
+ (mutex-unlock! *journal-stats-mutex*)
+ (thread-sleep! (/ (random 1000) 100.0))
+ (loop)))
+
+;; call this to start a thread that is keeping the journal-stats up to date.
+(define (tt:start-stats dbdir)
+
+ (thread-start!
+ (make-thread
+ (lambda ()(tt:journal-stats-run dbdir)) "Journal stats collection thread")))
+
+(define (tt:get-journal-stats #!optional (dbfname #f))
+ (let* ((result (make-jstats))
+ (hitcounts (jstats-jcount result)))
+ (if (and *journal-stats*
+ *journal-stats-enable*)
+ (begin
+ (mutex-lock! *journal-stats-mutex*)
+ (hash-table-for-each
+ *journal-stats*
+ (lambda (k v) ;; key jstats
+ (let* ((count (jstats-count v))
+ (jcount (jstats-jcount v))) ;; dbfname => hit count
+ (jstats-count-set! result
+ (+ (jstats-count result)
+ (jstats-count v)))
+ (hash-table-for-each
+ jcount
+ (lambda (dbfname hit-count)
+ (hash-table-set! hitcounts dbfname
+ (+ hit-count
+ (hash-table-ref/default hitcounts dbfname 0))))))))
+ (mutex-unlock! *journal-stats-mutex*))
+ (debug:print 0 *default-log-port* "INFO: *journal-stats* not set."))
+ ;; convert to normalized alist
+ (let* ((tot (max (jstats-count result) 1)) ;; avoid divide by zero
+ (hits (jstats-jcount result)) ;; 1.db => count
+ (res (hash-table-map
+ hits
+ (lambda (fname hitcount)
+ (cons fname (/ hitcount tot))))))
+ (if dbfname
+ (or (alist-ref dbfname res equal?) 0)
+ res))))
+
+;; megatest> (import tcp-transportmod)
+;; megatest> (tt:write-load-tracking ".mtdb")
+;; megatest> (hash-table-keys *journal-stats*)
+;; (172060297)
+;; megatest> (jstats->alist (hash-table-ref *journal-stats* 172060297))
+;; ((count . 1) (jcount . #))
+;; megatest> (jstats-jcount (hash-table-ref *journal-stats* 172060297))
+;; #
+;; megatest> (hash-table->alist (jstats-jcount (hash-table-ref *journal-stats* 172060297)))
+;; (("1.db" . 4))
)
ADDED utils/convert-db.sh
Index: utils/convert-db.sh
==================================================================
--- /dev/null
+++ utils/convert-db.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+if [ -z "megatest.config" ]; then
+ echo "The file 'megatest.config' does not exist. This must be run in a megatest area."
+ exit 1
+fi
+if [ -d ".mtdb" ]; then
+ echo "The .mtdb directory already exists. Will not do the conversion"
+ exit 1
+fi
+if [ -d ".megatest" ]; then
+ echo "Found a .megatest directory. Will convert from megatest 1.70 to 1.71/1.80 format"
+ /p/foundry/env/pkgs/megatest/1.70/16/bin/megatest -list-runs % -dumpmode sexpr > data.sexpr
+else
+ if [ -f "megatest.db" ]; then
+ echo "Found megatest.db. Will convert from megatest 1.65 to 1.71/1.80 format"
+ /p/foundry/env/pkgs/megatest/1.65/92/bin/megatest -list-runs % -dumpmode sexpr > data.sexpr
+ else
+ echo "Did not find .megatest or megatest.db. Cannot do the conversion"
+ exit 1
+ fi
+fi
+which megatest
+megatest -import-sexpr data.sexpr
+
ADDED utils/setcicd
Index: utils/setcicd
==================================================================
--- /dev/null
+++ utils/setcicd
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+branch=$(fossil branch current)
+wikiname=${branch}_cicd
+echo "ready" > $wikiname
+if fossil wiki export $wikiname;then
+ fossil wiki commit $wikiname $wikiname
+else
+ fossil wiki create $wikiname $wikiname
+fi