Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -371,11 +371,13 @@ ((add-var) (apply db:add-var dbstruct params)) ((insert-run) (apply db:insert-run dbstruct params)) ;; STEPS - ((teststep-set-status!) (apply db:teststep-set-status! dbstruct params)) + ((teststep-set-status!) + ;; (apply db:teststep-set-status! dbstruct params)) + (db:add-cached-write dbstruct db:teststep-set-status! run-id params)) ((delete-steps-for-test!) (apply db:delete-steps-for-test! dbstruct params)) ;; TEST DATA ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) ((csv->test-data) (apply db:csv->test-data dbstruct params)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -43,11 +43,11 @@ (include "common_records.scm") (define (remove-files filespec) (let ((files (glob filespec))) - (for-each delete-file files))) + (for-each delete-file* files))) (define (stop-the-train) (thread-start! (make-thread (lambda () (let loop () (if (and *toppath* Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -72,10 +72,11 @@ ;; set some parameters here - these need to be put in something that can be loaded from other ;; executables such as dashboard and mtutil ;; (include "dashboard-transport-mode.scm") + (dbfile:db-init-proc db:initialize-main-db) (set! rmtmod:send-receive rmt:send-receive) (define help (conc "Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version @@ -118,13 +119,13 @@ 0)) (if (args:get-arg "-mode") (let* ((mode (string->symbol (args:get-arg "-mode")))) (rmt:transport-mode mode)) - (rmt:transport-mode 'tcp)) +) -(if (args:get-arg "-test") ;; need to use tcp for test control panel +(if (args:get-arg "-test") ;; need to use tcp for test control panel -- Why? - Martin (rmt:transport-mode 'tcp)) ;; RA => Might require revert for filters ;; create a watch dog to move changes from lt/.db/*.db to megatest.db ;; @@ -3112,11 +3113,11 @@ (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) - (glob (conc dbdir "/*.db*")))))) + (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db"))))))) (define (dashboard:monitor-changed? commondat tabdat) (let* ((run-update-time (current-seconds)) (monitor-db-path (dboard:tabdat-monitor-db-path tabdat)) (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path)) @@ -3828,11 +3829,12 @@ (let* ((mtdb-path (conc *toppath* "/.mtdb/main.db")) (target (args:get-arg "-target")) (commondat (dboard:commondat-make))) (if target (begin - (args:remove-arg-from-ht "-target") + (hash-table-delete! args:arg-hash "-target") ;; workaround for the following commented out function + ;; (args:remove-arg-from-ht "-target") This function is in mtargs/mtargs.scm, but it's in an egg that is not in the current build of chicken 4. (dboard:commondat-target-set! commondat target) ) ) (if (not (launch:setup)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2739,12 +2739,10 @@ "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);" test-id teststep-name state-in status-in (current-seconds) (if comment comment "") (if logfile logfile ""))))) - - (define (db:delete-steps-for-test! dbstruct run-id test-id) ;; TODO: figure out why status is the key field rather than state (note: CONSTRAINT test_steps_constraint UNIQUE (test_id,stepname,state) ) (db:with-db dbstruct run-id @@ -4247,10 +4245,56 @@ (conc (current-directory) "/" outputfile))) results) ;; brutal clean up (dbfile:add-dbdat dbstruct #f dbdat) (system "rm -rf tempdir"))) + +;;====================================================================== +;; cached writes stuff +;;====================================================================== + +(define (db:add-cached-write dbstruct proc run-id params) + (debug:print 0 *default-log-port* "Adding cached write for run-id "run-id" params " params) + (mutex-lock! *cached-writes-mutex*) + (let* ((hkey (cons dbstruct run-id)) + (cached-writes-queue (hash-table-ref/default *cached-writes-queues* hkey '()))) + (hash-table-set! *cached-writes-queues* hkey (cons (list proc params) cached-writes-queue))) + (if (not *cached-writes-flag*) + (begin + (set! *cached-writes-flag* #t) + (thread-start! (make-thread + (lambda () + (debug:print 0 *default-log-port* "process cached writes thread started.") + (thread-sleep! 1) + (db:process-cached-writes-queue)))))) + (mutex-unlock! *cached-writes-mutex*)) + +(define (db:process-cached-writes-queue) + (mutex-lock! *cached-writes-mutex*) + (hash-table-for-each + *cached-writes-queues* + (lambda (hkey writes-list) + (let* ((dbstruct (car hkey)) + (run-id (cdr hkey))) + (debug:print 0 *default-log-port* "Processing "(length writes-list)" cached writes for run "run-id) + (db:with-db + dbstruct + run-id + #t + (lambda (dbdat db) + (sqlite3:with-transaction + db + (lambda () + (for-each + (lambda (queued-write) + (match queued-write + ((proc params)(apply proc dbstruct params)) + (else (assert #f "BAD queued-write")))) + writes-list))) + (hash-table-delete! *cached-writes-queues* hkey)))))) + (set! *cached-writes-flag* #f) + (mutex-unlock! *cached-writes-mutex*)) ;; (db:extract-ods-file db "outputfile.ods" '(("sysname" "%")("fsname" "%")("datapath" "%")) "%") ;;====================================================================== ;; moving watch dogs here due to dependencies Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -446,11 +446,12 @@ (if journal-mode (sqlite3:execute db (conc "PRAGMA journal_mode = "journal-mode";"))) (if (and init-proc (or force-init (not db-exists))) (init-proc db)) - db))) + db)) + expire-time: 30) (begin (if (file-exists? fname ) (let ((db (sqlite3:open-database fname))) ;; pragmas synchronous not needed because this db is used read-only ;; (sqlite3:execute db (conc "PRAGMA synchronous = "mode";") @@ -1513,11 +1514,11 @@ #f (with-input-from-file fname (lambda () (equal? key-string (read-line))))) (begin - (dbfile:print-err "dbfile:simple-file-lock created " fname " but it was gone 3 seconds later") + (dbfile:print-err "dbfile:simple-file-lock created " fname " but it was gone 0.25 seconds later") #f ) ) ) ) @@ -1579,7 +1580,17 @@ ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) (mutex-unlock! *get-cache-stmth-mutex*) result)) + +;;====================================================================== +;; cached writes - run list of procs inside transaction +;; NOTE: this only works because we have once database per process +;;====================================================================== + +(define *cached-writes-mutex* (make-mutex)) +(define *cached-writes-flag* #f) +(define *cached-writes-queues* (make-hash-table)) ;; dbstruct->list of writes + ) Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -77,11 +77,11 @@ (assert areapath "FATAL: dbmod:nfs-get-dbstruct called without areapath set.") (let* ((dbfname (dbmod:run-id->dbfname run-id)) (dbstruct (hash-table-ref/default *dbmod:nfs-db-handles* dbfname #f))) (if dbstruct dbstruct - (let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'fromdisk tmpadj: tmpadj))) + (let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'todisk))) (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct) newdbstruct)))) ;;====================================================================== ;; The cachedb one-db file per server method goes in here @@ -119,12 +119,14 @@ (loop (- count 1))) (begin (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, giving up.") (exit 1)))) (exn () - (dbfile:print-err exn "ERROR: Unknown error with database for run-id "run-id", message: " - ((condition-property-accessor 'exn 'message) exn)) + (dbfile:print-err exn "ERROR: Unknown error with db for run-id " + run-id", message: " + ((condition-property-accessor 'exn 'message) exn) + ", details: "(condition->list exn)) (exit 2)))))) (if use-mutex (mutex-unlock! *db-with-db-mutex*)) res))) (define (db:with-db dbstruct run-id w/r proc . params) @@ -520,11 +522,11 @@ ;; done (debug:print 2 *default-log-port* "Attaching "destdbfile" as auxdb") (handle-exceptions exn (begin - (debug:print 0 "ATTACH failed, exiting. exn="(condition->list exn)) + (debug:print 0 *default-log-port* "ATTACH failed, exiting. exn="(condition->list exn)) (exit 1)) (sqlite3:execute dbh (conc "ATTACH '"destdbfile"' AS auxdb;"))) (for-each (lambda (table) (let* ((dummy (debug:print 2 *default-log-port* "Doing table " table)) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -341,10 +341,55 @@ (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + +;; read testconfig and create .logpro and script files +;; - use #f for tconfigreg to re-read the testconfigs from disk +;; +(define (launch:extract-scripts-logpro test-dir test-name item-path tconfigreg-in) + (let* ((tconfigreg (or tconfigreg-in + (tests:get-all))) + (tconfig-fname (conc test-dir "/.testconfig")) + (tconfig-tmpfile (conc tconfig-fname ".tmp")) + (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) + (scripts (configf:get-section tconfig "scripts")) + (logpros (configf:get-section tconfig "logpro"))) + ;; create .testconfig file + (configf:write-alist tconfig tconfig-tmpfile) + (file-move tconfig-tmpfile tconfig-fname #t) + (delete-file* ".final-status") + + ;; extract scripts from testconfig and write them to files in test run dir + (for-each + (lambda (scriptdat) + (match scriptdat + ((name content) + (debug:print-info 2 *default-log-port* "Creating script "(current-directory)"/"name) + (with-output-to-file name + (lambda () + (print content))) + (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))) + (else + (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\"")))) + scripts) + + ;; extract logpro from testconfig and write them to files in test run dir + (for-each + (lambda (logprodat) + (match logprodat + ((name content) + (debug:print-info 2 *default-log-port* "Creating logpro file "(current-directory)"/"name) + (with-output-to-file name + (lambda () + (print content) + ;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)) + ))) + (else + (debug:print-info 0 "Invalid logpro definiton found in [logpro] section of testconfig. \"" logprodat "\"")))) + logpros))) (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) @@ -616,12 +661,11 @@ (if mt-bindir-path (setenv "PATH" (conc tmppath":"mt-bindir-path)))) ;;(bb-check-path msg: "launch:execute post block 4") ;; (change-directory top-path) ;; Can setup as client for server mode now ;; (client:setup) - - + ;; environment overrides are done *before* the remaining critical envars. (alist->env-vars env-ovrd) ;;(bb-check-path msg: "launch:execute post block 41") (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;;(bb-check-path msg: "launch:execute post block 42") @@ -647,37 +691,39 @@ (set! fullrunscript "xterm") (if (and fullrunscript (common:file-exists? fullrunscript) (not (file-execute-access? fullrunscript))) (system (conc "chmod ug+x " fullrunscript)))) - - ;; We are about to actually kick off the test - ;; so this is a good place to remove the records for - ;; any previous runs - ;; (db:test-remove-steps db run-id testname itemdat) - ;; now is also a good time to write the .testconfig file - (let* ((tconfig-fname (conc work-area "/.testconfig")) - (tconfig-tmpfile (conc tconfig-fname ".tmp")) - (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) - (scripts (configf:get-section tconfig "scripts"))) - ;; create .testconfig file - (configf:write-alist tconfig tconfig-tmpfile) - (file-move tconfig-tmpfile tconfig-fname #t) - (delete-file* ".final-status") - - ;; extract scripts from testconfig and write them to files in test run dir - (for-each - (lambda (scriptdat) - (match scriptdat - ((name content) - (with-output-to-file name - (lambda () - (print content) - (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))))) - (else - (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\"")))) - scripts)) + (launch:extract-scripts-logpro work-area test-name item-path tconfigreg) + +;;;;; ;; We are about to actually kick off the test +;;;;; ;; so this is a good place to remove the records for +;;;;; ;; any previous runs +;;;;; ;; (db:test-remove-steps db run-id testname itemdat) +;;;;; ;; now is also a good time to write the .testconfig file +;;;;; (let* ((tconfig-fname (conc work-area "/.testconfig")) +;;;;; (tconfig-tmpfile (conc tconfig-fname ".tmp")) +;;;;; (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) +;;;;; (scripts (configf:get-section tconfig "scripts")) +;;;;; (precmd (configf:lookup tconfig ) +;;;;; ;; create .testconfig file +;;;;; (configf:write-alist tconfig tconfig-tmpfile) +;;;;; (file-move tconfig-tmpfile tconfig-fname #t) +;;;;; (delete-file* ".final-status") +;;;;; +;;;;; ;; extract scripts from testconfig and write them to files in test run dir +;;;;; (for-each +;;;;; (lambda (scriptdat) +;;;;; (match scriptdat +;;;;; ((name content) +;;;;; (with-output-to-file name +;;;;; (lambda () +;;;;; (print content) +;;;;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))))) +;;;;; (else +;;;;; (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\"")))) +;;;;; scripts)) ;; (let* ((m (make-mutex)) (kill-job? #f) (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status @@ -693,12 +739,19 @@ (th1 (make-thread monitorjob "monitor job")) (th2 (make-thread runit "run job")) (tconfig (tests:get-testconfig test-name item-path tconfigreg #t)) (propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code")) (propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED")) - (test-status "not set") - ) + (test-status "not set") + (test-state "not set") + (precmd (configf:lookup tconfig "setup" "precmd")) + (postcmd (configf:lookup tconfig "setup" "postcmd"))) + ;; first, if set, run the precmd + (if precmd ;; (file-exists? precmd)(file-execute-access? precmd)) + (begin + ;; (save-environment-as-files "precmd-envt") + (system precmd))) ;; up to test author to put nbfake if desired. (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) (debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...") @@ -757,22 +810,30 @@ (mutex-unlock! m) (launch:end-of-run-check run-id ) (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") - - (set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id))) + (let* ((testrec (rmt:get-testinfo-state-status run-id test-id))) + (set! test-status (db:test-get-status testrec)) + (set! test-state (db:test-get-state testrec))) ;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1. + (if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list)) (begin (debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status) (set! *globalexitstatus* 1) ) ) + (if postcmd + (begin + (setenv "MT_TEST_STATE" test-state) + (setenv "MT_TEST_STATUS" test-status) + ;; (save-environment-as-files "postcmd-envt") + (system postcmd))) (if (not (launch:einf-exit-status exit-info)) (exit 4)))) ))) ;; Spec for End of test @@ -887,11 +948,11 @@ (let ((tmpfile (conc fulldir "/.megatest.cfg." (current-seconds))) (targfile (conc fulldir "/.megatest.cfg-" megatest-version "-" megatest-fossil-hash)) (rconfig (conc fulldir "/.runconfig." megatest-version "-" megatest-fossil-hash))) (if (common:file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached (begin - (debug:print-info 0 *default-log-port* "Caching megatest.config in " tmpfile) + (debug:print-info 2 *default-log-port* "Caching megatest.config in " tmpfile) (if (not (common:in-running-test?)) (configf:write-alist *configdat* tmpfile)) (system (conc "ln -sf " tmpfile " " targfile)))) ))) (debug:print-info 1 *default-log-port* "No linktree yet, no caching configs."))))) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.8017) +(define megatest-version 1.8032) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -111,11 +111,17 @@ ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file ;; (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc"))) (if (common:file-exists? debugcontrolf) - (load debugcontrolf))) + (begin + ;; for some reason, debug:print does not work here. Had to use print. + (print (conc "WARNING: loading " debugcontrolf)) + (load debugcontrolf) + ) + ) +) ;; usage logging, careful with this, it is not designed to deal with all real world challenges! ;; (if (and *usage-log-file* (file-write-access? *usage-log-file*)) @@ -257,11 +263,12 @@ -debug N|N,M,O... : enable debug 0-N or N and M and O ... -debug-noprop N|M,M,O...: enable debug but do not propagate to subprocesses via MT_DEBUG -config fname : override the megatest.config file with fname -append-config fname : append fname to the megatest.config file -import-sexpr fname : import a sexpr file (use -list-runs % -dumpmode sexpr to create) - + -regen-testfiles : regenerate scripts and logpro files from testconfig, run in test context + Utilities -env2file fname : write the environment to fname.csh and fname.sh -envcap a : save current variables labeled as context 'a' in file envdat.db -envdelta a-b : output enviroment delta from context a to context b to -o fname set the output mode with -dumpmode csh, bash or ini @@ -466,10 +473,11 @@ "-local" ;; run some commands using local db access "-generate-html" "-generate-html-structure" "-list-run-time" "-list-test-time" + "-regen-testfiles" ;; misc queries "-list-disks" "-list-targets" "-list-db-targets" @@ -999,14 +1007,11 @@ (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state") (for-each (lambda (dbfile) (let* ( (dbfname (conc (pathname-file dbfile) ".db")) - (sfiles (tt:find-server *toppath* dbfname)) ) - (for-each - (lambda (sfile) (let ( (sinfos (tt:get-server-info-sorted ttdat dbfname)) ) (for-each (lambda (sinfo) @@ -1028,13 +1033,10 @@ ) ) sinfos ) ) - ) - sfiles - ) ) ) dbfiles ) (set! *didsomething* #t) @@ -1079,11 +1081,11 @@ (killed (system (conc "ssh " host " kill " pid " > /dev/null"))) (dummy2 (sleep 1)) (state (if (> (system (conc "ssh " host " ps " pid " > /dev/null")) 0) "dead" "alive")) ) (format #t fmtstr db (conc host ":" port) pid age last-mod state) - (system (conc "rm " sfile)) + (delete-file* sfile) ) ) sinfos ) ) @@ -2119,10 +2121,25 @@ (paths (tests:test-get-paths-matching keys target (args:get-arg "-test-files")))) (for-each (lambda (path) (print path)) paths)))))) +;;====================================================================== +;; Utils for test areas +;;====================================================================== + +(if (args:get-arg "-regen-testfiles") + (if (getenv "MT_TEST_RUN_DIR") + (begin + (launch:setup) + (change-directory (getenv "MT_TEST_RUN_DIR")) + (let* ((testname (getenv "MT_TEST_NAME")) + (itempath (getenv "MT_ITEMPATH"))) + (launch:extract-scripts-logpro (getenv "MT_TEST_RUN_DIR") testname itempath #f)) + (set! *didsomething* #t)) + (debug:print 0 *default-log-port* "ERROR: Must run -regen-testfiles in a test environment (i.e. test xterm from dashboard)"))) + ;;====================================================================== ;; Archive tests ;;====================================================================== ;; Archive tests matching target, runname, and testpatt (if (equal? (args:get-arg "-archive") "replicate-db") Index: mtargs/mtargs.scm ================================================================== --- mtargs/mtargs.scm +++ mtargs/mtargs.scm @@ -20,10 +20,11 @@ ( arg-hash get-arg get-arg-number get-arg-from + remove-arg-from-ht get-args usage print-args any-defined? ) @@ -64,10 +65,14 @@ (define (get-arg-from ht arg . default) (if (null? default) (hash-table-ref/default ht arg #f) (hash-table-ref/default ht arg (car default)))) + +(define (remove-arg-from-ht arg) + (hash-table-delete! arg-hash arg) +) (define (get-args args params switches arg-hash num-needed) (let* ((numtargs (length args)) (adj-num-needed (if num-needed (+ num-needed 2) #f))) (if (< numtargs (if adj-num-needed adj-num-needed 2)) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -31,11 +31,12 @@ (import (prefix sqlite3 sqlite3:)) (import debugprint dbmod) ;; lsof -i (define (portlogger:open-db fname) - (let* ((avail (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away + (let* (;; (avail (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away + (avail #t) (exists (file-exists? fname)) (db (if avail (sqlite3:open-database fname) (begin (system (conc "rm -f " fname)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -72,34 +72,11 @@ ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") - - (if (not (eq? (rmt:transport-mode) 'nfs)) - (begin - (if (> attemptnum 2) - (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) - - (cond - ((> attemptnum 2) (thread-sleep! 0.05)) - ((> attemptnum 10) (thread-sleep! 0.5)) - ((> attemptnum 20) (thread-sleep! 1))) - - ;; I'm turning this off, it may make sense to move it - ;; into http-transport-handler - (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) - (begin - (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.") - (case (rmt:transport-mode) - ((http) - (server:run *toppath*) - (thread-sleep! 3)) - (else - (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server - )))))) - + ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value @@ -117,11 +94,11 @@ ((nfs) (nfs-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe)) ))) (define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) (let* ((keys (common:get-fields *configdat*)) - (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard"))) + (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath))) (api:dispatch-request dbstruct cmd run-id params))) (define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) (if (not runremote) (let* ((newremote (make-and-init-remote areapath))) Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -87,16 +87,24 @@ (let* ((runname (car run-dat)) (all-dat (cdr run-dat)) (tests-data (alist-ref "data" all-dat equal?)) (run-meta (alist-ref "meta" all-dat equal?)) (run-id (rmt:insert-run target runname run-meta))) - (for-each - (lambda (test-dat) - (let* ((test-id (car test-dat)) + (if (list? tests-data) + (begin + (debug:print 0 *default-log-port* "import-run: inserting " (length tests-data) " tests") + (for-each + (lambda (test-dat) + (let* ((test-id (car test-dat)) (test-rec (cdr test-dat))) - (rmt:insert-test run-id test-rec))) - tests-data))) + (rmt:insert-test run-id test-rec))) + tests-data) + ) + (debug:print 0 *default-log-port* "import-run: run has no tests") + ) + ) +) ;; insert run if not there, return id either way (define (rmt:insert-run target runname run-meta) ;; look for id, return if found (debug:print 0 *default-log-port* "Insert run: "target"/"runname) @@ -107,13 +115,22 @@ (rmtmod:send-receive 'insert-run #f (list target runname run-meta)) (simple-run-id (car runs))))) (define (rmt:insert-test run-id test-rec) (let* ((testname (alist-ref "testname" test-rec equal?)) - (item-path (alist-ref "item_path" test-rec equal?))) - (debug:print 0 *default-log-port* " Insert test in run "run-id": "testname"/"item-path) - (rmtmod:send-receive 'insert-test run-id test-rec))) + (item-path (alist-ref "item_path" test-rec equal?)) + (test-id (rmt:get-test-id run-id testname item-path)) + ) + (if test-id + (debug:print 0 *default-log-port* "test "testname"/"item-path " already exists in run-id " run-id) + (begin + (debug:print 0 *default-log-port* " Insert test in run "run-id": "testname"/"item-path) + (rmtmod:send-receive 'insert-test run-id test-rec) + ) + ) + ) +) ;;====================================================================== ;; T E S T S ;;====================================================================== Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -2916,11 +2916,11 @@ (fld (car key)) (val (configf:lookup test-conf "test_meta" fld))) ;; (debug:print 5 *default-log-port* "idx: " idx " fld: " fld " val: " val) (if (and val (not (equal? (vector-ref currrecord idx) val))) (begin - (debug:print 0 *default-log-port* "Updating " test-name " " fld " to " val) + (debug:print 2 *default-log-port* "Updating " test-name " " fld " to " val) (rmt:testmeta-update-field test-name fld val))))) '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10))))) ;; find tests with matching tags, tagpatt is a string "tagpatt1,tagpatt2%, ..." ;; Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -129,24 +129,50 @@ (tcp-buffer-size 2048) ;; (max-connections 4096) ;; do all the busy work of finding and setting up conn for ;; connecting to a server +;; This function, `tt:client-connect-to-server`, is designed to manage connections between a client and a server within a testing framework. +;; The function takes four arguments: +;; 1. `ttdat`: a data structure that holds information about the testing environment or connections. +;; 2. `dbfname`: The name of the database file that the client wants to connect to. +;; 3. `run-id`: An identifier for the current run of the test suite. +;; 4. `testsuite`: The test suite that is being run. +;; +;; Here's a step-by-step explanation of what the function does: +;; +;; 1. It first asserts that the `run-id` is valid for the given `dbfname` using the `tt:valid-run-id` function. If the `run-id` is not valid, it raises a fatal error. +;; 2. It prints debug information indicating that the function `tt:client-connect-to-server` has been called with the given `dbfname`. +;; 3. It attempts to retrieve an existing connection to the server from a hash table (`tt-conns`) using the `dbfname` as the key. If a connection already exists, it prints debug information and returns the existing connection. +;; 4. If no existing connection is found, it retrieves the current server information from the servinfo file, using the `tt:get-current-server-info` function. +;; 5. It uses pattern matching to destructure the server information into variables (`host`, `port`, `start-time`, `server-id`, `pid`, `dbfname2`, `servinffile`). It then asserts that the `dbfname` from the server info matches the one provided to the function. +;; 6. It constructs a connection object (`conn`) with the server information. +;; 7. It attempts to ping the server using `tt:timed-ping` to verify that the server is running and can be communicated with. +;; 8. Depending on the result of the ping: +;; - If the server is running (`running`), it prints debug information, saves the connection in the hash table, and returns the connection. +;; - If the server is starting (`starting`), it sleeps for 2 seconds and then recursively calls itself to retry the connection. +;; - If the server is neither running nor starting, it checks if it's been more than 10 seconds since the last server start attempt. If so, it attempts to start the server using `server-start-proc` and then sleeps for 1 second before retrying the connection. +;; 9. If no server information is found (`else` case), it checks if it's been more than 3 seconds since the last server start attempt. If so, it starts a new server using `server-start-proc`, updates the last server start time, and sleeps for 4 seconds. +;; 10. It then sleeps for 1 second and prints debug information before recursively calling itself to retry the connection. +;; +;; The function uses recursion to keep trying to connect to the server, with various sleep intervals to prevent overwhelming the system with connection attempts or server starts. +;; It also uses a hash table to cache connections and avoid reconnecting to a server if a connection already exists. +;; The function is designed to handle different server states and ensure that a server is running and available before returning a valid connection to the caller. ;; (define (tt:client-connect-to-server ttdat dbfname run-id testsuite) (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id) - (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id) + (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname) (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f)) (server-start-proc (lambda () (tt:server-process-run (tt-areapath ttdat) testsuite ;; (dbfile:testsuite-name) (common:find-local-megatest) run-id)))) (if conn (begin - (debug:print-info 2 *default-log-port* "already connected to a server") + (debug:print-info 2 *default-log-port* "already connected to a server for " dbfname) conn) ;; we are already connected to the server (let* ((sdat (tt:get-current-server-info ttdat dbfname))) (match sdat ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") @@ -189,11 +215,11 @@ (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin (debug:print-info 0 *default-log-port* "Starting server for "dbfname) (server-start-proc) (tt-last-serv-start-set! ttdat (current-seconds)) - (thread-sleep! 3) + (thread-sleep! 4) )) (thread-sleep! 1) (debug:print-info 0 *default-log-port* "Connect to server for " dbfname) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) @@ -232,10 +258,11 @@ ;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; (define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe) (debug:print 2 *default-log-port* "tt:handler cmd: " cmd " run-id: " run-id " attemptnum: " attemptnum) ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now. + ;; connect-to-server will start a server if needed. (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f))) (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) @@ -260,11 +287,11 @@ (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) (else ;; did not receive properly formated result - (if (not res) ;; tt:handler is telling us that communication failed + (if (not res) ;; tt:send-receive telling us that communication failed (let* ((host (tt-conn-host conn)) (port (tt-conn-port conn)) ;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db (pid (tt-conn-pid conn)) ;;(servinf (tt-conn-servinf-file conn))) @@ -293,11 +320,11 @@ ;; try again (thread-sleep! 0.25) ;; dunno, I think this needs to be here (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) )))) (begin ;; no server file, delay and try again - (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf) + (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ") (thread-sleep! 0.5) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))) (begin ;; this case is where res is malformed. Probably should abort (assert #f "FATAL: tt:handler received bad data "res) ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.") @@ -476,11 +503,12 @@ ;; is there already a server for this dbfile? Then exit. (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in) (let* ((ttdat (make-tt areapath: areapath)) (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead - (if (> (length servers) 4) + (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname) + (if (> (length servers) 0) (begin (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.") (exit)) (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) (tt-handler-set! ttdat (handler dbstruct)) @@ -535,14 +563,13 @@ (if (tt-cleanup-proc ttdat) ((tt-cleanup-proc ttdat))) (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct))) - (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) + (debug:print-info 0 *default-log-port* "keep-running: removing lock for file "dbtmpname) (db:no-sync-del! db dbfname) - #;(if dbtmpname - (delete-file dbtmpname)))))))) + )))))) (set! *server-info* ttdat) (let loop ((count 0)) (if (> count 240) (begin (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") @@ -585,10 +612,11 @@ (if (and res (common:low-noise-print 120 "top server message")) (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) res)) (else + ;; wrong servinfo file (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) (let* ((leadsrv (car servers))) (match leadsrv ((host port startseconds server-id pid dbfname servinfofile) (let* ((result (tt:timed-ping host port server-id)) @@ -713,21 +741,38 @@ ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other : files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) - (sfiles (glob (conc servdir"/*:"dbfname)))) - sfiles)) + (sfiles (glob (conc servdir"/*:"dbfname))) + (good-files '())) + (for-each + (lambda (sfile) + (let* ((sinfo (tt:server-get-info sfile)) + (host (list-ref sinfo 0)) + (port (list-ref sinfo 1)) + (server-id (list-ref sinfo 3)) + (pid (list-ref sinfo 4)) + (status (system (conc "ssh " host " ps " pid " > /dev/null"))) + ) + (if (= status 0) + (set! good-files (cons sfile good-files)) + (delete-file* sfile) + ) + ) + ) + sfiles + ) + (debug:print-info 2 *default-log-port* "tt:find-server: good-files: " good-files " sfiles: " sfiles) + good-files)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;; (define (tt:server-get-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id - (dbprep-rx (regexp "^SERVER: dbprep")) - (dbprep-found 0) (bad-dat (list #f #f #f #f #f #f logf))) (let ((fdat (handle-exceptions exn (begin ;; WARNING: this is potentially dangerous to blanket ignore the errors @@ -737,13 +782,11 @@ (if (null? fdat) ;; bad data, return bad-dat bad-dat (let loop ((inl (car fdat)) (tail (cdr fdat)) (lnum 0)) - (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl))) - (if dbprep (set! dbprep-found 1)) + (let ((mlst (string-match server-rx inl))) (if (not mlst) (if (> lnum 500) ;; give up if more than 500 lines of server log read bad-dat (if (null? tail) bad-dat @@ -884,11 +927,15 @@ (if (< port 65535) (begin (portlogger:open-run-close portlogger:set-failed port) (thread-sleep! 0.25) (setup-listener-portlogger uconn)) - #f) + (begin + (assert #t "setup-listener-portlogger: could not get a port") + #f + ) + ) (connect-listener uconn port)))) (define (connect-listener uconn port) ;; (tcp-listener-socket LISTENER)(socket-name so) ;; sockaddr-address, sockaddr-port, sockaddr->string Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -148,11 +148,11 @@ ;; return items given config ;; (define (tests:get-items tconfig) (let ((items (hash-table-ref/default tconfig "items" #f)) ;; items 4 - (itemstable (hash-table-ref/default tconfig "itemstable" #f))) + (itemstable (hash-table-ref/default tconfig "itemstable" #f))) ;; if either items or items table is a proc return it so test running ;; process can know to call items:get-items-from-config ;; if either is a list and none is a proc go ahead and call get-items ;; otherwise return #f - this is not an iterated test (cond @@ -160,10 +160,12 @@ (debug:print-info 4 *default-log-port* "items is a procedure, will calc later") items) ;; calc later ((procedure? itemstable) (debug:print-info 4 *default-log-port* "itemstable is a procedure, will calc later") itemstable) ;; calc later + ((and (not items) (not itemstable)) + #f) ((filter (lambda (x) (let ((val (car x))) (if (procedure? val) val #f))) (append (if (list? items) items '()) (if (list? itemstable) itemstable '()))) @@ -1638,11 +1640,11 @@ (if (and testexists cache-file (file-write-access? cache-path) allow-write-cache) (let ((tpath (conc cache-path "/.testconfig"))) - (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath) + (debug:print-info 2 *default-log-port* "Caching testconfig for " test-name " in " tpath) (if (and tcfg (not (common:in-running-test?))) (configf:write-alist tcfg tpath)))) tcfg)))))) ;; sort tests by priority and waiton