Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -238,16 +238,69 @@ *api-threads*))) (define (api:get-count-threads-alive) (length *api-threads*)) - -;; indat is (cmd run-id params meta) +;; ========================================================================================================================================== +;; api:tcp-dispatch-request-make-handler is a complex TCP request handler that manages server load, dispatches requests, and ensures that the +;; server's state is consistent with the incoming requests. It is designed to be used in a concurrent environment where multiple requests are +;; being handled simultaneously. +;; +;; It processes incoming requests and dispatches them accordingly. +;; The function takes a dbstruct argument, which is a structure representing the database. +;; +;; 1. The function asserts that global variable `*toppath*` is set. +;; +;; 2. It checks if `*server-signature*` is not set, and if so, it sets it using the `tt:mk-signature` function with `*toppath*` as an argument. +;; The `*server-signature*` is used to identify the server instance. +;; +;; 3. The function returns a lambda function that takes `indat` as an argument. indat is (cmd run-id params meta) This lambda is the actual +;; request handler that will be called with the incoming data. +;; +;; 4. Inside the lambda, the current thread is registered with `api:register-thread`. +;; +;; 5. Several local variables are initialized: +;; - `newcount`: A counter for the number of requests being processed. +;; - `numthreads`: The number of alive threads handling requests. +;; - `delay-wait`: A calculated delay based on the number of requests. +;; +;; 6. A `normal-proc` lambda is defined to handle the incoming command (`cmd`), `run-id`, and `params`. It uses a `case` statement to handle +;; different commands. If the command is "ping", it returns the server signature. Otherwise, it dispatches the request using +;; `api:dispatch-request`. +;; +;; 7. The function updates the `*api-process-request-count*` and `*db-last-access*` global variables. +;; +;; 8. It checks if the number of requests (`newcount`) does not match the number of threads (`numthreads`) and performs cleanup and debugging +;; if necessary. +;; +;; 9. The `match` expression is used to destructure `indat` into its components (`cmd`, `run-id`, `params`, `meta`). +;; +;; 10. Several local variables are set based on the destructured data and the current server state: +;; - `db-ok`: Checks if the database file name matches the expected one for the given `run-id`. +;; - `ttdat`: Retrieves server information. +;; - `server-state`: Gets the current state of the server. +;; - `status`: Determines the server's load status based on `newcount`. +;; - `errmsg`: Generates an error message based on the server's status. +;; - `result`: Processes the command based on the server's status. +;; +;; 11. The `meta` variable is updated with additional information based on the command. +;; +;; 12. The `payload` is constructed, which includes the status, error message, result, and meta information. +;; +;; 13. The `*api-process-request-count*` is decremented, as the request has been processed. +;; +;; 14. The current thread is unregistered with `api:unregister-thread`. +;; +;; 15. Finally, the `payload` is returned, which would be the response to the incoming request. +;; +;; Nothing should be printed within the lambda because it interacts with the current input/output ports, which could interfere with the +;; request/response flow. ;; -;; WARNING: Do not print anything in the lambda of this function as it -;; reads/writes to current in/out port +;; The `else` clause at the end of the `match` expression asserts a fatal error if `indat` cannot be deserialized, indicating that the incoming +;; data is not in the expected format. ;; + (define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params) (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.") (if (not *server-signature*) (set! *server-signature* (tt:mk-signature *toppath*))) (lambda (indat) @@ -371,11 +424,13 @@ ((add-var) (apply db:add-var dbstruct params)) ((insert-run) (apply db:insert-run dbstruct params)) ;; STEPS - ((teststep-set-status!) (apply db:teststep-set-status! dbstruct params)) + ((teststep-set-status!) + ;; (apply db:teststep-set-status! dbstruct params)) + (db:add-cached-write dbstruct db:teststep-set-status! run-id params)) ((delete-steps-for-test!) (apply db:delete-steps-for-test! dbstruct params)) ;; TEST DATA ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) ((csv->test-data) (apply db:csv->test-data dbstruct params)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -43,11 +43,11 @@ (include "common_records.scm") (define (remove-files filespec) (let ((files (glob filespec))) - (for-each delete-file files))) + (for-each delete-file* files))) (define (stop-the-train) (thread-start! (make-thread (lambda () (let loop () (if (and *toppath* @@ -302,14 +302,11 @@ ;; transport to be used ;; http - use http-transport ;; http-read-cached - use http-transport for writes but in-mem cached for reads (rmode 'http) - (hh-dat (let ((res (or (server:choose-server *toppath* 'homehost) - (cons #f #f)))) - (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res)) - res)) + (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url #f) ;; (server:check-if-running *toppath*) #f)) (server-id #f) (server-info #f) ;; (if *toppath* (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (connect-time (current-seconds)) ;; when we first connected @@ -1292,10 +1289,73 @@ ;;====================================================================== ;; logic for getting homehost. Returns (host . at-home) ;; IF *toppath* is not set, wait up to five seconds trying every two seconds ;; (this is to accomodate the watchdog) ;; +(define (common:get-homehost #!key (trynum 5)) + ;; called often especially at start up. use mutex to eliminate collisions + (mutex-lock! *homehost-mutex*) + (cond + (*home-host* + (mutex-unlock! *homehost-mutex*) + *home-host*) + ((not *toppath*) + (mutex-unlock! *homehost-mutex*) + (launch:setup) ;; safely mutexed now + (if (> trynum 0) + (begin + (thread-sleep! 2) + (common:get-homehost trynum: (- trynum 1))) + #f)) + (else + (let* ((currhost (get-host-name)) + (bestadrs (server:get-best-guess-address currhost)) + ;; first look in config, then look in file .homehost, create it if not found + (homehost (or (configf:lookup *configdat* "server" "homehost" ) + (handle-exceptions + exn + (if (> trynum 0) + (let ((delay-time (* (- 5 trynum) 5))) + (mutex-unlock! *homehost-mutex*) + (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " + delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn) + ", exn=" exn) + (thread-sleep! delay-time) + (common:get-homehost trynum: (- trynum 1))) + (begin + (mutex-unlock! *homehost-mutex*) + (debug:print 0 *default-log-port* "ERROR: ["(common:human-time) + "] Failed to read .homehost file after trying five times. Giving up and exiting, message: " + ((condition-property-accessor 'exn 'message) exn)) + (exit 1))) + (let ((hhf (conc *toppath* "/.homehost"))) + (if (common:file-exists? hhf) + (with-input-from-file hhf read-line) + (if (file-write-access? *toppath*) + (begin + (debug:print 0 *default-log-port* "No .homehost file found. Setting it to the current machine") + (with-output-to-file hhf + (lambda () + (print bestadrs))) + (begin + (mutex-unlock! *homehost-mutex*) + (car (common:get-homehost)))) + #f)))))) + (at-home (or (equal? homehost currhost) + (equal? homehost bestadrs)))) + (set! *home-host* (cons homehost at-home)) + (mutex-unlock! *homehost-mutex*) + *home-host*)))) + +;;====================================================================== +;; am I on the homehost? +;; +(define (common:on-homehost?) + (let ((hh (common:get-homehost))) + (if hh + (cdr hh) + #f))) ;;====================================================================== ;; do we honor the caches of the config files? ;; (define (common:use-cache?) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -72,10 +72,11 @@ ;; set some parameters here - these need to be put in something that can be loaded from other ;; executables such as dashboard and mtutil ;; (include "dashboard-transport-mode.scm") + (dbfile:db-init-proc db:initialize-main-db) (set! rmtmod:send-receive rmt:send-receive) (define help (conc "Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version @@ -118,13 +119,13 @@ 0)) (if (args:get-arg "-mode") (let* ((mode (string->symbol (args:get-arg "-mode")))) (rmt:transport-mode mode)) - (rmt:transport-mode 'tcp)) +) -(if (args:get-arg "-test") ;; need to use tcp for test control panel +(if (args:get-arg "-test") ;; need to use tcp for test control panel -- Why? - Martin (rmt:transport-mode 'tcp)) ;; RA => Might require revert for filters ;; create a watch dog to move changes from lt/.db/*.db to megatest.db ;; @@ -3112,11 +3113,11 @@ (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) - (glob (conc dbdir "/*.db*")))))) + (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db"))))))) (define (dashboard:monitor-changed? commondat tabdat) (let* ((run-update-time (current-seconds)) (monitor-db-path (dboard:tabdat-monitor-db-path tabdat)) (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path)) @@ -3828,11 +3829,12 @@ (let* ((mtdb-path (conc *toppath* "/.mtdb/main.db")) (target (args:get-arg "-target")) (commondat (dboard:commondat-make))) (if target (begin - (args:remove-arg-from-ht "-target") + (hash-table-delete! args:arg-hash "-target") ;; workaround for the following commented out function + ;; (args:remove-arg-from-ht "-target") This function is in mtargs/mtargs.scm, but it's in an egg that is not in the current build of chicken 4. (dboard:commondat-target-set! commondat target) ) ) (if (not (launch:setup)) @@ -3840,15 +3842,17 @@ (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") (exit 1) ) ) - #;(if (not (common:on-homehost?)) + (if (not (common:on-homehost?)) (begin - (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost)) + (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (car (common:get-homehost))) (debug:print 0 *default-log-port* "It will be slower.") - )) + ) + (debug:print 0 *default-log-port* "Dashboard started on the homehost: " (car (common:get-homehost))) + ) (if (and (common:file-exists? mtdb-path) (file-write-access? mtdb-path)) (if (not (args:get-arg "-skip-version-check")) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2739,12 +2739,10 @@ "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);" test-id teststep-name state-in status-in (current-seconds) (if comment comment "") (if logfile logfile ""))))) - - (define (db:delete-steps-for-test! dbstruct run-id test-id) ;; TODO: figure out why status is the key field rather than state (note: CONSTRAINT test_steps_constraint UNIQUE (test_id,stepname,state) ) (db:with-db dbstruct run-id @@ -4247,10 +4245,56 @@ (conc (current-directory) "/" outputfile))) results) ;; brutal clean up (dbfile:add-dbdat dbstruct #f dbdat) (system "rm -rf tempdir"))) + +;;====================================================================== +;; cached writes stuff +;;====================================================================== + +(define (db:add-cached-write dbstruct proc run-id params) + (debug:print 0 *default-log-port* "Adding cached write for run-id "run-id" params " params) + (mutex-lock! *cached-writes-mutex*) + (let* ((hkey (cons dbstruct run-id)) + (cached-writes-queue (hash-table-ref/default *cached-writes-queues* hkey '()))) + (hash-table-set! *cached-writes-queues* hkey (cons (list proc params) cached-writes-queue))) + (if (not *cached-writes-flag*) + (begin + (set! *cached-writes-flag* #t) + (thread-start! (make-thread + (lambda () + (debug:print 0 *default-log-port* "process cached writes thread started.") + (thread-sleep! 1) + (db:process-cached-writes-queue)))))) + (mutex-unlock! *cached-writes-mutex*)) + +(define (db:process-cached-writes-queue) + (mutex-lock! *cached-writes-mutex*) + (hash-table-for-each + *cached-writes-queues* + (lambda (hkey writes-list) + (let* ((dbstruct (car hkey)) + (run-id (cdr hkey))) + (debug:print 0 *default-log-port* "Processing "(length writes-list)" cached writes for run "run-id) + (db:with-db + dbstruct + run-id + #t + (lambda (dbdat db) + (sqlite3:with-transaction + db + (lambda () + (for-each + (lambda (queued-write) + (match queued-write + ((proc params)(apply proc dbstruct params)) + (else (assert #f "BAD queued-write")))) + writes-list))) + (hash-table-delete! *cached-writes-queues* hkey)))))) + (set! *cached-writes-flag* #f) + (mutex-unlock! *cached-writes-mutex*)) ;; (db:extract-ods-file db "outputfile.ods" '(("sysname" "%")("fsname" "%")("datapath" "%")) "%") ;;====================================================================== ;; moving watch dogs here due to dependencies Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -446,11 +446,12 @@ (if journal-mode (sqlite3:execute db (conc "PRAGMA journal_mode = "journal-mode";"))) (if (and init-proc (or force-init (not db-exists))) (init-proc db)) - db))) + db)) + expire-time: 30) (begin (if (file-exists? fname ) (let ((db (sqlite3:open-database fname))) ;; pragmas synchronous not needed because this db is used read-only ;; (sqlite3:execute db (conc "PRAGMA synchronous = "mode";") @@ -1513,11 +1514,11 @@ #f (with-input-from-file fname (lambda () (equal? key-string (read-line))))) (begin - (dbfile:print-err "dbfile:simple-file-lock created " fname " but it was gone 3 seconds later") + (dbfile:print-err "dbfile:simple-file-lock created " fname " but it was gone 0.25 seconds later") #f ) ) ) ) @@ -1579,7 +1580,17 @@ ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) (mutex-unlock! *get-cache-stmth-mutex*) result)) + +;;====================================================================== +;; cached writes - run list of procs inside transaction +;; NOTE: this only works because we have once database per process +;;====================================================================== + +(define *cached-writes-mutex* (make-mutex)) +(define *cached-writes-flag* #f) +(define *cached-writes-queues* (make-hash-table)) ;; dbstruct->list of writes + ) Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -77,11 +77,11 @@ (assert areapath "FATAL: dbmod:nfs-get-dbstruct called without areapath set.") (let* ((dbfname (dbmod:run-id->dbfname run-id)) (dbstruct (hash-table-ref/default *dbmod:nfs-db-handles* dbfname #f))) (if dbstruct dbstruct - (let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'fromdisk tmpadj: tmpadj))) + (let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'todisk))) (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct) newdbstruct)))) ;;====================================================================== ;; The cachedb one-db file per server method goes in here @@ -119,12 +119,14 @@ (loop (- count 1))) (begin (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, giving up.") (exit 1)))) (exn () - (dbfile:print-err exn "ERROR: Unknown error with database for run-id "run-id", message: " - ((condition-property-accessor 'exn 'message) exn)) + (dbfile:print-err exn "ERROR: Unknown error with db for run-id " + run-id", message: " + ((condition-property-accessor 'exn 'message) exn) + ", details: "(condition->list exn)) (exit 2)))))) (if use-mutex (mutex-unlock! *db-with-db-mutex*)) res))) (define (db:with-db dbstruct run-id w/r proc . params) @@ -520,11 +522,11 @@ ;; done (debug:print 2 *default-log-port* "Attaching "destdbfile" as auxdb") (handle-exceptions exn (begin - (debug:print 0 "ATTACH failed, exiting. exn="(condition->list exn)) + (debug:print 0 *default-log-port* "ATTACH failed, exiting. exn="(condition->list exn)) (exit 1)) (sqlite3:execute dbh (conc "ATTACH '"destdbfile"' AS auxdb;"))) (for-each (lambda (table) (let* ((dummy (debug:print 2 *default-log-port* "Doing table " table)) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -341,10 +341,55 @@ (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + +;; read testconfig and create .logpro and script files +;; - use #f for tconfigreg to re-read the testconfigs from disk +;; +(define (launch:extract-scripts-logpro test-dir test-name item-path tconfigreg-in) + (let* ((tconfigreg (or tconfigreg-in + (tests:get-all))) + (tconfig-fname (conc test-dir "/.testconfig")) + (tconfig-tmpfile (conc tconfig-fname ".tmp")) + (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) + (scripts (configf:get-section tconfig "scripts")) + (logpros (configf:get-section tconfig "logpro"))) + ;; create .testconfig file + (configf:write-alist tconfig tconfig-tmpfile) + (file-move tconfig-tmpfile tconfig-fname #t) + (delete-file* ".final-status") + + ;; extract scripts from testconfig and write them to files in test run dir + (for-each + (lambda (scriptdat) + (match scriptdat + ((name content) + (debug:print-info 2 *default-log-port* "Creating script "(current-directory)"/"name) + (with-output-to-file name + (lambda () + (print content))) + (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))) + (else + (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\"")))) + scripts) + + ;; extract logpro from testconfig and write them to files in test run dir + (for-each + (lambda (logprodat) + (match logprodat + ((name content) + (debug:print-info 2 *default-log-port* "Creating logpro file "(current-directory)"/"name) + (with-output-to-file name + (lambda () + (print content) + ;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)) + ))) + (else + (debug:print-info 0 "Invalid logpro definiton found in [logpro] section of testconfig. \"" logprodat "\"")))) + logpros))) (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) @@ -616,12 +661,11 @@ (if mt-bindir-path (setenv "PATH" (conc tmppath":"mt-bindir-path)))) ;;(bb-check-path msg: "launch:execute post block 4") ;; (change-directory top-path) ;; Can setup as client for server mode now ;; (client:setup) - - + ;; environment overrides are done *before* the remaining critical envars. (alist->env-vars env-ovrd) ;;(bb-check-path msg: "launch:execute post block 41") (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;;(bb-check-path msg: "launch:execute post block 42") @@ -647,37 +691,39 @@ (set! fullrunscript "xterm") (if (and fullrunscript (common:file-exists? fullrunscript) (not (file-execute-access? fullrunscript))) (system (conc "chmod ug+x " fullrunscript)))) - - ;; We are about to actually kick off the test - ;; so this is a good place to remove the records for - ;; any previous runs - ;; (db:test-remove-steps db run-id testname itemdat) - ;; now is also a good time to write the .testconfig file - (let* ((tconfig-fname (conc work-area "/.testconfig")) - (tconfig-tmpfile (conc tconfig-fname ".tmp")) - (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) - (scripts (configf:get-section tconfig "scripts"))) - ;; create .testconfig file - (configf:write-alist tconfig tconfig-tmpfile) - (file-move tconfig-tmpfile tconfig-fname #t) - (delete-file* ".final-status") - - ;; extract scripts from testconfig and write them to files in test run dir - (for-each - (lambda (scriptdat) - (match scriptdat - ((name content) - (with-output-to-file name - (lambda () - (print content) - (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))))) - (else - (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\"")))) - scripts)) + (launch:extract-scripts-logpro work-area test-name item-path tconfigreg) + +;;;;; ;; We are about to actually kick off the test +;;;;; ;; so this is a good place to remove the records for +;;;;; ;; any previous runs +;;;;; ;; (db:test-remove-steps db run-id testname itemdat) +;;;;; ;; now is also a good time to write the .testconfig file +;;;;; (let* ((tconfig-fname (conc work-area "/.testconfig")) +;;;;; (tconfig-tmpfile (conc tconfig-fname ".tmp")) +;;;;; (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) +;;;;; (scripts (configf:get-section tconfig "scripts")) +;;;;; (precmd (configf:lookup tconfig ) +;;;;; ;; create .testconfig file +;;;;; (configf:write-alist tconfig tconfig-tmpfile) +;;;;; (file-move tconfig-tmpfile tconfig-fname #t) +;;;;; (delete-file* ".final-status") +;;;;; +;;;;; ;; extract scripts from testconfig and write them to files in test run dir +;;;;; (for-each +;;;;; (lambda (scriptdat) +;;;;; (match scriptdat +;;;;; ((name content) +;;;;; (with-output-to-file name +;;;;; (lambda () +;;;;; (print content) +;;;;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))))) +;;;;; (else +;;;;; (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\"")))) +;;;;; scripts)) ;; (let* ((m (make-mutex)) (kill-job? #f) (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status @@ -693,12 +739,19 @@ (th1 (make-thread monitorjob "monitor job")) (th2 (make-thread runit "run job")) (tconfig (tests:get-testconfig test-name item-path tconfigreg #t)) (propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code")) (propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED")) - (test-status "not set") - ) + (test-status "not set") + (test-state "not set") + (precmd (configf:lookup tconfig "setup" "precmd")) + (postcmd (configf:lookup tconfig "setup" "postcmd"))) + ;; first, if set, run the precmd + (if precmd ;; (file-exists? precmd)(file-execute-access? precmd)) + (begin + ;; (save-environment-as-files "precmd-envt") + (system precmd))) ;; up to test author to put nbfake if desired. (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) (debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...") @@ -757,22 +810,30 @@ (mutex-unlock! m) (launch:end-of-run-check run-id ) (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") - - (set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id))) + (let* ((testrec (rmt:get-testinfo-state-status run-id test-id))) + (set! test-status (db:test-get-status testrec)) + (set! test-state (db:test-get-state testrec))) ;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1. + (if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list)) (begin (debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status) (set! *globalexitstatus* 1) ) ) + (if postcmd + (begin + (setenv "MT_TEST_STATE" test-state) + (setenv "MT_TEST_STATUS" test-status) + ;; (save-environment-as-files "postcmd-envt") + (system postcmd))) (if (not (launch:einf-exit-status exit-info)) (exit 4)))) ))) ;; Spec for End of test @@ -887,11 +948,11 @@ (let ((tmpfile (conc fulldir "/.megatest.cfg." (current-seconds))) (targfile (conc fulldir "/.megatest.cfg-" megatest-version "-" megatest-fossil-hash)) (rconfig (conc fulldir "/.runconfig." megatest-version "-" megatest-fossil-hash))) (if (common:file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached (begin - (debug:print-info 0 *default-log-port* "Caching megatest.config in " tmpfile) + (debug:print-info 2 *default-log-port* "Caching megatest.config in " tmpfile) (if (not (common:in-running-test?)) (configf:write-alist *configdat* tmpfile)) (system (conc "ln -sf " tmpfile " " targfile)))) ))) (debug:print-info 1 *default-log-port* "No linktree yet, no caching configs."))))) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.8017) +(define megatest-version 1.8032) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -111,11 +111,17 @@ ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file ;; (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc"))) (if (common:file-exists? debugcontrolf) - (load debugcontrolf))) + (begin + ;; for some reason, debug:print does not work here. Had to use print. + (print (conc "WARNING: loading " debugcontrolf)) + (load debugcontrolf) + ) + ) +) ;; usage logging, careful with this, it is not designed to deal with all real world challenges! ;; (if (and *usage-log-file* (file-write-access? *usage-log-file*)) @@ -257,11 +263,12 @@ -debug N|N,M,O... : enable debug 0-N or N and M and O ... -debug-noprop N|M,M,O...: enable debug but do not propagate to subprocesses via MT_DEBUG -config fname : override the megatest.config file with fname -append-config fname : append fname to the megatest.config file -import-sexpr fname : import a sexpr file (use -list-runs % -dumpmode sexpr to create) - + -regen-testfiles : regenerate scripts and logpro files from testconfig, run in test context + Utilities -env2file fname : write the environment to fname.csh and fname.sh -envcap a : save current variables labeled as context 'a' in file envdat.db -envdelta a-b : output enviroment delta from context a to context b to -o fname set the output mode with -dumpmode csh, bash or ini @@ -466,10 +473,11 @@ "-local" ;; run some commands using local db access "-generate-html" "-generate-html-structure" "-list-run-time" "-list-test-time" + "-regen-testfiles" ;; misc queries "-list-disks" "-list-targets" "-list-db-targets" @@ -965,10 +973,15 @@ (if (args:get-arg "-server") (let* (;; (run-id (args:get-arg "-run-id")) (dbfname (args:get-arg "-db")) (tl (launch:setup)) (keys (keys:config-get-fields *configdat*))) + (if (not (common:on-homehost?)) + (begin + (debug:print 0 *default-log-port* "Attempt to start a server on a machine that is not the homehost. Aborting") + (exit + ))) (case (rmt:transport-mode) ((tcp) (let* ((timeout (server:expiration-timeout))) (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout) (tt-server-timeout-param timeout) @@ -999,14 +1012,11 @@ (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state") (for-each (lambda (dbfile) (let* ( (dbfname (conc (pathname-file dbfile) ".db")) - (sfiles (tt:find-server *toppath* dbfname)) ) - (for-each - (lambda (sfile) (let ( (sinfos (tt:get-server-info-sorted ttdat dbfname)) ) (for-each (lambda (sinfo) @@ -1028,13 +1038,10 @@ ) ) sinfos ) ) - ) - sfiles - ) ) ) dbfiles ) (set! *didsomething* #t) @@ -1079,11 +1086,11 @@ (killed (system (conc "ssh " host " kill " pid " > /dev/null"))) (dummy2 (sleep 1)) (state (if (> (system (conc "ssh " host " ps " pid " > /dev/null")) 0) "dead" "alive")) ) (format #t fmtstr db (conc host ":" port) pid age last-mod state) - (system (conc "rm " sfile)) + (delete-file* sfile) ) ) sinfos ) ) @@ -1968,10 +1975,12 @@ (args:get-arg "-run") (args:get-arg "-rerun-clean") (args:get-arg "-rerun-all") (args:get-arg "-runtests") (args:get-arg "-kill-rerun")) + (begin + (common:get-homehost) ;; set the .homehost if it's not set. (let ((need-clean (or (args:get-arg "-rerun-clean") (args:get-arg "-rerun-all"))) (orig-cmdline (string-intersperse (argv) " "))) (general-run-call "-runall" @@ -1997,11 +2006,11 @@ orig-cmdline))))) (debug:print 0 *default-log-port* "ORIG: " orig-cmdline) (debug:print 0 *default-log-port* "NEW: " newcmdline) (system newcmdline))) run-specs)) - (handle-run-requests target runname keys keyvals need-clean)))))) + (handle-run-requests target runname keys keyvals need-clean))))))) ;;====================================================================== ;; run one test ;;====================================================================== @@ -2119,10 +2128,25 @@ (paths (tests:test-get-paths-matching keys target (args:get-arg "-test-files")))) (for-each (lambda (path) (print path)) paths)))))) +;;====================================================================== +;; Utils for test areas +;;====================================================================== + +(if (args:get-arg "-regen-testfiles") + (if (getenv "MT_TEST_RUN_DIR") + (begin + (launch:setup) + (change-directory (getenv "MT_TEST_RUN_DIR")) + (let* ((testname (getenv "MT_TEST_NAME")) + (itempath (getenv "MT_ITEMPATH"))) + (launch:extract-scripts-logpro (getenv "MT_TEST_RUN_DIR") testname itempath #f)) + (set! *didsomething* #t)) + (debug:print 0 *default-log-port* "ERROR: Must run -regen-testfiles in a test environment (i.e. test xterm from dashboard)"))) + ;;====================================================================== ;; Archive tests ;;====================================================================== ;; Archive tests matching target, runname, and testpatt (if (equal? (args:get-arg "-archive") "replicate-db") Index: mtargs/mtargs.scm ================================================================== --- mtargs/mtargs.scm +++ mtargs/mtargs.scm @@ -20,10 +20,11 @@ ( arg-hash get-arg get-arg-number get-arg-from + remove-arg-from-ht get-args usage print-args any-defined? ) @@ -64,10 +65,14 @@ (define (get-arg-from ht arg . default) (if (null? default) (hash-table-ref/default ht arg #f) (hash-table-ref/default ht arg (car default)))) + +(define (remove-arg-from-ht arg) + (hash-table-delete! arg-hash arg) +) (define (get-args args params switches arg-hash num-needed) (let* ((numtargs (length args)) (adj-num-needed (if num-needed (+ num-needed 2) #f))) (if (< numtargs (if adj-num-needed adj-num-needed 2)) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -31,11 +31,12 @@ (import (prefix sqlite3 sqlite3:)) (import debugprint dbmod) ;; lsof -i (define (portlogger:open-db fname) - (let* ((avail (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away + (let* (;; (avail (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away + (avail #t) (exists (file-exists? fname)) (db (if avail (sqlite3:open-database fname) (begin (system (conc "rm -f " fname)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -72,34 +72,11 @@ ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") - - (if (not (eq? (rmt:transport-mode) 'nfs)) - (begin - (if (> attemptnum 2) - (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) - - (cond - ((> attemptnum 2) (thread-sleep! 0.05)) - ((> attemptnum 10) (thread-sleep! 0.5)) - ((> attemptnum 20) (thread-sleep! 1))) - - ;; I'm turning this off, it may make sense to move it - ;; into http-transport-handler - (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) - (begin - (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.") - (case (rmt:transport-mode) - ((http) - (server:run *toppath*) - (thread-sleep! 3)) - (else - (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server - )))))) - + ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value @@ -117,11 +94,11 @@ ((nfs) (nfs-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe)) ))) (define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) (let* ((keys (common:get-fields *configdat*)) - (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard"))) + (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath))) (api:dispatch-request dbstruct cmd run-id params))) (define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) (if (not runremote) (let* ((newremote (make-and-init-remote areapath))) Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -87,16 +87,24 @@ (let* ((runname (car run-dat)) (all-dat (cdr run-dat)) (tests-data (alist-ref "data" all-dat equal?)) (run-meta (alist-ref "meta" all-dat equal?)) (run-id (rmt:insert-run target runname run-meta))) - (for-each - (lambda (test-dat) - (let* ((test-id (car test-dat)) + (if (list? tests-data) + (begin + (debug:print 0 *default-log-port* "import-run: inserting " (length tests-data) " tests") + (for-each + (lambda (test-dat) + (let* ((test-id (car test-dat)) (test-rec (cdr test-dat))) - (rmt:insert-test run-id test-rec))) - tests-data))) + (rmt:insert-test run-id test-rec))) + tests-data) + ) + (debug:print 0 *default-log-port* "import-run: run has no tests") + ) + ) +) ;; insert run if not there, return id either way (define (rmt:insert-run target runname run-meta) ;; look for id, return if found (debug:print 0 *default-log-port* "Insert run: "target"/"runname) @@ -107,13 +115,22 @@ (rmtmod:send-receive 'insert-run #f (list target runname run-meta)) (simple-run-id (car runs))))) (define (rmt:insert-test run-id test-rec) (let* ((testname (alist-ref "testname" test-rec equal?)) - (item-path (alist-ref "item_path" test-rec equal?))) - (debug:print 0 *default-log-port* " Insert test in run "run-id": "testname"/"item-path) - (rmtmod:send-receive 'insert-test run-id test-rec))) + (item-path (alist-ref "item_path" test-rec equal?)) + (test-id (rmt:get-test-id run-id testname item-path)) + ) + (if test-id + (debug:print 0 *default-log-port* "test "testname"/"item-path " already exists in run-id " run-id) + (begin + (debug:print 0 *default-log-port* " Insert test in run "run-id": "testname"/"item-path) + (rmtmod:send-receive 'insert-test run-id test-rec) + ) + ) + ) +) ;;====================================================================== ;; T E S T S ;;====================================================================== Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -994,11 +994,11 @@ (if (null? items-list) (let ((test-id (rmt:get-test-id run-id test-name "")) (num-items (rmt:test-toplevel-num-items run-id test-name))) (if (and test-id (not (> num-items 0))) - (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" "Failed to run due to failed prerequisites")))) + (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails)))))) (tests:testqueue-set-items! test-record items-list) (list hed tal reg reruns)) (begin (debug:print-error 0 *default-log-port* "The proc from reading the items table did not yield a list - please report this") (exit 1)))))) @@ -1030,14 +1030,14 @@ (if (and give-up (not (and (null? tal)(null? reg)))) (let ((trimmed-tal (mt:discard-blocked-tests run-id hed tal test-records)) (trimmed-reg (mt:discard-blocked-tests run-id hed reg test-records))) - (debug:print 1 *default-log-port* "WARNING: test " hed " has discarded prerequisites, removing it from the queue") + (debug:print 1 *default-log-port* "WARNING: test " hed " has discarded prerequisites: " prereqstrs ", removing it from the queue") (let ((test-id (rmt:get-test-id run-id hed ""))) - (if test-id (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "PREQ_DISCARDED" "Failed to run due to discarded prerequisites"))) + (if test-id (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "PREQ_DISCARDED" (conc "Failed to run due to discarded prerequisites: " prereqstrs)))) (if (and (null? trimmed-tal) (null? trimmed-reg)) #f (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns) @@ -1049,11 +1049,11 @@ (null? non-completed)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) - (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; + (debug:print-info 0 *default-log-port* "no fails in prerequisites (" (runs:pretty-string prereqs-not-met) ") for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!! ;; No runsdat, can't do this yet ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) ;; @@ -1060,29 +1060,29 @@ (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin - (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") + (debug:print-info 1 *default-log-port* "no fails in prerequisites (" (runs:pretty-string prereqs-not-met) ") for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) - (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while."))) + (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" (conc "Prerequisites (" (runs:pretty-string prereqs-not-met) ") not seen running in a while.")))) (runs:loop-values tal reg reglen regfull reruns) ))) ((and (or (not (null? fails)) (not (null? prereq-fails))) (member 'normal testmode)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-5") - (debug:print-info 1 *default-log-port* "test " hed " (mode=" testmode ") has failed prerequisite(s); " + (debug:print-info 1 *default-log-port* "test " hed " (mode=" testmode ") has failed prerequisite(s): " (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") ", removing it from to-do list") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (if (not (null? prereq-fails)) - (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites") - (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))) + (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_DISCARDED" (conc "Failed to run due to prior failed prerequisites: "(runs:pretty-string prereq-fails))) + (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails )))))) ;; (debug:print 4 *default-log-port*"BB> set PREQ_FAIL on "hed) ;; (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))) ;; BB: this works, btu equivalent for itemwait mode does not work. (if (or (not (null? reg))(not (null? tal))) (begin (hash-table-set! test-registry hed 'CANNOTRUN) @@ -1162,10 +1162,11 @@ (num-running-in-jobgroup(list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) (job-group-limit (list-ref run-limits-info 4)) ;; (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) + (prereqs-running (runs:calc-prereqs-running prereqs-not-met)) (fails (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs (runs:calc-fails prereqs-not-met) (begin (debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met) '()))) @@ -1335,45 +1336,44 @@ (thread-sleep! 5) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) - (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2")) + (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails)))) (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path - " from the launch list as it has prerequistes that are FAIL") + " from the launch list as it has prerequistes that are FAIL: " (runs:pretty-string fails)) (let ((test-id (rmt:get-test-id run-id hed ""))) - (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))) + (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails)) ))) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) ;; This next is for the items (if (not (null? fails)) - ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f) - (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f) - ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) + (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" (runs:pretty-string fails)) (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) ) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) (runs:loop-values tal reg reglen regfull reruns)) (let ((nth-try (hash-table-ref/default test-registry hed 0))) ;; hed not a vector... (debug:print 2 *default-log-port* "nth-try("hed")="nth-try) (cond - ((member "RUNNING" (map db:test-get-state prereqs-not-met)) - (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60) - (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet.")) + ;;((member "RUNNING" (map db:test-get-state prereqs-not-met)) + ((> 0 (length prereqs-running)) + (if (runs:lownoise (conc "possible RUNNING prerequisites " hed) 60) + (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites: " prereqs-running ", don't give up on it yet.")) (thread-sleep! 0.1) (runs:loop-values tal reg reglen regfull reruns)) ((or (not nth-try) ;; BB: condition on subsequent tries, condition below fires on first try (and (number? nth-try) (< nth-try 2))) (hash-table-set! test-registry hed (if (number? nth-try) (+ nth-try 1) 0)) (if (runs:lownoise (conc "not removing test " hed) 60) - (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites")) + (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites: "(runs:pretty-string fails))) ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) (runs:loop-values newtal reg reglen regfull reruns)) ((symbol? nth-try) ;; BB: 'done matches here in one case where prereq itemwait failed. This is first "try" (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW @@ -1380,20 +1380,20 @@ (if (null? tal) #f ;; yes, really (list (car tal)(cdr tal) reg reruns)) (begin (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60) - (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry.")) + (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites (" (runs:pretty-string fails)") or other issue. Internal state >" nth-try "< will be overridden and we'll retry.")) ;; was: (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f) (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f) (hash-table-set! test-registry hed 'removed) ;; was 0 (if (not (and (null? reg) (null? tal))) (runs:loop-values tal reg reglen regfull reruns) #f)))) (else (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60) - (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now.")) + (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests: " (runs:pretty-string fails) " and we've tried at least 10 times to run it. Giving up now.")) ;; (debug:print 0 *default-log-port* " prereqs: " prereqs-not-met) (hash-table-set! test-registry hed 'removed) (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f) ;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug. (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "FAIL" #f) ;; treat as FAIL @@ -2003,10 +2003,22 @@ (and (equal? "NOT_STARTED" (db:test-get-state t)) (member (db:test-get-status t) '("n/a" "KEEP_TRYING"))) (and (equal? "RUNNING" (db:test-get-state t))))) ;; account for a test that is running prereqs-not-met)) + +(define (runs:calc-prereqs-running prereqs-not-met) + (if (list? prereqs-not-met) + (filter + (lambda (t) + (or (not (vector? t)) + (member (db:test-get-state t) '("RUNNING" "LAUNCHED" "REMOTE_HOST_START")) + )) + prereqs-not-met) + '() + ) +) (define (runs:pretty-string lst) (map (lambda (t) (if (not (vector? t)) (conc t) @@ -2916,11 +2928,11 @@ (fld (car key)) (val (configf:lookup test-conf "test_meta" fld))) ;; (debug:print 5 *default-log-port* "idx: " idx " fld: " fld " val: " val) (if (and val (not (equal? (vector-ref currrecord idx) val))) (begin - (debug:print 0 *default-log-port* "Updating " test-name " " fld " to " val) + (debug:print 2 *default-log-port* "Updating " test-name " " fld " to " val) (rmt:testmeta-update-field test-name fld val))))) '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10))))) ;; find tests with matching tags, tagpatt is a string "tagpatt1,tagpatt2%, ..." ;; Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -129,24 +129,50 @@ (tcp-buffer-size 2048) ;; (max-connections 4096) ;; do all the busy work of finding and setting up conn for ;; connecting to a server +;; This function, `tt:client-connect-to-server`, is designed to manage connections between a client and a server within a testing framework. +;; The function takes four arguments: +;; 1. `ttdat`: a data structure that holds information about the testing environment or connections. +;; 2. `dbfname`: The name of the database file that the client wants to connect to. +;; 3. `run-id`: An identifier for the current run of the test suite. +;; 4. `testsuite`: The test suite that is being run. +;; +;; Here's a step-by-step explanation of what the function does: +;; +;; 1. It first asserts that the `run-id` is valid for the given `dbfname` using the `tt:valid-run-id` function. If the `run-id` is not valid, it raises a fatal error. +;; 2. It prints debug information indicating that the function `tt:client-connect-to-server` has been called with the given `dbfname`. +;; 3. It attempts to retrieve an existing connection to the server from a hash table (`tt-conns`) using the `dbfname` as the key. If a connection already exists, it prints debug information and returns the existing connection. +;; 4. If no existing connection is found, it retrieves the current server information from the servinfo file, using the `tt:get-current-server-info` function. +;; 5. It uses pattern matching to destructure the server information into variables (`host`, `port`, `start-time`, `server-id`, `pid`, `dbfname2`, `servinffile`). It then asserts that the `dbfname` from the server info matches the one provided to the function. +;; 6. It constructs a connection object (`conn`) with the server information. +;; 7. It attempts to ping the server using `tt:timed-ping` to verify that the server is running and can be communicated with. +;; 8. Depending on the result of the ping: +;; - If the server is running (`running`), it prints debug information, saves the connection in the hash table, and returns the connection. +;; - If the server is starting (`starting`), it sleeps for 2 seconds and then recursively calls itself to retry the connection. +;; - If the server is neither running nor starting, it checks if it's been more than 10 seconds since the last server start attempt. If so, it attempts to start the server using `server-start-proc` and then sleeps for 1 second before retrying the connection. +;; 9. If no server information is found (`else` case), it checks if it's been more than 3 seconds since the last server start attempt. If so, it starts a new server using `server-start-proc`, updates the last server start time, and sleeps for 4 seconds. +;; 10. It then sleeps for 1 second and prints debug information before recursively calling itself to retry the connection. +;; +;; The function uses recursion to keep trying to connect to the server, with various sleep intervals to prevent overwhelming the system with connection attempts or server starts. +;; It also uses a hash table to cache connections and avoid reconnecting to a server if a connection already exists. +;; The function is designed to handle different server states and ensure that a server is running and available before returning a valid connection to the caller. ;; (define (tt:client-connect-to-server ttdat dbfname run-id testsuite) (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id) - (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id) + (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname) (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f)) (server-start-proc (lambda () (tt:server-process-run (tt-areapath ttdat) testsuite ;; (dbfile:testsuite-name) (common:find-local-megatest) run-id)))) (if conn (begin - (debug:print-info 2 *default-log-port* "already connected to a server") + (debug:print-info 2 *default-log-port* "already connected to a server for " dbfname) conn) ;; we are already connected to the server (let* ((sdat (tt:get-current-server-info ttdat dbfname))) (match sdat ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") @@ -189,11 +215,11 @@ (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin (debug:print-info 0 *default-log-port* "Starting server for "dbfname) (server-start-proc) (tt-last-serv-start-set! ttdat (current-seconds)) - (thread-sleep! 3) + (thread-sleep! 4) )) (thread-sleep! 1) (debug:print-info 0 *default-log-port* "Connect to server for " dbfname) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) @@ -232,10 +258,11 @@ ;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; (define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe) (debug:print 2 *default-log-port* "tt:handler cmd: " cmd " run-id: " run-id " attemptnum: " attemptnum) ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now. + ;; connect-to-server will start a server if needed. (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f))) (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) @@ -260,11 +287,11 @@ (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) (else ;; did not receive properly formated result - (if (not res) ;; tt:handler is telling us that communication failed + (if (not res) ;; tt:send-receive telling us that communication failed (let* ((host (tt-conn-host conn)) (port (tt-conn-port conn)) ;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db (pid (tt-conn-pid conn)) ;;(servinf (tt-conn-servinf-file conn))) @@ -293,11 +320,11 @@ ;; try again (thread-sleep! 0.25) ;; dunno, I think this needs to be here (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) )))) (begin ;; no server file, delay and try again - (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf) + (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ") (thread-sleep! 0.5) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))) (begin ;; this case is where res is malformed. Probably should abort (assert #f "FATAL: tt:handler received bad data "res) ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.") @@ -476,11 +503,12 @@ ;; is there already a server for this dbfile? Then exit. (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in) (let* ((ttdat (make-tt areapath: areapath)) (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead - (if (> (length servers) 4) + (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname) + (if (> (length servers) 0) (begin (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.") (exit)) (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) (tt-handler-set! ttdat (handler dbstruct)) @@ -535,14 +563,13 @@ (if (tt-cleanup-proc ttdat) ((tt-cleanup-proc ttdat))) (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct))) - (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) + (debug:print-info 0 *default-log-port* "keep-running: removing lock for file "dbtmpname) (db:no-sync-del! db dbfname) - #;(if dbtmpname - (delete-file dbtmpname)))))))) + )))))) (set! *server-info* ttdat) (let loop ((count 0)) (if (> count 240) (begin (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") @@ -585,10 +612,11 @@ (if (and res (common:low-noise-print 120 "top server message")) (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) res)) (else + ;; wrong servinfo file (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) (let* ((leadsrv (car servers))) (match leadsrv ((host port startseconds server-id pid dbfname servinfofile) (let* ((result (tt:timed-ping host port server-id)) @@ -713,21 +741,38 @@ ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other : files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) - (sfiles (glob (conc servdir"/*:"dbfname)))) - sfiles)) + (sfiles (glob (conc servdir"/*:"dbfname))) + (good-files '())) + (for-each + (lambda (sfile) + (let* ((sinfo (tt:server-get-info sfile)) + (host (list-ref sinfo 0)) + (port (list-ref sinfo 1)) + (server-id (list-ref sinfo 3)) + (pid (list-ref sinfo 4)) + (status (system (conc "ssh " host " ps " pid " > /dev/null"))) + ) + (if (= status 0) + (set! good-files (cons sfile good-files)) + (delete-file* sfile) + ) + ) + ) + sfiles + ) + (debug:print-info 2 *default-log-port* "tt:find-server: good-files: " good-files " sfiles: " sfiles) + good-files)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;; (define (tt:server-get-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id - (dbprep-rx (regexp "^SERVER: dbprep")) - (dbprep-found 0) (bad-dat (list #f #f #f #f #f #f logf))) (let ((fdat (handle-exceptions exn (begin ;; WARNING: this is potentially dangerous to blanket ignore the errors @@ -737,13 +782,11 @@ (if (null? fdat) ;; bad data, return bad-dat bad-dat (let loop ((inl (car fdat)) (tail (cdr fdat)) (lnum 0)) - (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl))) - (if dbprep (set! dbprep-found 1)) + (let ((mlst (string-match server-rx inl))) (if (not mlst) (if (> lnum 500) ;; give up if more than 500 lines of server log read bad-dat (if (null? tail) bad-dat @@ -884,11 +927,15 @@ (if (< port 65535) (begin (portlogger:open-run-close portlogger:set-failed port) (thread-sleep! 0.25) (setup-listener-portlogger uconn)) - #f) + (begin + (assert #t "setup-listener-portlogger: could not get a port") + #f + ) + ) (connect-listener uconn port)))) (define (connect-listener uconn port) ;; (tcp-listener-socket LISTENER)(socket-name so) ;; sockaddr-address, sockaddr-port, sockaddr->string Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -148,11 +148,11 @@ ;; return items given config ;; (define (tests:get-items tconfig) (let ((items (hash-table-ref/default tconfig "items" #f)) ;; items 4 - (itemstable (hash-table-ref/default tconfig "itemstable" #f))) + (itemstable (hash-table-ref/default tconfig "itemstable" #f))) ;; if either items or items table is a proc return it so test running ;; process can know to call items:get-items-from-config ;; if either is a list and none is a proc go ahead and call get-items ;; otherwise return #f - this is not an iterated test (cond @@ -160,10 +160,12 @@ (debug:print-info 4 *default-log-port* "items is a procedure, will calc later") items) ;; calc later ((procedure? itemstable) (debug:print-info 4 *default-log-port* "itemstable is a procedure, will calc later") itemstable) ;; calc later + ((and (not items) (not itemstable)) + #f) ((filter (lambda (x) (let ((val (car x))) (if (procedure? val) val #f))) (append (if (list? items) items '()) (if (list? itemstable) itemstable '()))) @@ -1638,11 +1640,11 @@ (if (and testexists cache-file (file-write-access? cache-path) allow-write-cache) (let ((tpath (conc cache-path "/.testconfig"))) - (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath) + (debug:print-info 2 *default-log-port* "Caching testconfig for " test-name " in " tpath) (if (and tcfg (not (common:in-running-test?))) (configf:write-alist tcfg tpath)))) tcfg)))))) ;; sort tests by priority and waiton