Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -1,5 +1,7 @@ + + ;;====================================================================== ;; Copyright 2006-2013, Matthew Welland. ;; ;; This file is part of Megatest. ;; @@ -163,11 +165,11 @@ (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) - (foo (begin + #;(foo (begin (common:telemetry-log (conc "api-in:"(->string cmd)) payload: `((params . ,params))) #t)) (res @@ -359,16 +361,16 @@ start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin - (common:telemetry-log (conc "api-out:"(->string cmd)) + #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #t))) (vector #f res)) (begin - (common:telemetry-log (conc "api-out:"(->string cmd)) + #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #f))) (vector #t res)))))))) ;; http-server send-response @@ -377,26 +379,34 @@ ;; ;; NB// Runs on the server as part of the server loop ;; (define (api:process-request dbstruct $) ;; the $ is the request vars proc (set! *api-process-request-count* (+ *api-process-request-count* 1)) + (debug:print 4 *default-log-port* "server-id:" *server-id*) (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) - (params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?) - (resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) - (if (not success) - (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) - (if (> *api-process-request-count* *max-api-process-requests*) - (set! *max-api-process-requests* *api-process-request-count*)) - (set! *api-process-request-count* (- *api-process-request-count* 1)) - ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds - ;; (rmt:dat->json-str - ;; (if (or (string? res) - ;; (list? res) - ;; (number? res) - ;; (boolean? res)) - ;; res - ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) - (db:obj->string res transport: 'http))) + (key ($ 'key)) + (params (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?) + (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key) + (if (equal? key *server-id*) + (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) + (success (vector-ref resdat 0)) + (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) + (debug:print 4 *default-log-port* "res:" res) + (if (not success) + (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) + (if (> *api-process-request-count* *max-api-process-requests*) + (set! *max-api-process-requests* *api-process-request-count*)) + (set! *api-process-request-count* (- *api-process-request-count* 1)) + ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds + ;; (rmt:dat->json-str + ;; (if (or (string? res) + ;; (list? res) + ;; (number? res) + ;; (boolean? res)) + ;; res + ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) + (db:obj->string res transport: 'http)) + (begin + (debug:print 0 *default-log-port* "Server refused to process request. Sever id mismatch. recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) + (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http))))) Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -391,10 +391,11 @@ )) (define (archive:ls->list bup-exe archive-dir internal-path) (let ((cmd (conc bup-exe " -d " archive-dir " ls -l " internal-path "| awk '{print $6}' | sort")) (res '())) + (debug:print-info 0 *default-log-port* cmd) (handle-exceptions exn #f ;; anything goes wrong - assume the process in NOT running. (with-input-from-pipe cmd @@ -412,22 +413,23 @@ (seconds->local-time sec) "%Y-%m-%d-%H%M%S")) (define (archive:get-timestamp-dir bup-exe archive-dir testsuite-name run-id test-partial-path test-last-update) - (print (seconds->std-time-str test-last-update)) + (debug:print-info 0 *default-log-port* "Test last update time:" (seconds->std-time-str test-last-update)) (let* ((internal-path (conc testsuite-name "-" run-id)) + (archive-update-delay (string->number (or (configf:lookup *configdat* "archive" "test-update-delay") "900" ))) (ts-list (archive:ls->list bup-exe archive-dir internal-path)) (ds-flag (vector-ref (seconds->local-time) 8))) (let loop ((hed (car ts-list)) (tail (cdr ts-list))) (if (and (null? tail) (equal? hed "latest")) #f (if (and (not (null? tail)) (equal? hed "latest")) (loop (car tail) (cdr tail)) (let* ((archive-seconds (time-string->seconds hed ds-flag))) - (if (< (abs (- archive-seconds test-last-update)) 120) + (if (< (abs (- archive-seconds test-last-update)) archive-update-delay) (let* ((test-list (archive:ls->list bup-exe archive-dir (conc internal-path "/" hed "/" test-partial-path)))) (if (> (length test-list) 0) hed (if (not (null? tail)) (loop (car tail) (cdr tail)) @@ -481,10 +483,11 @@ (exclude-file (args:get-arg "-exclude-rx-from"))) (if (not archive-timestamp-dir) (debug:print-error 0 *default-log-port* "Archive not found for testsuite" (common:get-testsuite-name) " run/test/itempath" test-partial-path) (begin ;; some sanity checks, move an existing path out of the way - iif it is not a toplevel with children + (debug:print-info 0 *default-log-port* "Archive time: " archive-timestamp-dir) (if (and (not toplevel/children) ;; special handling needed for toplevel with children prev-test-physical-path (common:file-exists? prev-test-physical-path)) ;; what to do? abort or clean up or link it in? (let* ((base (pathname-directory prev-test-physical-path)) (dirn (pathname-file prev-test-physical-path)) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -88,18 +88,25 @@ (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) - (port (caddr server-dat))) + (port (caddr server-dat)) + (server-id (caddr (cddr server-dat)))) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if (and (not area-dat) (not *runremote*)) - (set! *runremote* (make-remote))) - (if (and host port) + (begin + (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))))) + (if (and host port server-id) (let* ((start-res (case *transport-type* - ((http)(http-transport:client-connect host port)))) + ((http)(http-transport:client-connect host port server-id)))) (ping-res (case *transport-type* ((http)(rmt:login-no-auto-client-setup start-res))))) (if (and start-res ping-res) (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -296,11 +296,13 @@ (define (common:logpro-exit-code->test-status exit-code) (status-sym->string (common:logpro-exit-code->status-sym exit-code))) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (server-url #f) ;; (server:check-if-running *toppath*) #f)) + (server-id #f) + (server-info (if *toppath* (server:check-if-running *toppath*))) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (server:expiration-timeout)) (force-server #f) @@ -567,11 +569,11 @@ (- num-logs max-allowed)))) (for-each (lambda (file) (let* ((fullname (conc "logs/" file))) (if (directory? fullname) - (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") (handle-exceptions exn (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) (delete-file* fullname))))) files) @@ -3546,14 +3548,14 @@ #t) ;; just ignore it, it might have died in the meantime so joining it will throw an exception (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) -(define *common:telemetry-log-state* 'startup) -(define *common:telemetry-log-socket* #f) +#;(define *common:telemetry-log-state* 'startup) +#;(define *common:telemetry-log-socket* #f) -(define (common:telemetry-log-open) +#;(define (common:telemetry-log-open) (if (eq? *common:telemetry-log-state* 'startup) (let* ((serverhost (configf:lookup *configdat* "telemetry" "host")) (serverport (configf:lookup-number *configdat* "telemetry" "port")) (user (or (get-environment-variable "USER") "unknown")) (host (or (get-environment-variable "HOST") "unknown"))) @@ -3569,11 +3571,11 @@ (udp-connect! s serverhost serverport) (set! *common:telemetry-log-socket* s) 'open) 'not-needed)))))) -(define (common:telemetry-log event #!key (payload '())) +#;(define (common:telemetry-log event #!key (payload '())) (if (eq? *common:telemetry-log-state* 'startup) (common:telemetry-log-open)) (if (eq? 'open *common:telemetry-log-state*) (handle-exceptions @@ -3596,11 +3598,11 @@ (with-output-to-string (lambda () (pp payload)))))) (msg (conc user":"host":"start":"(current-process-id)":"(car (argv))":" toppath":"payload-serialized))) (udp-send *common:telemetry-log-socket* msg)))))) -(define (common:telemetry-log-close) +#;(define (common:telemetry-log-close) (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*) (handle-exceptions exn (begin (define *common:telemetry-log-state* 'closed-fail) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1534,11 +1534,11 @@ state TEXT DEFAULT 'new', status TEXT DEFAULT 'n/a', archive_type TEXT DEFAULT 'bup', du INTEGER, archive_path TEXT);"))) - (print "creating trigges from init") + ;; (print "creating trigges from init") (db:create-triggers db) db)) ;; ) ;;====================================================================== ;; A R C H I V E S @@ -3968,17 +3968,133 @@ ;; (db:set-state-status-and-roll-up-items dbstruct run-id test-id #f state status msg) ;; ;; process the test_data table ;; (if (and test-id state status (equal? status "AUTO")) ;; (db:test-data-rollup dbstruct run-id test-id status)) ;; (mt:process-triggers dbstruct run-id test-id state status))) + +;; NOT FINISHED +(define (db:calc-state-status-toplevel state status tl-state tl-status) + `(,state ,status)) + +;; (match state +;; (("COMPLETED") +;; (match `(,tl-state ,tl-status) +;; (("COMPLETED" "PASS") `(,state ,status)) +;; (("COMPLETED" thestatus) +;; (case (string->symbol thestatus) +;; ((ABORT CHECK DEAD) +;; (if `("COMPLETED" ,thestatus)) +;; (match `(,thestatus ,status) +;; (("FAIL" "ABORT") '("COMPLETED" "ABORT")) +;; (("FAIL" "CHECK") '("COMPLETED" "CHECK")) +;; (("FAIL" "DEAD") '("COMPLETED" "DEAD")) +;; (("WARN" "FAIL") '("COMPLETED" "FAIL")) +;; (("WARN" "CHECK") '("COMPLETED" "CHECK")) +;; (("WARN" "DEAD") + +(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) + ;; establish info on incoming test followed by info on top level test + ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met + ;; (mutex-lock! *db-transaction-mutex*) ;; why do we need a mutex? + (let* ((testdat (if (number? test-name) + (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id + (db:get-test-info dbstruct run-id test-name item-path))) + (test-id (db:test-get-id testdat)) + (test-name (if (number? test-name) + (db:test-get-testname testdat) + test-name)) + (no-sync-db (db:no-sync-db #f)) + (rollup-flag #f) + (wait-flag #f) + (rollup-lock-key (conc run-id "-rollup-" test-name)) + (waiting-lock-key (conc run-id "-waiting-" test-name))) + (db:test-set-state-status dbstruct run-id test-id state status #f) + (if (and test-id state status (equal? status "AUTO")) + (db:test-data-rollup dbstruct run-id test-id status)) + (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) + (db:general-call dbstruct 'set-test-start-time (list test-id))) + + (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item + (begin + ;; is there a rollup lock? If not, take it + (sqlite3:with-transaction + no-sync-db + (lambda () + ;; (debug:print 0 *default-log-port* "EXCEPTION: exn="exn) + (let* ((rollup-lock-time (db:no-sync-get/default no-sync-db rollup-lock-key #f)) + (waiting-lock-time (db:no-sync-get/default no-sync-db waiting-lock-key #f))) + (if rollup-lock-time ;; someone is doing a rollup + (if (not waiting-lock-time) ;; no one is waiting + (begin + (set! wait-flag #t) + (set! rollup-flag #t) + (db:no-sync-set no-sync-db waiting-lock-key (current-seconds)))) ;; we are going to wait + (begin + (set! rollup-flag #t) + (db:no-sync-set no-sync-db rollup-lock-key (current-seconds))))))) + (if wait-flag + (let loop ((count 10)) ;; about 20 seconds + (thread-sleep! 2) + (if (and (not (db:no-sync-get/default no-sync-db waiting-lock-key #f)) + (> count 0)) + (loop (+ count 1)) + (sqlite3:with-transaction + no-sync-db + (lambda () + (db:no-sync-set no-sync-db rollup-lock-key (current-seconds)) + (db:no-sync-del! no-sync-db waiting-lock-key)))))) + ;; now the rollup + (if rollup-flag ;; put this into a thread + (begin + ;; (thread-start! (make-thread + ;; (lambda () + (db:roll-up-test-state-status dbstruct run-id test-name state status) + (db:no-sync-del! no-sync-db rollup-lock-key)) + ;; (conc "thread for run-id: " run-id " test-name: " test-name)))))))) + ))))) + +;; I'd like to remove the need for item-path - it is logically not needed here +;; for now we pass in state and status - NOTE: There is a possible race if a test +;; is rapidly re-run while an earlier run is waiting to rollup. +;; +(define (db:roll-up-test-state-status dbstruct run-id test-name state status) + (let* ((testdat (if (number? test-name) + (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id + (db:get-test-info dbstruct run-id test-name ""))) + (test-id (db:test-get-id testdat)) + (test-name (if (number? test-name) + (db:test-get-testname testdat) + test-name)) + (tl-testdat (db:get-test-info dbstruct run-id test-name "")) + (tl-test-id (db:test-get-id tl-testdat))) + (db:with-db + dbstruct #f #f + (lambda (db) + ;; NB// Pass the db so it is part fo the transaction + ;; item-path is used in get-all-state-status counts to exclude current state/status of THIS test + ;; but with the state/status being set earlier this is not needed any longer + (let* ((state-status-counts (db:get-all-state-status-counts-for-testname dbstruct run-id test-name)) + (state-statuses (if (null? state-status-counts) + '() + (db:roll-up-rules state-status-counts state status))) + (newstate (if (null? state-statuses) + state + (car state-statuses))) + (newstatus (if (null? state-statuses) + status + (cadr state-statuses)))) + (if tl-test-id + (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) + ))) + #t)) ;; state is the priority rollup of all states ;; status is the priority rollup of all completed statesfu ;; ;; if test-name is an integer work off that instead of test-name test-path ;; -(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) +(define (db:set-state-status-and-roll-up-items-orig dbstruct run-id test-name item-path state status comment) ;; establish info on incoming test followed by info on top level test ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met (let* ((testdat (if (number? test-name) (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id (db:get-test-info dbstruct run-id test-name item-path))) @@ -3999,17 +4115,17 @@ (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () - ;; NB// Pass the db so it is part fo the transaction + ;; NB// Pass the db so it is part of the transaction (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item (let* ((state-status-counts (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test - (state-stauses (db:roll-up-rules state-status-counts state status)) - (newstate (car state-stauses)) - (newstatus (cadr state-stauses))) + (state-statuses (db:roll-up-rules state-status-counts state status)) + (newstate (car state-statuses)) + (newstatus (cadr state-statuses))) (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts) " state-status-counts: " (apply conc (map (lambda (x) (conc (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) @@ -4022,81 +4138,84 @@ (if (and test-id state status (equal? status "AUTO")) (db:test-data-rollup dbstruct run-id test-id status)) tr-res))))) (define (db:roll-up-rules state-status-counts state status) - (let* ((running (length (filter (lambda (x) - (member (dbr:counts-state x) *common:running-states*)) - state-status-counts))) - (bad-not-started (length (filter (lambda (x) - (and (equal? (dbr:counts-state x) "NOT_STARTED") - (not (member (dbr:counts-status x) *common:not-started-ok-statuses*)))) - state-status-counts))) - (all-curr-states (common:special-sort ;; worst -> best (sort of) - (delete-duplicates - (if (and state (not (member state *common:dont-roll-up-states*))) - (cons state (map dbr:counts-state state-status-counts)) - (map dbr:counts-state state-status-counts))) - *common:std-states* >)) - (all-curr-statuses (common:special-sort ;; worst -> best - (delete-duplicates - (if (and state status (not (member state *common:dont-roll-up-states*))) - (cons status (map dbr:counts-status state-status-counts)) - (map dbr:counts-status state-status-counts))) - *common:std-statuses* >)) - (non-completes (filter (lambda (x) - (not (member x (cons "COMPLETED" *common:dont-roll-up-states*)))) - all-curr-states)) - (preq-fails (filter (lambda (x) - (equal? x "PREQ_FAIL")) - all-curr-statuses)) - (num-non-completes (length non-completes)) - (newstate (cond - ((> running 0) "RUNNING") ;; anything running, call the situation running - ((> (length preq-fails) 0) "NOT_STARTED") - ((> bad-not-started 0) "COMPLETED") ;; we have an ugly situation, it is completed in the sense we cannot do more. - ((> num-non-completes 0) (car non-completes)) ;; (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED - (else (car all-curr-states)))) - (newstatus (cond - ((> (length preq-fails) 0) "PREQ_FAIL") - ((or (> bad-not-started 0) - (and (equal? newstate "NOT_STARTED") - (> num-non-completes 0))) - "STARTED") - (else (car all-curr-statuses))))) - (debug:print-info 2 *default-log-port* - "\n--> probe db:set-state-status-and-roll-up-items: " - "\n--> state-status-counts: "(map dbr:counts->alist state-status-counts) - "\n--> running: "running - "\n--> bad-not-started: "bad-not-started - "\n--> non-non-completes: "num-non-completes - "\n--> non-completes: "non-completes - "\n--> all-curr-states: "all-curr-states + (let* ((running (length (filter (lambda (x) + (member (dbr:counts-state x) *common:running-states*)) + state-status-counts))) + (bad-not-started (length (filter (lambda (x) + (and (equal? (dbr:counts-state x) "NOT_STARTED") + (not (member (dbr:counts-status x) *common:not-started-ok-statuses*)))) + state-status-counts))) + (all-curr-states (common:special-sort ;; worst -> best (sort of) + (delete-duplicates + (if (and state (not (member state *common:dont-roll-up-states*))) + (cons state (map dbr:counts-state state-status-counts)) + (map dbr:counts-state state-status-counts))) + *common:std-states* >)) + (all-curr-statuses (common:special-sort ;; worst -> best + (delete-duplicates + (if (and state status (not (member state *common:dont-roll-up-states*))) + (cons status (map dbr:counts-status state-status-counts)) + (map dbr:counts-status state-status-counts))) + *common:std-statuses* >)) + (non-completes (filter (lambda (x) + (not (member x (cons "COMPLETED" *common:dont-roll-up-states*)))) + all-curr-states)) + (preq-fails (filter (lambda (x) + (equal? x "PREQ_FAIL")) + all-curr-statuses)) + (num-non-completes (length non-completes)) + (newstate (cond + ((> running 0) "RUNNING") ;; anything running, call the situation running + ((> (length preq-fails) 0) "NOT_STARTED") + ((> bad-not-started 0) "COMPLETED") ;; we have an ugly situation, it is completed in the sense we cannot do more. + ((> num-non-completes 0) (car non-completes)) ;; (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED + (else (car all-curr-states)))) + (newstatus (cond + ((> (length preq-fails) 0) "PREQ_FAIL") + ((or (> bad-not-started 0) + (and (equal? newstate "NOT_STARTED") + (> num-non-completes 0))) + "STARTED") + (else (car all-curr-statuses))))) + (debug:print-info 2 *default-log-port* + "\n--> probe db:set-state-status-and-roll-up-items: " + "\n--> state-status-counts: "(map dbr:counts->alist state-status-counts) + "\n--> running: "running + "\n--> bad-not-started: "bad-not-started + "\n--> non-non-completes: "num-non-completes + "\n--> non-completes: "non-completes + "\n--> all-curr-states: "all-curr-states "\n--> all-curr-statuses: "all-curr-statuses "\n--> newstate "newstate "\n--> newstatus "newstatus "\n\n") - - ;; NB// Pass the db so it is part of the transaction - (list newstate newstatus))) + ;; NB// Pass the db so it is part of the transaction + (list newstate newstatus))) (define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status) - (mutex-lock! *db-transaction-mutex*) + ;; (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f #f (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () (let* ((state-status-counts (db:get-all-state-status-counts-for-run dbstruct run-id)) - (state-stauses (db:roll-up-rules state-status-counts #f #f )) - (newstate (car state-stauses)) - (newstatus (cadr state-stauses))) + (state-statuses (db:roll-up-rules state-status-counts #f #f )) + (newstate (if (null? state-statuses) + curr-state + (car state-statuses))) + (newstatus (if (null? state-statuses) + curr-status + (cadr state-statuses)))) (if (or (not (eq? newstate curr-state)) (not (eq? newstatus curr-status))) (db:set-run-state-status dbstruct run-id newstate newstatus ))))))) - (mutex-unlock! *db-transaction-mutex*) + ;; (mutex-unlock! *db-transaction-mutex*) tr-res)))) (define (db:get-all-state-status-counts-for-run dbstruct run-id) (let* ((test-count-recs (db:with-db @@ -4148,10 +4267,25 @@ (unrelated-rec-list (filter nonmatch-countrec-lambda other-items-count-recs))) (cons updated-count-rec unrelated-rec-list))) + +;; full count not including toplevel +;; +(define (db:get-all-state-status-counts-for-testname dbstruct run-id test-name) + (let* ((test-count-recs (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:map-row + (lambda (state status count) + (make-dbr:counts state: state status: status count: count)) + db + "SELECT state,status,count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' GROUP BY state,status;" + run-id test-name))))) + test-count-recs)) + ;; (define (db:get-all-item-states db run-id test-name) ;; (sqlite3:map-row ;; (lambda (a) a) ;; db @@ -4342,18 +4476,19 @@ set-verbosity killserver )) (define (db:login dbstruct calling-path calling-version client-signature) - (cond + (cond ((not (equal? calling-path *toppath*)) (list #f "Login failed due to mismatch paths: " calling-path ", " *toppath*)) ;; ((not (equal? *run-id* run-id)) ;; (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*)) ((not (equal? megatest-version calling-version)) (list #t (conc "Login warning due to mismatch megatest version: " calling-version ", " megatest-version))) - (else + + (else (hash-table-set! *logged-in-clients* client-signature (current-seconds)) '(#t "successful login")))) (define (db:general-call dbstruct stmtname params) (let ((query (let ((q (alist-ref (if (string? stmtname) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -713,11 +713,11 @@ ;; colnames) (set! rownum 1) (for-each (lambda (server) (set! colnum 0) - (match-let (((mod-time host port start-time pid) + (match-let (((mod-time host port start-time server-id pid) server)) (let* ((uptime (- (current-seconds) mod-time)) (runtime (if start-time (- mod-time start-time) 0)) Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -773,13 +773,13 @@

The Megatest Users Manual

Matt Welland
<matt@kiatoa.com>
version 1.5, June 2020 -
-
Table of Contents
- +
+
Table of Contents
+

Preface

@@ -959,10 +959,15 @@ Static
+ +
+

Road Map

+
+

TODO / Road Map

Note: This road-map is a wish list and not a formal plan. Items are in @@ -1447,11 +1452,14 @@

Installation

Dependencies

Chicken scheme and a number of "eggs" are required for building -Megatest. In the v1.66 and beyond assistance to create the build +Megatest. See the script installall.sh in the utils directory of the +source distribution for an automated way to install everything +needed for building Megatest on Linux.

+

Megatest. In the v1.66 and beyond assistance to create the build system is built into the Makefile.

Installation steps (overview)
./configure
@@ -3417,10 +3425,48 @@
 

+
+
+

Test Plan

+
+
+

Tests

+

itemwait|33

+

rerun-downstream-item|20

+

rerunclean|20

+

fullrun|18

+

goodtests|18

+

kill-rerun|17

+

items-runconfigvars|16

+

ro_test|16

+

runconfig-tests|16

+

env-pollution|13

+

itemmap|11

+

testpatt_envvar|10

+

toprun|10

+

chained-waiton|8

+

skip-on-fileexists|8

+

killrun_preqfail|7

+

subrun|6

+

dependencies|5

+

itemwait-simple|4

+

rollup|4

+

end-of-run|3

+

killrun|3

+

listener|3

+

test2|3

+

testpatt|3

+

env-pollution-usecacheno|2

+

set-values|2 +envvars|1 +listruns-tests|1 +subrun-usecases|1

+
+

Megatest Internals

@@ -3438,10 +3484,10 @@

Index: docs/manual/megatest_manual.txt ================================================================== --- docs/manual/megatest_manual.txt +++ docs/manual/megatest_manual.txt @@ -116,10 +116,12 @@ include::writing_tests.txt[] include::howto.txt[] include::reference.txt[] + +include::testplan.txt[] Megatest Internals ------------------ ["graphviz", "server.png"] ADDED docs/manual/testplan.txt Index: docs/manual/testplan.txt ================================================================== --- /dev/null +++ docs/manual/testplan.txt @@ -0,0 +1,81 @@ +// This file is part of Megatest. +// +// Megatest is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Megatest is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Megatest. If not, see . + +// Copyright 2006-2020, Matthew Welland. + +Test Plan +--------- + +Tests +~~~~~ + +itemwait|33 + +rerun-downstream-item|20 + +rerunclean|20 + +fullrun|18 + +goodtests|18 + +kill-rerun|17 + +items-runconfigvars|16 + +ro_test|16 + +runconfig-tests|16 + +env-pollution|13 + +itemmap|11 + +testpatt_envvar|10 + +toprun|10 + +chained-waiton|8 + +skip-on-fileexists|8 + +killrun_preqfail|7 + +subrun|6 + +dependencies|5 + +itemwait-simple|4 + +rollup|4 + +end-of-run|3 + +killrun|3 + +listener|3 + +test2|3 + +testpatt|3 + +env-pollution-usecacheno|2 + +set-values|2 +envvars|1 +listruns-tests|1 +subrun-usecases|1 + + Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -246,12 +246,18 @@ (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") (exit 1)))) (res (vector #f "uninitialized")) (success #t) (sparams (db:obj->string params transport: 'http)) - (runremote (or area-dat *runremote*))) - (debug:print-info 11 *default-log-port* "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") + (runremote (or area-dat *runremote*)) + (server-id (if (vector? serverdat) + (http-transport:server-dat-get-server-id serverdat) + (begin + (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") + (exit 1))))) + (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) + ;; set up the http-client here (max-retry-attempts 1) ;; consider all requests indempotent (retry-request? (lambda (request) #f)) @@ -273,11 +279,11 @@ (if (debug:debug-mode 1) (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...") (begin (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " msg ", exn=" exn) - (debug:print 0 *default-log-port* " cmd: " cmd " params: " params) + (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey")) (debug:print 0 *default-log-port* " call-chain: " call-chain))) (if runremote (remote-conndat-set! runremote #f)) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? @@ -286,11 +292,11 @@ ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) ;;; "communications failed" (db:obj->string #f)) (with-input-from-request ;; was dat fullurl - (list (cons 'key (or *server-id* "thekey")) + (list (cons 'key (or server-id "thekey")) (cons 'cmd cmd) (cons 'params sparams)) read-string)) transport: 'http) 0)) ;; added this speculatively @@ -305,12 +311,12 @@ (th1 (make-thread send-recieve "with-input-from-request")) (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) + (vector-set! res 0 success) (thread-terminate! th2) - (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) (if (vector-ref res 0) ;; this is the first flag or the second flag? res ;; this is the *inner* vector? seriously? why? (if (debug:debug-mode 11) (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it @@ -350,11 +356,12 @@ (define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) (define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3)) (define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4)) (define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5)) -(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6)) +;(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6)) +(define (http-transport:server-dat-get-server-id vec) (vector-ref vec 6)) (define (http-transport:server-dat-make-url vec) (if (and (http-transport:server-dat-get-iface vec) (http-transport:server-dat-get-port vec)) (conc "http://" @@ -371,15 +378,15 @@ (debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!")))) ;; ;; connect ;; -(define (http-transport:client-connect iface port) +(define (http-transport:client-connect iface port server-id) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) - (server-dat (vector iface port api-uri api-url api-req (current-seconds)))) + (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; @@ -475,21 +482,27 @@ (let ((new-iface (car sdat)) (new-port (cadr sdat))) (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info") (set! iface new-iface) (set! port new-port) - (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (if (not *server-id*) + (set! *server-id* (server:mk-signature))) + (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) ;; Transfer *db-last-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin - (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (if (not *server-id*) + (set! *server-id* (server:mk-signature))) + (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -211,11 +211,11 @@ (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) + ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -233,11 +233,11 @@ (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) - (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) + ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) @@ -252,13 +252,13 @@ (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) + ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) + #;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.6566) +(define megatest-version 1.6581) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -852,11 +852,11 @@ )) (if (args:get-arg "-ping") (let* ((server-id (string->number (args:get-arg "-ping"))) ;; extract run-id (i.e. no ":" (host:port (args:get-arg "-ping"))) - (server:ping (or server-id host:port) do-exit: #t))) + (server:ping (or server-id host:port) #f do-exit: #t))) ;;====================================================================== ;; Capture, save and manipulate environments ;;====================================================================== Index: mt.scm ================================================================== --- mt.scm +++ mt.scm @@ -102,11 +102,11 @@ #f)))) (if useres (let ((result (vector-ref res 1))) (debug:print 4 *default-log-port* "Using lazy value res: " result) result) - (let ((newres (rmt:get-prereqs-not-met run-id waitons ref-item-path mode: mode itemmaps: itemmaps))) + (let ((newres (rmt:get-prereqs-not-met run-id waitons ref-item-path mode itemmaps))) (hash-table-set! *pre-reqs-met-cache* key (vector (current-seconds) newres)) newres)))) (define (mt:get-run-stats dbstruct run-id) ;; Get run stats from local access, move this ... but where? Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -100,10 +100,15 @@ ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; ;; ensure we have a record for our connection for given area (if (not runremote) ;; can remove this one. should never get here. (begin (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))) (set! runremote *runremote*))) ;; new runremote will come from this on next iteration ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; ;; DOT SET_HOMEHOST -> MUTEXLOCK; @@ -174,12 +179,17 @@ ;;DOT CASE6 -> "rmt:send-receive"; ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url runremote) ;; have a server - (not (server:ping (remote-server-url runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. + (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") (rmt:send-receive cmd rid params attemptnum: attemptnum)) @@ -202,20 +212,22 @@ ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required (cdr (remote-hh-dat runremote)) ;; have homehost (not (remote-server-url runremote)) ;; no connection yet (not (member cmd api:read-only-queries))) ;; not a read-only query (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") - (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call - (if server-url - (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed + (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (if server-info + (begin + (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed + (remote-server-id-set! runremote (server:record->id server-info))) (if (common:force-server?) (server:start-and-wait *toppath*) - (server:kind-run *toppath*)))) + (server:kind-run *toppath*))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") - (rmt:open-qry-close-locally cmd 0 params)) + (rmt:open-qry-close-locally cmd 0 params))) ;;DOT CASE9 [label="force server\nnot on homehost"]; ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one @@ -658,11 +670,12 @@ (apply append (map (lambda (run-id) (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) run-ids)))) -(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) +;; NOTE: rmt functions can NEVER have key params as they might be called as local +(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmaps #f)) (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) (define (rmt:get-count-tests-running-for-run-id run-id fastmode) (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id fastmode))) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -833,15 +833,15 @@ ;; tal - list of never visited tests ;; prefer next hed to be from reg than tal. (define runs:nothing-left-in-queue-count 0) -(define (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path #!optional (mode '(normal))(itemmaps #f)) ;; mode: testmode itemmaps: itemmaps) +(define (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path #!key (mode '(normal))(itemmaps #f)) ;; mode: testmode itemmaps: itemmaps) (if (and (runs:testdat-prereqs-not-met testdat) (< (- (current-seconds) (runs:testdat-last-update testdat)) 10)) ;; only refresh for this test if it has been at least 10 seconds (runs:testdat-prereqs-not-met testdat) - (let* ((res (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: mode itemmaps: itemmaps))) + (let* ((res (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode itemmaps))) (if (list? res) res (begin (debug:print 0 *default-log-port* "ERROR: rmt:get-prereqs-not-met returned non-list!\n" @@ -862,20 +862,11 @@ ;; (reruns '())) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps testdat) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) - #;(let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))) - (if (list? res) - res - (begin - (debug:print 0 *default-log-port* - "ERROR: rmt:get-prereqs-not-met returned non-list!\n" - " res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps) - '()))) (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait))))) - ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) (fails (runs:calc-fails prereqs-not-met)) (prereq-fails (runs:calc-prereq-fail prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met)) (runnables (runs:calc-runnable prereqs-not-met)) (unexpanded-prereqs @@ -1117,12 +1108,10 @@ (have-resources (car run-limits-info)) (num-running (list-ref run-limits-info 1)) (num-running-in-jobgroup(list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) (job-group-limit (list-ref run-limits-info 4)) - ;; (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) - ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) (fails (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs (runs:calc-fails prereqs-not-met) (begin (debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met) '()))) @@ -1518,11 +1507,10 @@ keyvals: keyvals run-info: run-info ;; newtal: newtal all-tests-registry: all-tests-registry ;; itemmaps: itemmaps - ;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps) ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running ))) ;; Initialize the test-registery hash with tests that already have a record ;; convert state to symbol and use that as the hash value Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -93,10 +93,11 @@ (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string (lambda () (write (list (current-directory) + (current-process-id) (argv))))))) ;; When using zmq this would send the message back (two step process) ;; with spiffy or rpc this simply returns the return data to be returned ;; @@ -163,18 +164,19 @@ (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds -;; +;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let + (define (server:logf-get-start-info logf) - (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs + (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)"))) ;; SERVER STARTED: host:port AT timesecs server id (handle-exceptions exn (begin (print "failed to get server info from " logf ", exn=" exn) - (list #f #f #f)) ;; no idea what went wrong, call it a bad server + (list #f #f #f #f)) ;; no idea what went wrong, call it a bad server (with-input-from-file logf (lambda () (let loop ((inl (read-line)) (lnum 0)) @@ -181,16 +183,17 @@ (if (not (eof-object? inl)) (let ((mlst (string-match rx inl))) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) - (list #f #f #f)) + (list #f #f #f #f)) (let ((dat (cdr mlst))) (list (car dat) ;; host (string->number (cadr dat)) ;; port - (string->number (caddr dat)))))) - (list #f #f #f)))))))) + (string->number (caddr dat)) + (cadr (cddr dat)))))) + (list #f #f #f #f)))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) @@ -217,11 +220,11 @@ (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions exn (begin - (print "failed to get modification time on " hed ", exn=" exn) + (debug:print 0 *default-log-port* "failed to get modification time on " hed ", exn=" exn) (current-seconds)) ;; 0 (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time 900)) ;; day-seconds)) @@ -230,11 +233,11 @@ (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) (new-res (if (null? serv-dat) res - (cons (append serv-rec (list pid)) res)))) + (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) @@ -242,11 +245,11 @@ (define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) - (match-let (((mod-time host port start-time pid) + (match-let (((mod-time host port start-time server-id pid) server)) (let* ((uptime (- (current-seconds) mod-time)) (runtime (if start-time (- mod-time start-time) 0))) @@ -305,13 +308,19 @@ (let* ((len (length srvrs)) (idx (random len))) (list-ref srvrs idx)) #f))) +(define (server:record->id servr) + (match-let (((mod-time host port start-time server-id pid) + servr)) + (if server-id + server-id + #f))) (define (server:record->url servr) - (match-let (((mod-time host port start-time pid) + (match-let (((mod-time host port start-time server-id pid) servr)) (if (and host port) (conc host ":" port) #f))) @@ -378,15 +387,15 @@ ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-url (server:check-if-running areapath)) + (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) - (if (or server-url + (if (or server-info (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. - server-url + (server:record->url server-info) (let ((num-ok (length (server:get-best (server:get-list areapath))))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one (server:kind-run areapath)) (thread-sleep! 5) @@ -403,11 +412,10 @@ ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) (servers (server:get-best (server:get-list areapath)))) - ;; (print "servers: " servers " ns: " ns) (if (or (and servers (null? servers)) (not servers) (and (list? servers) (< (length servers) (random ns)))) ;; somewhere between 0 and numservers @@ -414,21 +422,22 @@ #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res - res + hed (if (null? tal) #f (loop (car tal)(cdr tal))))))))) ;; ping the given server ;; (define (server:check-server server-record) (let* ((server-url (server:record->url server-record)) + (server-id (server:record->id server-record)) (res (case *transport-type* - ((http)(server:ping server-url)) + ((http)(server:ping server-url server-id)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res server-url #f))) @@ -441,11 +450,11 @@ ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; -(define (server:ping host-port-in #!key (do-exit #f)) +(define (server:ping host-port-in server-id #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find #f ;; (server:check-if-running *toppath*) ;; (if (number? host-port-in) ;; we were handed a server-id ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; ;; (print "srec: " srec " host-port-in: " host-port-in) @@ -467,11 +476,11 @@ (debug:print 0 *default-log-port* "ERROR: bad host:port")) (if do-exit (exit 1)) #f) (let* ((iface (car host-port)) (port (cadr host-port)) - (server-dat (http-transport:client-connect iface port)) + (server-dat (http-transport:client-connect iface port server-id)) (login-res (rmt:login-no-auto-client-setup server-dat))) (if (and (list? login-res) (car login-res)) (begin ;; (print "LOGIN_OK") @@ -585,16 +594,22 @@ (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd)) + (dbbackupfile (conc mtdbfile ".backup")) (res2 (cond - ((eq? 0 res) - (delete-file* (conc mtdbfile ".backup")) + ((eq? 0 res ) + (handle-exceptions + exn + #f + (if (file-exists? dbbackupfile) + (delete-file* dbbackupfile) + ) (if (eq? 0 (file-size sync-log)) - (delete-file sync-log)) + (delete-file* sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) (set! off-time (calculate-off-time last-sync-seconds @@ -605,11 +620,11 @@ (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle) default-duty-cycle)))) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) - 'sync-completed) + 'sync-completed)) (else (system (conc "/bin/cp "sync-log" "sync-log".fail")) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") (if (file-exists? (conc mtdbfile ".backup")) (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) Index: sretrieve.scm ================================================================== --- sretrieve.scm +++ sretrieve.scm @@ -637,21 +637,21 @@ (conc(pathname-file target-path) "." (pathname-extension target-path)) (pathname-file target-path))) (curr-dir (current-directory)) (start-dir (conc (current-directory) "/" last-dir-name)) (execlude (make-exclude-pattern (string-split restrictions ","))) - (tmpfile (conc "/tmp/" (current-user-name) "/my-pipe-" (current-process-id)))) + (tmpfile (conc "/tmp/my-pipe-" (current-process-id)))) (if (file-exists? start-dir) (begin (sauth:print-error (conclast-dir-name " already exist in your work dir.")) (sauth:print-error "Nothing has been retrieved!! ")) (begin ; (sretrieve:do-as-calling-user ; (lambda () - - (if (not (file-exists? (conc "/tmp/" (current-user-name)))) - (create-directory (conc "/tmp/" (current-user-name)) #t)) + ; (print tmpfile) + ;(if (not (file-exists? (conc "/tmp/" (current-user-name)))) + ; (create-directory (conc "/tmp/" (current-user-name)) #t)) (change-directory parent-dir) (create-fifo tmpfile) (process-fork (lambda() (sleep 1) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -755,30 +755,31 @@ (owner (db:get-value-by-header row header "owner")) (event-time (db:get-value-by-header row header "event_time")) (comment (db:get-value-by-header row header "comment")) (fail-count (db:get-value-by-header row header "fail_count")) (pass-count (db:get-value-by-header row header "pass_count")) - (db-contour (db:get-value-by-header row header "contour")) + (db-contour (db:get-value-by-header row header "contour")) (contour (if (args:get-arg "-prepend-contour") (if (and db-contour (not (equal? db-contour "")) (string? db-contour )) (begin - (debug:print-info 1 *default-log-port* "db-contour") + (debug:print-info 10 *default-log-port* "db-contour" db-contour) db-contour) (args:get-arg "-contour")))) - (run-tag (if (args:get-arg "-run-tag") + (run-tag (if (args:get-arg "-run-tag") (args:get-arg "-run-tag") "")) - (last-update (db:get-value-by-header row header "last_update")) + (last-update (db:get-value-by-header row header "last_update")) (keytarg (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) (conc "MT_CONTOUR/MT_AREA/" (string-intersperse (rmt:get-keys) "/")) (string-intersperse (rmt:get-keys) "/"))) ;; e.g. version/iteration/platform + (base-target (rmt:get-target run-id)) (target (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) - (conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) (rmt:get-target run-id)) (rmt:get-target run-id))) ;; e.g. v1.63/a3e1/ubuntu + (conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) base-target) base-target)) ;; e.g. v1.63/a3e1/ubuntu (spec-id (pgdb:get-ttype dbh keytarg)) (publish-time (if (args:get-arg "-cp-eventtime-to-publishtime") event-time (current-seconds))) - (new-run-id (pgdb:get-run-id dbh spec-id target run-name area-id))) + (new-run-id (if (and run-name base-target) (pgdb:get-run-id dbh spec-id target run-name area-id) #f))) (if new-run-id (begin ;; let ((run-record (pgdb:get-run-info dbh new-run-id)) (hash-table-set! runs-ht run-id new-run-id) ;; ensure key fields are up to date ;; if last_update == pgdb_last_update do not update smallest-last-update-time @@ -788,16 +789,16 @@ (hash-table-set! smallest-last-update-time "smallest-time" last-update))) (pgdb:refresh-run-info dbh new-run-id state status owner event-time comment fail-count pass-count area-id last-update publish-time) - (debug:print-info 0 *default-log-port* "Working on run-id " run-id " pgdb-id " new-run-id ) + (debug:print-info 4 *default-log-port* "Working on run-id " run-id " pgdb-id " new-run-id ) (if (not (equal? run-tag "")) (task:add-run-tag dbh new-run-id run-tag)) new-run-id) - (if (equal? state "deleted") + (if (or (not state) (equal? state "deleted")) (begin (debug:print-info 1 *default-log-port* "Warning: Run with id " run-id " was created after previous sync and deleted before the sync") #f) (if (handle-exceptions exn (begin (print-call-chain) @@ -860,17 +861,17 @@ (begin (if pgdb-test-id (begin (if pgdb-step-id (begin - (debug:print-info 1 *default-log-port* "Updating existing test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id " pgdb step id " pgdb-step-id ) + (debug:print-info 4 *default-log-port* "Updating existing test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id " pgdb step id " pgdb-step-id ) (let* ((pgdb-last-update (pgdb:get-test-step-last-update dbh pgdb-step-id))) (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) (hash-table-set! smallest-last-update-time "smallest-time" last-update))) (pgdb:update-test-step dbh pgdb-step-id pgdb-test-id stepname state status event_time comment logfile last-update)) (begin - (debug:print-info 1 *default-log-port* "Inserting test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id) + (debug:print-info 4 *default-log-port* "Inserting test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id) (if (or (not smallest-time) (< last-update smallest-time)) (hash-table-set! smallest-last-update-time "smallest-time" last-update)) (pgdb:insert-test-step dbh pgdb-test-id stepname state status event_time comment logfile last-update ) (set! pgdb-step-id (pgdb:get-test-step-id dbh pgdb-test-id stepname state)))) (hash-table-set! step-ht step-id pgdb-step-id )) @@ -906,17 +907,17 @@ (begin (if pgdb-test-id (begin (if pgdb-data-id (begin - (debug:print-info 1 *default-log-port* "Updating existing test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id " pgdb data id " pgdb-data-id) + (debug:print-info 4 *default-log-port* "Updating existing test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id " pgdb data id " pgdb-data-id) (let* ((pgdb-last-update (pgdb:get-test-data-last-update dbh pgdb-data-id))) (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) (hash-table-set! smallest-last-update-time "smallest-time" last-update))) (pgdb:update-test-data dbh pgdb-data-id pgdb-test-id category variable value expected tol units comment status type last-update)) (begin - (debug:print-info 1 *default-log-port* "Inserting test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id) + (debug:print-info 4 *default-log-port* "Inserting test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id) (if (handle-exceptions exn (begin (print-call-chain) (print ((condition-property-accessor 'exn 'message) exn)) #f) @@ -970,21 +971,23 @@ (pgdb:get-test-id dbh pgdb-run-id test-name item-path)) #f))) ;; "id" "run_id" "testname" "state" "status" "event_time" ;; "host" "cpuload" "diskfree" "uname" "rundir" "item_path" ;; "run_duration" "final_logf" "comment" "shortdir" "attemptnum" "archived" + (if (or (not item-path) (string-null? item-path)) + (debug:print-info 0 *default-log-port* "Working on Run id : " run-id "and test name : " test-name)) (if pgdb-run-id (begin (if pgdb-test-id ;; have a record (begin ;; let ((key-name (conc run-id "/" test-name "/" item-path))) - (debug:print-info 0 *default-log-port* "Updating existing test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id " pgdb-test-id " pgdb-test-id) + (debug:print-info 4 *default-log-port* "Updating existing test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id " pgdb-test-id " pgdb-test-id) (let* ((pgdb-last-update (pgdb:get-test-last-update dbh pgdb-test-id))) (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) ;;if last-update is same as pgdb-last-update then it is safe to assume the records are identical and we can use a larger last update time. (hash-table-set! smallest-last-update-time "smallest-time" last-update))) (pgdb:update-test dbh pgdb-test-id pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid)) (begin - (debug:print-info 0 *default-log-port* "Inserting test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id) + (debug:print-info 4 *default-log-port* "Inserting test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id) (pgdb:insert-test dbh pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid) (if (or (not smallest-time) (< last-update smallest-time)) (hash-table-set! smallest-last-update-time "smallest-time" last-update)) (set! pgdb-test-id (pgdb:get-test-id dbh pgdb-run-id test-name item-path)))) (hash-table-set! test-ht test-id pgdb-test-id)) @@ -1013,11 +1016,11 @@ (pgdb:insert-area-tag dbh (vector-ref tag-info 0) (vector-ref area-info 0)))))) (define (tasks:sync-run-data dbh cached-info run-ids area-info smallest-last-update-time) (for-each (lambda (run-id) - (debug:print-info 1 *default-log-port* "Check if run with " run-id " needs to be synced" ) + (debug:print-info 4 *default-log-port* "Check if run with " run-id " needs to be synced" ) (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) run-ids)) ;; get runs changed since last sync Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -21,11 +21,11 @@ ;;====================================================================== ;; Tests ;;====================================================================== (declare (unit tests)) -(declare (uses lock-queue)) +;;(declare (uses lock-queue)) (declare (uses db)) (declare (uses tdb)) (declare (uses common)) ;; (declare (uses dcommon)) ;; needed for the steps processing (declare (uses items))