Index: TODO ================================================================== --- TODO +++ TODO @@ -42,10 +42,11 @@ . break command line into sections; all, run control, queries, utilities etc. . pull in ftfplan (not integrated, just code pulled in) WW20 . ./configure => ubuntu, sles11, sles12, rh7 +. Jenkins junit XML support . Add output flushing in teamcity support . Switch to using simple runs query everywhere . Add end_time to runs and add a rollup call that sets state, status and end_time Future Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -154,21 +154,21 @@ ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else - (let* ((cmd-in (vector-ref dat 0)) + (let* ((cmd-in (common:safe-vector-ref dat 0 'nocmd)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) - (params (vector-ref dat 1)) + (params (common:safe-vector-ref dat 1 '())) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) (foo (begin - (common:telemetry-log (conc "api-in:"(->string cmd)) + #;(common:telemetry-log (conc "api-in:"(->string cmd)) payload: `((params . ,params))) #t)) (res (if writecmd-in-readonly-mode @@ -175,10 +175,12 @@ (conc "attempt to run write command "cmd" on a read-only database") (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== + + ((nocmd) '(#f "All broken!")) ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl ;; SERVERS ((start-server) (apply server:kind-run params)) @@ -359,16 +361,16 @@ start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin - (common:telemetry-log (conc "api-out:"(->string cmd)) + #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #t))) (vector #f res)) (begin - (common:telemetry-log (conc "api-out:"(->string cmd)) + #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #f))) (vector #t res)))))))) ;; http-server send-response @@ -381,12 +383,12 @@ (set! *api-process-request-count* (+ *api-process-request-count* 1)) (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) (params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?) (resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) + (success (common:safe-vector-ref resdat 0 #f)) + (res (common:safe-vector-ref resdat 1 #f))) ;; (vector flag payload), get the payload, ignore the flag (why?) (if (not success) (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) (if (> *api-process-request-count* *max-api-process-requests*) (set! *max-api-process-requests* *api-process-request-count*)) (set! *api-process-request-count* (- *api-process-request-count* 1)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -488,11 +488,20 @@ (copy daysfile wksfile) (copy hrsfile daysfile)) #t) #f)) - +(define (common:safe-vector-ref vec indx default) + (if (vector? vec) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "remote data issue: exn=" exn) + default) + (vector-ref vec indx)) + default)) + ;; Rotate logs, logic: ;; if > 500k and older than 1 week: ;; remove previous compressed log and compress this log ;; WARNING: This proc operates assuming that it is in the directory above the @@ -501,11 +510,11 @@ (define (common:rotate-logs) (let* ((all-files (make-hash-table)) (stats (make-hash-table)) (inc-stat (lambda (key) (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) - (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age + (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) (handle-exceptions exn Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -720,11 +720,10 @@ (if got-all (begin (dboard:rundat-last-update-set! run-dat (- start-time 2)) (dboard:rundat-run-data-offset-set! run-dat 0)) (begin - ;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help (dboard:rundat-run-data-offset-set! run-dat (+ num-to-get (dboard:rundat-run-data-offset run-dat))))) (for-each (lambda (tdat) @@ -838,13 +837,11 @@ ;; ;; create a virtual table of all the tests ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") ) ;; (define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts) - (dboard:tabdat-already-running-set! tabdat #t) - (let* (;; (already-running (dboard:tabdat-already-running tabdat)) - (access-mode (dboard:tabdat-access-mode tabdat)) + (let* ((access-mode (dboard:tabdat-access-mode tabdat)) (keys (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys))) (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2)) (allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts)) ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f)) (allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname") @@ -3727,17 +3724,10 @@ ;; (define (tabdat-values tabdat) ;; runs update-rundat using the various filters from the gui ;; (define (dashboard:do-update-rundat tabdat) - ;; this seems like a good place to check for already running and skip if so - ;; - ;; (set! *updater-running* #t) -;;(if (dboard:tabdat-already-running tabdat) -;; (begin -;; (debug:print-info 0 *default-log-port* "Dashboard overloaded - updates will be slow, skipping update.") -;; (dboard:tabdat-target tabdat)) (dboard:update-rundat tabdat (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%") (dboard:tabdat-numruns tabdat) (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -462,13 +462,14 @@ (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. - ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun - (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked - (member (db:test-get-status test-info) '("ABORT")))) + ;; ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun + ;; (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked + ;; (member (db:test-get-status test-info) '("ABORT")))) + ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) @@ -770,11 +771,13 @@ (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) (define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) - (let* ((cmd (conc "ssh " host " pstree -A " pid)) + (let* ((is-local (equal? host (get-host-name))) + (ssh-cmd (if is-local " " (conc "ssh " host " "))) + (cmd (conc ssh-cmd "pstree -A " pid)) (output (with-input-from-pipe cmd read-lines))) (debug:print 2 *default-log-port* "Running " cmd " received " output) (if (eq? (length output) 0) #f #t)) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.6572) +(define megatest-version 1.7101) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -371,12 +371,12 @@ (> (vector-length v) 1)) (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record (vector #t '())))) ;; we could also check that the returned types are valid (vector #t '()))) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1)) + (success (common:safe-vector-ref resdat 0 #f)) ;; (vector-ref resdat 0)) + (res (common:safe-vector-ref resdat 1 #f)) ;; (vector-ref resdat 1)) (duration (- (current-milliseconds) start))) (if (and read-only qry-is-write) (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd)) (if (not success) (if (> remretries 0) @@ -391,11 +391,11 @@ ;; (rmt:update-db-stats run-id cmd params duration) ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it (if qry-is-write (let ((start-time (current-seconds))) (mutex-lock! *db-multi-sync-mutex*) -/ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) + (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) (mutex-unlock! *db-multi-sync-mutex*))))) res)) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1783,17 +1783,14 @@ ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-4") - (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) - (if (and (list? can-run-more) - (car can-run-more)) ;; itemized test expanded here - (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup - max-concurrent-jobs run-id waitons item-path - testmode test-record can-run-more items runname - tconfig reglen test-registry test-records itemmaps))) + (let ((can-run-more #f)) ;; (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) + (if (not can-run-more) #;(and (list? can-run-more) + (car can-run-more)) + (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here (if loop-list (apply loop loop-list) (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed))) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -346,10 +346,28 @@ (begin (debug:print-info 0 *default-log-port* "Gating server start, last start: " fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) (thread-sleep! reftime) (server:wait-for-server-start-last-flag areapath))))))) + +;; wait for server=start-last to be three seconds old - this might be what was causing the obfusc-skel fails +;; +#;(define (server:wait-for-server-start-last-flag areapath) + (let* ((start-flag (conc areapath "/logs/server-start-last"))) + (if (file-exists? start-flag) + (let* ((fmodtime (file-modification-time start-flag)) + (reftime (+ 2 (random 3))) + (delta (- (current-seconds) fmodtime)) + (all-go (> delta reftime))) + (if all-go + #t ;; (system (conc "touch " start-flag)) ;; lazy but safe + (begin + (debug:print-info 0 *default-log-port* "Gating server start, last start: " + fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) + (thread-sleep! 5) + (server:wait-for-server-start-last-flag areapath)))) + #;(system (conc "touch " start-flag))))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched ;; (define (server:kind-run areapath) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1987,10 +1987,32 @@ (tmpfree (get-df "/tmp")) (uname (get-uname "-srvpio")) (hostname (get-host-name))) (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree))) +;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) +#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) + (let* ((cpuload (get-cpu-load)) + (diskfree (get-df (current-directory))) + (remtries 10)) + (handle-exceptions + exn + (if (> remtries 0) + (begin + (print-call-chain (current-error-port)) + (debug:print-info 0 *default-log-port* "WARNING: failed to set meta info. Will try " remtries " more times") + (set! remtries (- remtries 1)) + (thread-sleep! 10) + (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) + (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) + (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up") + (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 5 *default-log-port* "exn=" (condition->list exn)) + (debug:print 0 *default-log-port* " status: " ((condition-property-accessor 'sqlite3 'status) exn)) + (print-call-chain (current-error-port)))) + (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)))) ;;====================================================================== ;; A R C H I V I N G ;;======================================================================