468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
|
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
|
-
+
+
|
;; BB: question to Matt -- does tests:test-set-status! encompass the rollup to the toplevel test? If not, should it?
(tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt
(begin
(debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2)
(tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt
)))
(mutex-unlock! m)
;; no point in sticking around. Exit now.
;; no point in sticking around. Exit now. But run end of run before exiting?
(launch:end-of-run-check run-id)
(exit)))
(if (hash-table-ref/default misc-flags 'keep-going #f)
(begin
(thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
(if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(loop (calc-minutes)
(or new-cpu-load cpu-load)
|
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
|
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
|
-
-
+
|
(mutex-lock! m)
(let* ((item-path (item-list->path itemdat))
;; only state and status needed - use lazy routine
(testinfo (rmt:get-testinfo-state-status run-id test-id)))
;; Am I completed?
(if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED"))
(let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status
;; "COMPLETED"
;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test
;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test
)
(new-status (cond
((not (launch:einf-exit-status exit-info)) "FAIL") ;; job failed to run ... (vector-ref exit-info 1)
((eq? (launch:einf-rollup-status exit-info) 0) ;; (vector-ref exit-info 3)
;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO)
(if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS"))
((eq? (launch:einf-rollup-status exit-info) 1) "FAIL") ;; (vector-ref exit-info 3)
|
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
|
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
|
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
))
;; for automated creation of the rollup html file this is a good place...
(if (not (equal? item-path ""))
(tests:summarize-items run-id test-id test-name #f))
(tests:summarize-test run-id test-id) ;; don't force - just update if no
(rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id)))
(mutex-unlock! m)
(launch:end-of-run-check run-id )
(debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area "
work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n")
(if (not (launch:einf-exit-status exit-info))
(exit 4))))
)))
;; Spec for End of test
;; At end of each test call, after marking self as COMPLETED do run-state-status-rollup
;; At transition to run COMPLETED/X do hooks
;; Definition: test_dead if event_time + duration + 1 minute? < current_time AND
;; we can prove the process is not alive (ssh host pstree -A pid)
;; if dead safe to mark the test as killed in the db
;; State/status table
;; new
;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here
;; Roll up the state/status of run-id, then decide what to do next:
;;   * all tests launched and not-completed count is 0 -> run the post hook
;;   * more than 3 tests running                       -> only log the count
;;   * 1 to 3 tests running                            -> ask launch:kill-tests-if-dead to
;;                                                        flag dead tests; if every running
;;                                                        test was flagged (and all tests
;;                                                        were launched) check again
;;   * anything else                                   -> log each test not yet completed
;;
(define (launch:end-of-run-check run-id )
  (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))
         (running-cnt       (rmt:get-count-tests-running-for-run-id run-id))
         ;; NOTE(review): the key is spelled "lunch-complete-" here; presumably the
         ;; code that stores it uses the same spelling -- verify before renaming.
         (launched-flag     (rmt:get-var (conc "lunch-complete-" run-id)))
         (run-state         (rmt:get-run-state run-id))
         (run-status        (rmt:get-run-status run-id))
         ;; only the exact value "yes" counts; a missing value never does
         (all-launched?     (equal? launched-flag "yes")))
    (debug:print 0 *default-log-port* "rollup run state/status")
    (rmt:set-state-status-and-roll-up-run run-id run-state run-status)
    (cond
     ;; everything was launched and nothing is left incomplete
     ((and all-launched? (eq? not-completed-cnt 0))
      (debug:print 0 *default-log-port* "look for post hook.")
      (runs:run-post-hook run-id))
     ;; plenty of tests still running, nothing to do yet
     ((> running-cnt 3)
      (debug:print 0 *default-log-port* "There are " running-cnt " tests running." ))
     ;; only a few stragglers left, find out if they are really alive
     ((> running-cnt 0)
      (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" )
      (let ((kill-cnt (launch:kill-tests-if-dead run-id)))
        (if (and all-launched? (eq? kill-cnt running-cnt))
            (launch:end-of-run-check run-id)))) ;;todo
     (else
      (debug:print 0 *default-log-port* "Should it get here?? May be everything is not launched yet. Running test cnt:" running-cnt " Not completed test cnt:" not-completed-cnt)
      ;; report every test that has not reached a completed state
      (for-each
       (lambda (test-rec)
         (let ((test-name (vector-ref test-rec 2))
               (item-path (vector-ref test-rec 11)))
           (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed")))
       (rmt:get-tests-for-run run-id "%" '("NOT_STARTED" "RUNNING" "LAUNCHED" "REMOTEHOSTSTART") '() #f #f #f #f #f #f #f #f))))))
;; Check whether process pid can be seen on host by running
;; "ssh <host> pstree -A <pid>" and looking for any output.
;;
;; Returns #f only when the command produced no output lines. When host or
;; pid is #f, or host is "n/a", no check is made and #t is returned.
;;
(define (launch:is-test-alive host pid)
  (cond
   ;; not enough information to probe the host - report alive
   ((not (and host pid (not (equal? host "n/a")))) #t)
   (else
    (let* ((pstree-cmd (conc "ssh " host " pstree -A " pid))
           (out-lines  (with-input-from-pipe pstree-cmd read-lines)))
      (print "cmd: " pstree-cmd "\n op: " out-lines )
      ;; any output at all is taken as proof the process exists
      (not (null? out-lines))))))
;; Look at every test of run-id that is in state RUNNING, LAUNCHED or
;; REMOTEHOSTSTART and set it to KILLREQ (status "n/a") when both:
;;   1. event-time + duration + 600 is earlier than the current time, i.e. the
;;      test has not updated its duration in the last 10 minutes, and
;;   2. launch:is-test-alive reports that its top process is not alive on the host.
;;
;; Returns the number of tests for which a kill was requested. Returns 0 when
;; no tests are in those states (previously this case raised an error from
;; taking the car of an empty list).
;;
(define (launch:kill-tests-if-dead run-id)
  (let ((running-tests (rmt:get-tests-for-run run-id "%" '("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") '() #f #f #f #f #f #f #f #f)))
    (let loop ((remaining running-tests)
               (kill-cnt  0))
      (if (null? remaining)
          kill-cnt
          (let* ((running-test (car remaining))
                 (test-name    (vector-ref running-test 2))
                 (item-path    (vector-ref running-test 11))
                 (test-id      (vector-ref running-test 0))
                 (host         (vector-ref running-test 6))
                 (pid          (rmt:test-get-top-process-pid run-id test-id))
                 (event-time   (vector-ref running-test 5))
                 (duration     (vector-ref running-test 12))
                 (curr-time    (current-seconds))
                 ;; the cheap time check goes first so the ssh probe is only made
                 ;; for tests that already look stale
                 (dead?        (and (< (+ event-time duration 600) curr-time)
                                    (not (launch:is-test-alive host pid)))))
            (if dead?
                (begin
                  (debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
                  (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f)))
            (loop (cdr remaining)
                  (if dead? (+ kill-cnt 1) kill-cnt)))))))
;; DO NOT USE - caching of configs is handled in launch:setup now.
;;
(define (launch:cache-config)
;; if we have a linktree and -runtests and -target and the directory exists dump the config
;; to megatest-(current-seconds).cfg and symlink it to megatest.cfg
(if (and *configdat*
(or (args:get-arg "-run")
|