Overview
Comment: | Refactored launch.scm run and monitor jobs code |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.61 |
Files: | files | file ages | folders |
SHA1: |
5b74ca3241eb5ff985c35b5dcbae5986 |
User & Date: | mrwellan on 2016-05-09 12:33:19 |
Other Links: | branch diff | manifest | tags |
Context
2016-05-10
| ||
11:57 | Added message to test steps panel check-in: b0c7d17ae2 user: mrwellan tags: v1.61 | |
2016-05-09
| ||
12:33 | Refactored launch.scm run and monitor jobs code check-in: 5b74ca3241 user: mrwellan tags: v1.61 | |
10:05 | Bumped version. Removed sig/stop from exit handler. Added calls to load <stepname>.dat into db check-in: 78985bc0f1 user: mrwellan tags: v1.61 | |
Changes
Modified launch.scm from [d762027aa5] to [3dcfc93438].
︙ | ︙ | |||
213 214 215 216 217 218 219 | (tests:test-set-status! run-id test-id next-state "PASS" #f #f)) (else ;; 'fail (launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" (tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f) ))) logpro-used)) | > > > > > > > > | > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 | (tests:test-set-status! run-id test-id next-state "PASS" #f #f)) (else ;; 'fail (launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" (tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f) ))) logpro-used)) (define (launch:manage-steps run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m) ;; (let-values ;; (((pid exit-status exit-code) ;; (run-n-wait fullrunscript))) ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) ;; Since we should have a clean slate at this time there is no need to do ;; any of the other stuff that tests:test-set-status! does. Let's just ;; force RUNNING/n/a ;; (thread-sleep! 0.3) (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "RUNNING") ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here ;; if there is a runscript do it first (if fullrunscript (let ((pid (process-run fullrunscript))) (rmt:test-set-top-process-pid run-id test-id pid) (let loop ((i 0)) (let-values (((pid-val exit-status exit-code) (process-wait pid #t))) (mutex-lock! m) (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) (launch:einf-rollup-status-set! exit-info exit-code) ;; (vector-set! exit-info 3 exit-code) ;; rollup status (mutex-unlock! m) (if (eq? pid-val 0) (begin (thread-sleep! 2) (loop (+ i 1))) ))))) ;; then, if runscript ran ok (or did not get called) ;; do all the ezsteps (if any) (if ezsteps (let* ((testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here? ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic ;; ezstep names need a full re-eval here. (tests:get-testconfig test-name tconfigreg #t force-create: #t)) ;; 'return-procs))) (ezstepslst (if (hash-table? testconfig) (hash-table-ref/default testconfig "ezsteps" '()) #f))) (if testconfig (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ... (begin (launch:setup) (debug:print 0 "WARNING: no testconfig found for " test-name " in search path:\n " (string-intersperse (tests:get-tests-search-path *configdat*) "\n ")))) ;; after all that, still no testconfig? Time to abort (if (not testconfig) (begin (debug:print 0 "ERROR: Failed to resolve megatest.config, runconfigs.config and testconfig issues. Giving up now") (exit 1))) (if (not (file-exists? ".ezsteps"))(create-directory ".ezsteps")) ;; if ezsteps was defined then we are sure to have at least one step but check anyway (if (not (> (length ezstepslst) 0)) (debug:print 0 "ERROR: ezsteps defined but ezstepslst is zero length") (let loop ((ezstep (car ezstepslst)) (tal (cdr ezstepslst)) (prevstep #f)) ;; check exit-info (vector-ref exit-info 1) (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) (let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig)) (stepname (car ezstep))) ;; if logpro-used read in the stepname.dat file (if (and logpro-used (file-exists? (conc stepname ".dat"))) (launch:load-logpro-dat stepname)) (if (steprun-good? logpro-used (launch:einf-exit-code exit-info)) (if (not (null? tal)) (loop (car tal) (cdr tal) stepname)) (debug:print 4 "WARNING: step " (car ezstep) " failed. Stopping"))) (debug:print 4 "WARNING: a prior step failed, stopping at " ezstep))))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) (let* ((start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (get-cpu-load)) (disk-free (get-df (current-directory)))) (let ((new-cpu-load (let* ((load (get-cpu-load)) (delta (abs (- load cpu-load)))) (if (> delta 0.6) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (get-df (current-directory))) (delta (abs (- df disk-free)))) (if (> delta 200) ;; ignore changes under 200 Meg df #f)))) (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)) (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) (if time-exceeded (begin (debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) (pid2 (rmt:test-get-top-process-pid run-id test-id)) (pids (delete-duplicates (filter number? (list pid1 pid2))))) (if (not (null? pids)) (begin (for-each (lambda (pid) (handle-exceptions exn (begin (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))) (debug:print 0 "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") (debug:print-info 0 "Signal mask=" (signal-mask)) ;; (if (process:alive? pid) ;; (begin (map (lambda (pid-num) (process-signal pid-num signal/term)) (process:get-sub-pids pid)) (thread-sleep! 5) ;; (if (process:process-alive? pid) (map (lambda (pid-num) (handle-exceptions exn #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 "not killing process " pid " as it is not alive")))) pids) (tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f)) (begin (debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f) ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg (tests:get-all))) (setenv "MT_CMDINFO" encoded-cmd) (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed) ;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1)) (let* ((testpath (assoc/default 'testpath cmdinfo)) ;; testpath is the test spec area (top-path (assoc/default 'toppath cmdinfo)) (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area |
︙ | ︙ | |||
252 253 254 255 256 257 258 | (if (substring-index "/" runscript) runscript ;; use unadultered if contains slashes (let ((fulln (conc testpath "/" runscript))) (if (and (file-exists? fulln) (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path | | < | 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 | (if (substring-index "/" runscript) runscript ;; use unadultered if contains slashes (let ((fulln (conc testpath "/" runscript))) (if (and (file-exists? fulln) (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path ) ;; (rollup-status 0) ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) (begin |
︙ | ︙ | |||
417 418 419 420 421 422 423 | ;; any previous runs ;; (db:test-remove-steps db run-id testname itemdat) ;; (let* ((m (make-mutex)) (kill-job? #f) (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status (job-thread #f) | | < < < < < < < < | | < < < < < < < < < < < < < < < < < < < < < < | < < < < < | < < < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 | ;; any previous runs ;; (db:test-remove-steps db run-id testname itemdat) ;; (let* ((m (make-mutex)) (kill-job? #f) (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status (job-thread #f) ;; (keep-going #t) (misc-flags (let ((ht (make-hash-table))) (hash-table-set! ht 'keep-going #t) ht)) (runit (lambda () (launch:manage-steps run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m))) (monitorjob (lambda () (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags))) (th1 (make-thread monitorjob "monitor job")) (th2 (make-thread runit "run job"))) (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) (debug:print-info 0 "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") (hash-table-set! misc-flags 'keep-going #f) (thread-join! th1) (thread-sleep! 1) ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) ;; only state and status needed - use lazy routine (testinfo (rmt:get-testinfo-state-status run-id test-id))) ;; Am I completed? |
︙ | ︙ |