Megatest: Diff

Differences From Artifact [1931a96c9c]:

File launch.scm — part of check-in [e96d85c5b3] at 2022-03-19 17:38:31 on branch v1.65 — Fixed behavior when disks are non-existent. https://hsdes.intel.com/appstore/article/#/14015976511 (user: mmgraham, size: 88338) [annotate] [blame] [check-ins using]

To Artifact [2a102f22e8]:

File launch.scm — part of check-in [15a47376a6] at 2024-08-15 18:16:40 on branch v1.65-adjutant-patched-forward-defunct — Patched forward adjutant code (user: matt, size: 90419) [annotate] [blame] [check-ins using]

︙			︙
166 167 168 169 170 171 172 ~~173~~ 174 175 176 177 178 179 180 181 182 ~~183 184 185 186 187 188~~ ~~189 190 191 192 193 194 195~~ 196 197 198 ~~199 200 201 202~~ 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 ~~221~~ 222 223 224 225 226 227 228 229 230 231 232 233 234	(set! ezsteps #t) ;; set the needed flag (set! ezstepslst (append (or ezstepslst '()) (list (list "subrun" (conc "{subrun=true} " mt-cmd))))))) ;; process the ezsteps (if ezsteps ~~(let* (~~(all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; ~~where 'params is the params list (add other stuff as needed)~~ (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) ;; if ezsteps was defined then we are sure to have at least one step but check anyway (if (not (> (length ezstepslst) 0)) (debug:print-error 0 default-log-port "ezsteps defined but ezstepslst is zero length") (let ((all-step-names (map car ezstepslst)) (status-file (file-open "ezsteps.status" (+ open/append open/wronly open/creat))) ) (setenv "MT_STEP_NAMES" (string-intersperse all-step-names " ")) (let loop ((ezstep (car ezstepslst)) (tal (cdr ezstepslst)) (prevstep #f)) (debug:print-info 0 default-log-port "Processing ezstep \"" (string-intersperse ezstep " ") "\"") ;; check exit-info (vector-ref exit-info 1) (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) (let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m ~~tal testconfig all-steps-dat))~~ (stepname (car ezstep)) (stepparms (hash-table-ref all-steps-dat stepname))) (setenv "MT_STEP_NAME" stepname) (pp (hash-table->alist all-steps-dat)) ;; if logpro-used read in the stepname.dat file (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) (launch:load-logpro-dat run-id test-id stepname)) (file-write status-file (conc stepname " " (launch:einf-exit-code exit-info) "\n")) (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms) (if (not (null? tal)) (loop (car tal) (cdr tal) stepname)) (debug:print 0 default-log-port "WARNING: step " (car ezstep) " failed. Stopping"))) (debug:print 0 default-log-port "WARNING: a prior step failed, stopping at " ezstep))) (file-close status-file) ) )))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) (let* ((update-period (string->number (or (configf:lookup configdat "setup" "test-stats-update-period") "30"))) (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) ~~(tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)~~ (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds	> \| > > \| \| \| \| \| \| > \| \| \| \| \| \| \| \| \| \| \| \| > >	166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240	(set! ezsteps #t) ;; set the needed flag (set! ezstepslst (append (or ezstepslst '()) (list (list "subrun" (conc "{subrun=true} " mt-cmd))))))) ;; process the ezsteps (if ezsteps (let* ((envdbf (conc "/tmp/."(current-user-name)"-"(current-process-id)"-"run-id"-"test-id".db")) (all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; ;;; where 'params is the params list (add other ;;; stuff as needed) (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) ;; if ezsteps was defined then we are sure to have at least one step but check anyway (if (not (> (length ezstepslst) 0)) (debug:print-error 0 default-log-port "ezsteps defined but ezstepslst is zero length") (let ((all-step-names (map car ezstepslst)) (status-file (file-open "ezsteps.status" (+ open/append open/wronly open/creat))) ) (setenv "MT_STEP_NAMES" (string-intersperse all-step-names " ")) (let loop ((ezstep (car ezstepslst)) (tal (cdr ezstepslst)) (prevstep #f)) (debug:print-info 0 default-log-port "Processing ezstep \"" (string-intersperse ezstep " ") "\"") ;; check exit-info (vector-ref exit-info 1) (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) (let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat prevstep envdbf)) (stepname (car ezstep)) (stepparms (hash-table-ref all-steps-dat stepname))) (setenv "MT_STEP_NAME" stepname) (pp (hash-table->alist all-steps-dat)) ;; if logpro-used read in the stepname.dat file (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) (launch:load-logpro-dat run-id test-id stepname)) (file-write status-file (conc stepname " " (launch:einf-exit-code exit-info) "\n")) (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms) (if (not (null? tal)) (loop (car tal) (cdr tal) stepname)) (debug:print 0 default-log-port "WARNING: step " (car ezstep) " failed. Stopping"))) (debug:print 0 default-log-port "WARNING: a prior step failed, stopping at " ezstep))) (file-close status-file) ) )))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) (let* ((update-period (string->number (or (configf:lookup configdat "setup" "test-stats-update-period") "30"))) (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) ;; top of loop encountered at "(current-seconds)" with ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds
︙			︙
251 252 253 254 255 256 257 ~~258~~ 259 260 261 262 263 264 ~~265 266 267 268 269 270~~ 271 ~~272~~ 273 274 275 276 277 278 279	((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") ~~(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)~~ (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 default-log-port "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (~~when~~ do-sync ~~;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)~~ ~~;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))~~ ~~;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))~~ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) ~~;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))~~ ) (if kill-job? (begin (debug:print-info 0 default-log-port "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries?	\| \| < < < \| < <	257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280	((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 default-log-port "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (if do-sync ;; save meta data about the running of this test (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) ) (if kill-job? (begin (debug:print-info 0 default-log-port "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries?
︙			︙
322 323 324 325 326 327 328 ~~329~~ 330 331 332 333 334 335 336	(begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) ~~(tests:update-central-meta-info run-id test-id ~~(get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional~~~~ (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) ;;(bb-check-path msg: "launch:execute incoming")	\| > > >	323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340	(begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory)) (calc-minutes) #f #f update-db: #t)) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) ;;(bb-check-path msg: "launch:execute incoming")
︙			︙
408 409 410 411 412 413 414 ~~415~~ 416 417 418 419 420 421 422	(debug:print 0 default-log-port "INFO: missing files from " work-area ": " (string-intersperse bad-files ", ")) (launch:test-copy testpath work-area)))) ;; one more time, change to the work-area directory (change-directory work-area))) ) ;; let* (if contour (setenv "MT_CONTOUR" contour)) ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... ;; (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! toppath top-path) (change-directory toppath) ;; temporarily switch to the run area home (setenv "MT_TEST_RUN_DIR" work-area)	<	412 413 414 415 416 417 418 419 420 421 422 423 424 425	(debug:print 0 default-log-port "INFO: missing files from " work-area ": " (string-intersperse bad-files ", ")) (launch:test-copy testpath work-area)))) ;; one more time, change to the work-area directory (change-directory work-area))) ) ;; let* (if contour (setenv "MT_CONTOUR" contour)) ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... ;; (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! toppath top-path) (change-directory toppath) ;; temporarily switch to the run area home (setenv "MT_TEST_RUN_DIR" work-area)
︙			︙
475 476 477 478 479 480 481 482 483 484 485 486 487 488	(db:test-get-host test-info) (begin (debug:print 0 default-log-port "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 default-log-port "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running	> > >	478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494	(db:test-get-host test-info) (begin (debug:print 0 default-log-port "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ;;((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun ;; (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked ;; (member (db:test-get-status test-info) '("ABORT")))) ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 default-log-port "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running
︙			︙
807 808 809 810 811 812 813 ~~814~~ 815 816 817 818 819 820 821	(debug:print 0 default-log-port "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) ;; replaced below with version that does not ssh if checking on localhost #;(define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) ~~~~(let* (~~(cmd (conc "ssh " ~~host "~~ pstree -A " pid))~~ (output (with-input-from-pipe cmd read-lines))) (debug:print 2 default-log-port "Running " cmd " received " output) (if (eq? (length output) 0) #f #t)) #t))	> > \|	813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829	(debug:print 0 default-log-port "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) ;; replaced below with version that does not ssh if checking on localhost #;(define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) (let* ((is-local (equal? host (get-host-name))) (ssh-cmd (if is-local " " (conc "ssh " host " "))) (cmd (conc ssh-cmd "pstree -A " pid)) (output (with-input-from-pipe cmd read-lines))) (debug:print 2 default-log-port "Running " cmd " received " output) (if (eq? (length output) 0) #f #t)) #t))
︙			︙
1417 1418 1419 1420 1421 1422 1423 ~~1424~~ 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 ~~1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448~~ 1449 ~~1450~~ 1451 1452 1453 1454 1455 1456 1457 1458 1459 ~~1460 1461 1462 1463 1464 1465 1466 1467 1468 1469~~ 1470 1471 1472 1473 1474 1475 1476	(else #f)))) (when do-scan? (debug:print 1 default-log-port "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! launch-setup-mutex) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ( ~~;; (~~lock~~-key (~~co~~nc "~~test-" t~~est-id))~~ ~~;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key))~~ ~~;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds~~ ~~;; (if (car lock)~~ ~~;; #t~~ ~~;; (if (> (current-seconds) expire-time)~~ ~~;; (begin~~ ~~;; (debug:print-info 0 default-log-port "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path)~~ ~~;; (rmt:no-sync-del! lock-key) ;; destroy the lock~~ ~~;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;;~~ ~~;; (begin~~ ~~;; (thread-sleep! 1)~~ ~~;; (loop (rmt:no-sync-get-lock lock-key) expire-time))))))~~ (item-path (item-list->path itemdat)) ~~(contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))~~ (let loop ((delta (- (current-seconds) last-launch)) (launch-delay (configf:lookup-number configdat "setup" "launch-delay" default: 0))) (if (> launch-delay delta) (begin (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. (debug:print-info 0 default-log-port "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) last-launch) launch-delay)))) (change-directory toppath) ~~(alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", maybe - the longer they are set the longer each launch takes (must be non-overlapping with the vars)~~ (append (list (list "MT_RUN_AREA_HOME" toppath) (list "MT_TEST_NAME" test-name) (list "MT_RUNNAME" runname) (list "MT_ITEMPATH" item-path) (list "MT_CONTOUR" contour) ) itemdat)) (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed ;; for tconfig, why do we allow fallback to test-conf? (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) (begin (debug:print 0 default-log-port "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") test-conf))) ;; force re-read now that all vars are set (useshell (let ((ush (configf:lookup configdat "jobtools" "useshell")))	\| > > > > > > > > \| < < < < < < < < < < < < \| > > > < \| \| \| \| \| \| \| \| \| > > > > > > >	1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489	(else #f)))) (when do-scan? (debug:print 1 default-log-port "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) (defstruct launch:ajt (vars '()) (exekey #f) (host-type #f) (test-sig #f) (cmdline #f)) ;; append vars (define (launch:ajt-add-vars dat vars) (launch:ajt-vars-set! dat (append (launch:ajt-vars dat) vars))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! launch-setup-mutex) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* (;; locking code removed from here commented out and pasted at end of file (item-path (item-list->path itemdat)) (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour"))) ;; launcher-mode will be 'adjutant or 'normal (launcher-mode (string->symbol (or (configf:lookup configdat "jobtools" "mode") "normal"))) (ajtdat (make-launch:ajt))) (let loop ((delta (- (current-seconds) last-launch)) (launch-delay (configf:lookup-number configdat "setup" "launch-delay" default: 0))) (if (> launch-delay delta) (begin (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. (debug:print-info 0 default-log-port "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) last-launch) launch-delay)))) (change-directory toppath) (let ((var-list (append (list (list "MT_RUN_AREA_HOME" toppath) (list "MT_TEST_NAME" test-name) (list "MT_RUNNAME" runname) (list "MT_ITEMPATH" item-path) (list "MT_CONTOUR" contour) ) itemdat))) ;; consolidate this code with the code in megatest.scm for ;; "-execute", maybe - the longer they are set the longer ;; each launch takes (must be non-overlapping with the vars) (alist->env-vars var-list) ;; the var-list into the ajtdat adjutant record whether it is needed or not. (launch:ajt-add-vars ajtdat var-list)) (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed ;; for tconfig, why do we allow fallback to test-conf? (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) (begin (debug:print 0 default-log-port "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") test-conf))) ;; force re-read now that all vars are set (useshell (let ((ush (configf:lookup configdat "jobtools" "useshell")))
︙			︙
1484 1485 1486 1487 1488 1489 1490 ~~1491 1492 1493 1494~~ 1495 ~~1496 1497 1498~~ ~~1499 1500 1501 1502 1503 1504 1505~~ ~~1506~~ 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 ~~1524~~ 1525 1526 1527 1528 1529 1530 1531	(subrun (> (length (hash-table-ref/default tconfig "subrun" '())) 0)) ;; send a flag to process a subrun ;; (diskspace (configf:lookup tconfig "requirements" "diskspace")) ;; (memory (configf:lookup tconfig "requirements" "memory")) ;; (hosts (configf:lookup configdat "jobtools" "workhosts")) ;; I'm pretty sure this was never completed (remote-megatest (configf:lookup configdat "setup" "executable")) (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") (configf:lookup configdat "setup" "runtimelim"))) ~~;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to~~ ~~;; allow running from dashboard. Extract the path~~ ~~;; from the called megatest and convert dashboard~~ ~~;; or dboard to megatest~~ (local-megatest (common:find-local-megatest)) ~~#;(local-megatest (let* ((lm (car (argv)))~~ ~~(dir (pathname-directory lm))~~ ~~(exe (pathname-strip-directory lm)))~~ (~~conc (~~if di~~r (conc dir "/") ""~~) ~~(case~~ (string->s~~ymbo~~l ~~exe~~) ~~((dboard) "../megatest")~~ ~~((mtest) "../megatest")~~ ~~((dashboard) "megatest")~~ ~~(else exe)))))~~ ~~(launcher (common:get-launcher configdat test-name item-path)) ;; (configf:lookup configdat "jobtools" "launcher"))~~ ~~(test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ~~;; (item-list->path itemdat))) ;; test-path is the full path including the item-path~~~~ (work-area #f) (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) (testinfo (rmt:get-test-info-by-id run-id test-id)) (mt_target (string-intersperse (map cadr keyvals) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '()) (if (configf:lookup configdat "misc" "profilesw") (list (configf:lookup configdat "misc" "profilesw")) '())))) ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) ~~(if launcher (set! launcher (string-split launcher)))~~ ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 default-log-port "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record	< < < < < < < > \| \| < < < < < > > \| > > > > > > > > > \|	1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544	(subrun (> (length (hash-table-ref/default tconfig "subrun" '())) 0)) ;; send a flag to process a subrun ;; (diskspace (configf:lookup tconfig "requirements" "diskspace")) ;; (memory (configf:lookup tconfig "requirements" "memory")) ;; (hosts (configf:lookup configdat "jobtools" "workhosts")) ;; I'm pretty sure this was never completed (remote-megatest (configf:lookup configdat "setup" "executable")) (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") (configf:lookup configdat "setup" "runtimelim"))) (local-megatest (common:find-local-megatest)) (launcher (let ((l (common:get-launcher configdat test-name item-path launcher-mode))) (if (string? l) (string-split l) l))) ;; some nonhomogenuity here. '(cmd param1 param2 ...) OR '(host-type launcher) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) (work-area #f) (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) (testinfo (rmt:get-test-info-by-id run-id test-id)) (mt_target (string-intersperse (map cadr keyvals) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '()) (if (configf:lookup configdat "misc" "profilesw") (list (configf:lookup configdat "misc" "profilesw")) '())))) ;; save the test-sig in the ajtdat record (launch:ajt-test-sig-set! ajtdat test-sig) ;; go ahead and figure out if we have a host-type from the ;; launcher call above and save it in the ajtdat record (if (and (eq? launcher-mode 'adjutant) (list? launcher) (> (length launcher) 1)) (launch:ajt-host-type-set! ajtdat (car launcher))) ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) ;; (if launcher (set! launcher (string-split launcher))) ;; yuk! ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 default-log-port "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record
︙			︙
1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 ~~1590~~ ~~1591 1592 1593~~ 1594 1595 ~~1596~~ 1597 1598 1599 ~~1600~~ 1601 1602 1603 1604 1605 ~~1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616~~ ~~1617 1618~~ 1619 1620 ~~1621~~ ~~1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632~~ 1633 1634 ~~1635~~ 1636 ~~1637~~ 1638 1639 1640 1641 1642 1643 1644	(list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default configdat "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) (setenv "MT_CMDINFO" cmdparms) ;; setting this for use in nblauncher ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (common:file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir ~~(cond~~ ~~;; ((and~~ launch~~er ho~~st~~s) ;; must be us~~ing s~~sh hostnam~~e ~~;; (set! fullcmd~~ (append ~~launcher (car hosts)~~(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) ~~;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms))))~~ (launcher (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) ~~;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms))))~~ (else (if (not useshell)(debug:print 0 default-log-port "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) ~~;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" "")))))~~ (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) (debug:print 1 default-log-port "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done (debug:print 4 default-log-port "fullcmd: " fullcmd) (set! last-launch (current-seconds)) ;; all that junk above takes time, set this as late as possible. ~~(let* ((commonprevvals (alist->env-vars~~ (hash-table-ref/default configdat "env-override" '()))) (~~misc~~prevvals (alist->env-vars ~~;; c~~o~~nsol~~ida~~te this code with the code in megatest.scm for "-execute"~~ (append (list (list "MT_TEST_RUN_DIR" work-area) (list "MT_TEST_NAME" test-name) (list "MT_ITEM_INFO" (conc itemdat)) (list "MT_RUNNAME" runname) (list "MT_TARGET" mt_target) (list "MT_ITEMPATH" item-path) ) itemdat))) ~~(testprevvals (alist->env-vars ~~(hash-table-ref/default tconfig "pre-launch-env-overrides" '())))~~~~ ;; Launchwait defaults to true, must override it to turn off wait (launchwait (if (equal? (configf:lookup configdat "setup" "launchwait") "no") #f #t)) ~~~~(launch-results-prev (apply (if launchwait~~ ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed.~~ process:cmd-run-with-stderr-and-exitcode->list process-run) (if useshell (let ((cmdstr (string-intersperse fullcmd " "))) (if launchwait cmdstr (conc cmdstr " >> mt_launch.log 2>&1 &"))) (car fullcmd)) (if useshell '() (cdr fullcmd)))) (success (if launchwait (equal? 0 (cadr launch-results-prev)) #t)) (launch-results (if launchwait (car launch-results-prev) launch-results-prev))) ~~(if (not success)~~ (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED")) ~~(mutex-unlock! launch-setup-mutex) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.~~ ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) (if (list? launch-results)	> > > \| > \| \| < > < \| < \| \| \| \| \| \| \| \| < \| > > \| \| \| > > > \| \| \| \| \| \| \| \| \| \| \| > > > > > > > > > > > > > > > > > > > > > > > > \| <	1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686	(list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default configdat "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) (setenv "MT_CMDINFO" cmdparms) ;; setting this for use in nblauncher ;; save the cmdparms in the ajtdat (launch:ajt-exekey-set! ajtdat cmdparms) ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (common:file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir ;; save the command line for adjutant mode (might never be needed but best to assemble it here) (launch:ajt-cmdline-set! ajtdat (string-intersperse (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) (cond (launcher (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) (else (if (not useshell)(debug:print 0 default-log-port "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) (debug:print 1 default-log-port "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done (debug:print 4 default-log-port "fullcmd: " fullcmd) (set! last-launch (current-seconds)) ;; all that junk above takes time, set this as late as possible. (let* ((env-override-vars (hash-table-ref/default configdat "env-override" '())) (commonprevvals (alist->env-vars env-override-vars)) (misc-vars (append (list (list "MT_TEST_RUN_DIR" work-area) (list "MT_TEST_NAME" test-name) (list "MT_ITEM_INFO" (conc itemdat)) (list "MT_RUNNAME" runname) (list "MT_TARGET" mt_target) (list "MT_ITEMPATH" item-path)) itemdat)) (miscprevvals (alist->env-vars misc-vars));; consolidate this code with the code in megatest.scm for "-execute" (test-vars (hash-table-ref/default tconfig "pre-launch-env-overrides" '())) (testprevvals (alist->env-vars test-vars)) ;; Launchwait defaults to true, must override it to turn off wait (launchwait (if (equal? (configf:lookup configdat "setup" "launchwait") "no") #f #t)) ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. (launch-results-prev (if (eq? launcher-mode 'adjutant) '(#t 0) ;; just some fake data to fool downstream but non-applicable code (apply (if launchwait process:cmd-run-with-stderr-and-exitcode->list process-run) (if useshell (let ((cmdstr (string-intersperse fullcmd " "))) (if launchwait cmdstr (conc cmdstr " >> mt_launch.log 2>&1 &"))) (car fullcmd)) (if useshell '() (cdr fullcmd))))) (success (if launchwait (equal? 0 (cadr launch-results-prev)) #t)) (launch-results (if launchwait (car launch-results-prev) launch-results-prev))) (launch:ajt-add-vars ajtdat env-override-vars) (launch:ajt-add-vars ajtdat misc-vars) (launch:ajt-add-vars ajtdat test-vars) ;; if in adjutant mode we register the job in the jobs_queue ;; then fire off an adjutant runner ;; (if (eq? launcher-mode 'adjutant) (let* ((adjutant-runner-cmd (append (cdr launcher) (list remote-megatest "-adjutant" (launch:ajt-host-type ajtdat) "-start-dir" toppath))) (adj-cmd (conc (string-intersperse (map conc adjutant-runner-cmd) " ") "&"))) (rmt:no-sync-add-job (launch:ajt-host-type ajtdat) (launch:ajt-vars ajtdat) (launch:ajt-exekey ajtdat) (launch:ajt-cmdline ajtdat)) (print "adj-cmd: " adj-cmd) (system adj-cmd) )) (if (not success) (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED")) ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) (if (list? launch-results)
︙			︙
1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669	;; but this hack will work! Thanks go to Alan Post of the Chicken email list ;; NB// Is this still needed? Should be safe to go back to "exit" now? (process-signal (current-process-id) signal/kill) )) (alist->env-vars miscprevvals) (alist->env-vars testprevvals) (alist->env-vars commonprevvals) launch-results)) (change-directory toppath) (thread-sleep! (configf:lookup-number configdat "setup" "inter-test-delay" default: 0.0)))) ;; recover a test where the top controlling mtest may have died ;; (define (launch:recover-test run-id test-id)	> > > >	1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715	;; but this hack will work! Thanks go to Alan Post of the Chicken email list ;; NB// Is this still needed? Should be safe to go back to "exit" now? (process-signal (current-process-id) signal/kill) )) (alist->env-vars miscprevvals) (alist->env-vars testprevvals) (alist->env-vars commonprevvals) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. ;; the unlock previously was further up. This seemed wrong as we should not proceed until the ;; vars have been reset. (mutex-unlock! launch-setup-mutex) launch-results)) (change-directory toppath) (thread-sleep! (configf:lookup-number configdat "setup" "inter-test-delay" default: 0.0)))) ;; recover a test where the top controlling mtest may have died ;; (define (launch:recover-test run-id test-id)
︙			︙
1685 1686 1687 1688 1689 1690 1691	(read-symbolic-link (conc "/proc/" pid "/cwd")) #f))) ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid)))	> > > > > > > > > > > > > > > >	1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753	(read-symbolic-link (conc "/proc/" pid "/cwd")) #f))) ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid))) ;; (lock-key (conc "test-" test-id)) ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds ;; (if (car lock) ;; #t ;; (if (> (current-seconds) expire-time) ;; (begin ;; (debug:print-info 0 default-log-port "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) ;; (rmt:no-sync-del! lock-key) ;; destroy the lock ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; ;; (begin ;; (thread-sleep! 1) ;; (loop (rmt:no-sync-get-lock lock-key) expire-time))))))