341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
|
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
|
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
+
-
-
-
-
-
-
-
-
-
+
-
-
-
-
+
+
-
+
-
-
+
-
-
-
+
+
+
+
|
(and runtlim (let* ((run-seconds (- (current-seconds) start-seconds))
(time-exceeded (> run-seconds runtlim)))
(if time-exceeded
(begin
(debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
#t)
#f)))))
;; open-run-close not needed for test-set-meta-info
;; (tests:set-partial-meta-info #f test-id run-id minutes work-area)
(tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)
;; (tests:set-partial-meta-info #f test-id run-id minutes work-area 10)
;; (tests:set-partial-meta-info test-id run-id minutes work-area)
(if kill-job?
(begin
(mutex-lock! m)
;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
;; section and the runit section? Or add a loop that tries three times with a 1/4 second
;; between tries?
(let* ((pid (vector-ref exit-info 0)))
(if (number? pid)
(handle-exceptions
exn
(debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
(let* ((pid1 (vector-ref exit-info 0))
(pid2 (rmt:test-get-top-process-pid run-id test-id))
(pids (delete-duplicates (filter number? (list pid1 pid2)))))
(if (not (null? pids))
(begin
(for-each
(lambda (pid)
(handle-exceptions
exn
(debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
;;(process-signal pid signal/kill))
(begin
(debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
(let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
(for-each
(debug:print 0 "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")")
(process-signal pid signal/int)
(thread-sleep! 5)
(lambda (p)
(let* ((parts (string-split p))
(p-id (if (> (length parts) 0)
(string->number (car parts))
#f)))
(if p-id
(begin
(debug:print 0 "Killing " (cadr parts) "; kill -9 " p-id)
;; (process-signal pid signal/kill))))) ;;
(process-signal pid signal/kill)))
(system (conc "kill -9 " p-id))))))
(car processes)))
(system (conc "kill -9 -" pid))
(tests:test-set-status! test-id "KILLED" "KILLED" (args:get-arg "-m") #f)))
pids)
(tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f))
(begin
(debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process")
(debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2)
(tests:test-set-status! test-id "KILLED" "KILLED" (args:get-arg "-m") #f)
(tests:test-set-status! run-id test-id "KILLED" "FAIL" (args:get-arg "-m") #f)
(tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f)
(exit 1) ;; IS THIS NECESSARY OR WISE???
)))
(set! kill-tries (+ 1 kill-tries))
(mutex-unlock! m)))
(mutex-unlock! m)
;; no point in sticking around. Exit now.
(exit)))
(if keep-going
(begin
(thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
(if keep-going
(loop (calc-minutes)))))))
(tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)))) ;; NOTE: Checking twice for keep-going is intentional
(th1 (make-thread monitorjob "monitor job"))
(th2 (make-thread runit "run job")))
(set! job-thread th2)
(thread-start! th1)
(thread-start! th2)
(thread-join! th2)
(debug:print-info 0 "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...")
(set! keep-going #f)
(thread-join! th1)
(thread-sleep! 1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
(mutex-lock! m)
(let* ((item-path (item-list->path itemdat))
;; only state and status needed - use lazy routine
(testinfo (rmt:get-testinfo-state-status run-id test-id)))
|