Skip to content

Commit 392fc11

Browse files
committed
[executor] Improve task killing logic so it takes out the process group
1 parent 32fc0b3 commit 392fc11

File tree

1 file changed

+27
-2
lines changed

1 file changed

+27
-2
lines changed

executor/executor.go

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ import (
4141
"os/exec"
4242
"strings"
4343
"sync"
44+
"syscall"
4445
"time"
4546

4647
"github.com/AliceO2Group/Control/common"
@@ -436,6 +437,10 @@ func launch(state *internalState, task mesos.TaskInfo) {
436437
}
437438
taskCmd.Env = append(os.Environ(), commandInfo.Env...)
438439

440+
// We must setpgid(2) in order to be able to kill the whole process group which consists of
441+
// the containing shell and all of its children
442+
taskCmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
443+
439444
var errStdout, errStderr error
440445
stdoutIn, _ := taskCmd.StdoutPipe()
441446
stderrIn, _ := taskCmd.StderrPipe()
@@ -503,6 +508,21 @@ func launch(state *internalState, task mesos.TaskInfo) {
503508
WithField("task", task.Name).
504509
Debug("task running and ready for control input")
505510
break
511+
} else if reachedState == "DONE" || reachedState == "ERROR" {
512+
// something went wrong, the device moved to DONE or ERROR on startup
513+
state.mu.Lock()
514+
log.Debug("state locked")
515+
status := newStatus(state, task.TaskID)
516+
_ = syscall.Kill(-taskCmd.Process.Pid, syscall.SIGKILL)
517+
518+
log.WithField("task", task.Name).Debug("task killed")
519+
status.State = mesos.TASK_FAILED.Enum()
520+
521+
state.killedTasks[task.TaskID] = status
522+
523+
log.Debug("unlocking state")
524+
state.mu.Unlock()
525+
return
506526
} else if elapsed >= startupTimeout {
507527
err = errors.New("timeout while waiting for task startup")
508528
log.WithField("task", task.Name).Error(err.Error())
@@ -566,7 +586,7 @@ func launch(state *internalState, task mesos.TaskInfo) {
566586
"task": task.Name,
567587
}).
568588
Warning("killing leftover process")
569-
err = taskCmd.Process.Kill()
589+
err = syscall.Kill(-taskCmd.Process.Pid, syscall.SIGKILL)
570590
if err != nil {
571591
log.WithFields(logrus.Fields{
572592
"process": taskCmd.Process.Pid,
@@ -771,7 +791,12 @@ func kill(state *internalState, e *executor.Event_Kill) error {
771791

772792
status := newStatus(state, e.GetTaskID())
773793

774-
_ = rpcClient.TaskCmd.Process.Kill()
794+
// When killing we must always use syscall.Kill with a negative PID, in order to kill all
795+
// children which were assigned the same PGID at launch
796+
killErr := syscall.Kill(-rpcClient.TaskCmd.Process.Pid, syscall.SIGKILL)
797+
if killErr != nil {
798+
log.WithError(killErr).WithField("taskId", e.GetTaskID()).Warning("could not kill task")
799+
}
775800

776801
if reachedState == "DONE" {
777802
log.Debug("task exited correctly")

0 commit comments

Comments
 (0)