@@ -41,6 +41,7 @@ import (
4141 "os/exec"
4242 "strings"
4343 "sync"
44+ "syscall"
4445 "time"
4546
4647 "github.com/AliceO2Group/Control/common"
@@ -436,6 +437,10 @@ func launch(state *internalState, task mesos.TaskInfo) {
436437 }
437438 taskCmd .Env = append (os .Environ (), commandInfo .Env ... )
438439
440+ // We must setpgid(2) in order to be able to kill the whole process group which consists of
441+ // the containing shell and all of its children
442+ taskCmd .SysProcAttr = & syscall.SysProcAttr {Setpgid : true }
443+
439444 var errStdout , errStderr error
440445 stdoutIn , _ := taskCmd .StdoutPipe ()
441446 stderrIn , _ := taskCmd .StderrPipe ()
@@ -503,6 +508,21 @@ func launch(state *internalState, task mesos.TaskInfo) {
503508 WithField ("task" , task .Name ).
504509 Debug ("task running and ready for control input" )
505510 break
511+ } else if reachedState == "DONE" || reachedState == "ERROR" {
512+ // something went wrong, the device moved to DONE or ERROR on startup
513+ state .mu .Lock ()
514+ log .Debug ("state locked" )
515+ status := newStatus (state , task .TaskID )
516+ _ = syscall .Kill (- taskCmd .Process .Pid , syscall .SIGKILL )
517+
518+ log .WithField ("task" , task .Name ).Debug ("task killed" )
519+ status .State = mesos .TASK_FAILED .Enum ()
520+
521+ state .killedTasks [task .TaskID ] = status
522+
523+ log .Debug ("unlocking state" )
524+ state .mu .Unlock ()
525+ return
506526 } else if elapsed >= startupTimeout {
507527 err = errors .New ("timeout while waiting for task startup" )
508528 log .WithField ("task" , task .Name ).Error (err .Error ())
@@ -566,7 +586,7 @@ func launch(state *internalState, task mesos.TaskInfo) {
566586 "task" : task .Name ,
567587 }).
568588 Warning ("killing leftover process" )
569- err = taskCmd .Process .Kill ( )
589+ err = syscall . Kill ( - taskCmd .Process .Pid , syscall . SIGKILL )
570590 if err != nil {
571591 log .WithFields (logrus.Fields {
572592 "process" : taskCmd .Process .Pid ,
@@ -771,7 +791,12 @@ func kill(state *internalState, e *executor.Event_Kill) error {
771791
772792 status := newStatus (state , e .GetTaskID ())
773793
774- _ = rpcClient .TaskCmd .Process .Kill ()
794+ // When killing we must always use syscall.Kill with a negative PID, in order to kill all
795+ // children which were assigned the same PGID at launch
796+ killErr := syscall .Kill (- rpcClient .TaskCmd .Process .Pid , syscall .SIGKILL )
797+ if killErr != nil {
798+ log .WithError (killErr ).WithField ("taskId" , e .GetTaskID ()).Warning ("could not kill task" )
799+ }
775800
776801 if reachedState == "DONE" {
777802 log .Debug ("task exited correctly" )
0 commit comments