Skip to content

Commit f23e783

Browse files
Sefi TufanCopilot
andcommitted
feat: improve error handling for gadget goroutine failures
Use errgroup.WithContext to manage gadget goroutine lifecycle instead of fire-and-forget goroutines. RunAll now returns an *errgroup.Group so the caller can Wait() on gadget completion and receive errors. - If any gadget fails, the errgroup context is canceled, stopping other gadgets - Caller distinguishes graceful shutdown from unexpected failures - Added tests for error propagation and context cancellation Closes #53 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Sefi Tufan <sefitufan@microsoft.com>
1 parent 5c56568 commit f23e783

3 files changed

Lines changed: 114 additions & 17 deletions

File tree

cmd/micromize/root.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package main
1717
import (
1818
"context"
1919
_ "embed"
20+
"errors"
2021
"fmt"
2122
"log/slog"
2223
"os"
@@ -160,12 +161,21 @@ func run(ctx context.Context) error {
160161
})
161162

162163
// Run all gadgets
163-
if err := registry.RunAll(ctx); err != nil {
164+
g, err := registry.RunAll(ctx)
165+
if err != nil {
164166
return fmt.Errorf("running gadgets: %w", err)
165167
}
166168

167-
// Wait for context to be done (which happens on signal)
168-
<-ctx.Done()
169+
// Wait for all gadgets to complete.
170+
// On graceful shutdown (signal), context is canceled and gadgets stop.
171+
// If a gadget fails unexpectedly, errgroup cancels the context, stopping the rest.
172+
if err := g.Wait(); err != nil {
173+
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
174+
return nil
175+
}
176+
return fmt.Errorf("gadget error: %w", err)
177+
}
178+
169179
return nil
170180
}
171181

internal/gadget/registry.go

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ package gadget
1717
import (
1818
"context"
1919
"fmt"
20-
"log/slog"
2120

2221
gadgetcontext "github.com/inspektor-gadget/inspektor-gadget/pkg/gadget-context"
22+
"golang.org/x/sync/errgroup"
2323
)
2424

2525
// RuntimeManager defines the interface for running gadgets
@@ -61,24 +61,43 @@ func (r *Registry) Register(name string, config *GadgetConfig) {
6161
r.gadgets[name] = config
6262
}
6363

64-
// RunAll starts all registered gadgets
65-
func (r *Registry) RunAll(ctx context.Context) error {
64+
// RunAll starts all registered gadgets and returns an errgroup that the caller
65+
// can Wait() on. If any gadget fails, the errgroup's context is canceled,
66+
// signaling other gadgets to stop.
67+
func (r *Registry) RunAll(ctx context.Context) (*errgroup.Group, error) {
68+
g, gCtx := errgroup.WithContext(ctx)
69+
70+
// Pre-pass: create all gadget contexts before starting any goroutines.
71+
// This ensures that if CreateContext fails, no goroutines have been started.
72+
type gadgetEntry struct {
73+
name string
74+
config *GadgetConfig
75+
gadgetCtx *gadgetcontext.GadgetContext
76+
}
77+
78+
entries := make([]gadgetEntry, 0, len(r.gadgets))
6679
for name, config := range r.gadgets {
6780
contextManager := r.defaultContextManager
6881
if config.Context != nil {
6982
contextManager = config.Context
7083
}
7184

72-
gadgetCtx, err := contextManager.CreateContext(ctx, config.Bytes, config.ImageName)
85+
gadgetCtx, err := contextManager.CreateContext(gCtx, config.Bytes, config.ImageName)
7386
if err != nil {
74-
return fmt.Errorf("creating context for gadget %s: %w", name, err)
87+
return nil, fmt.Errorf("creating context for gadget %s: %w", name, err)
7588
}
7689

77-
go func(name string, config *GadgetConfig, gadgetCtx *gadgetcontext.GadgetContext) {
78-
if err := r.runtimeManager.RunGadget(gadgetCtx, config.Params); err != nil {
79-
slog.Error("Error running gadget", "name", name, "error", err)
90+
entries = append(entries, gadgetEntry{name: name, config: config, gadgetCtx: gadgetCtx})
91+
}
92+
93+
for _, e := range entries {
94+
g.Go(func() error {
95+
if err := r.runtimeManager.RunGadget(e.gadgetCtx, e.config.Params); err != nil {
96+
return fmt.Errorf("running gadget %s: %w", e.name, err)
8097
}
81-
}(name, config, gadgetCtx)
98+
return nil
99+
})
82100
}
83-
return nil
101+
102+
return g, nil
84103
}

internal/gadget/registry_test.go

Lines changed: 72 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ package gadget
1616

1717
import (
1818
"context"
19+
"fmt"
1920
"sync"
2021
"testing"
21-
"time"
2222

2323
gadgetcontext "github.com/inspektor-gadget/inspektor-gadget/pkg/gadget-context"
2424
)
@@ -80,14 +80,82 @@ func TestRegistry_RunAll(t *testing.T) {
8080
Params: map[string]string{"foo": "bar"},
8181
})
8282

83-
if err := r.RunAll(context.Background()); err != nil {
83+
g, err := r.RunAll(context.Background())
84+
if err != nil {
8485
t.Fatalf("RunAll failed: %v", err)
8586
}
8687

88+
if err := g.Wait(); err != nil {
89+
t.Fatalf("Wait returned error: %v", err)
90+
}
91+
8792
select {
8893
case <-done:
8994
// success
90-
case <-time.After(2 * time.Millisecond):
91-
t.Fatal("timeout waiting for RunGadget")
95+
default:
96+
t.Fatal("RunGadget was never called")
97+
}
98+
}
99+
100+
func TestRegistry_RunAll_ErrorPropagation(t *testing.T) {
101+
mockRuntime := &mockRuntimeManager{
102+
runGadgetFunc: func(gadgetCtx *gadgetcontext.GadgetContext, params map[string]string) error {
103+
return fmt.Errorf("gadget startup failed")
104+
},
105+
}
106+
107+
mockContext := &mockContextCreator{}
108+
r := NewRegistry(mockContext, mockRuntime)
109+
110+
r.Register("failing-gadget", &GadgetConfig{
111+
ImageName: "test-image",
112+
Params: map[string]string{},
113+
})
114+
115+
g, err := r.RunAll(context.Background())
116+
if err != nil {
117+
t.Fatalf("RunAll failed: %v", err)
118+
}
119+
120+
err = g.Wait()
121+
if err == nil {
122+
t.Fatal("expected error from Wait, got nil")
123+
}
124+
125+
expected := "running gadget failing-gadget: gadget startup failed"
126+
if err.Error() != expected {
127+
t.Errorf("expected error %q, got %q", expected, err.Error())
128+
}
129+
}
130+
131+
func TestRegistry_RunAll_ContextCancellation(t *testing.T) {
132+
mockRuntime := &mockRuntimeManager{
133+
runGadgetFunc: func(gadgetCtx *gadgetcontext.GadgetContext, params map[string]string) error {
134+
// Simulate a long-running gadget that respects context cancellation
135+
<-gadgetCtx.Context().Done()
136+
return gadgetCtx.Context().Err()
137+
},
138+
}
139+
140+
mockContext := &mockContextCreator{}
141+
r := NewRegistry(mockContext, mockRuntime)
142+
143+
r.Register("long-running", &GadgetConfig{
144+
ImageName: "test-image",
145+
Params: map[string]string{},
146+
})
147+
148+
ctx, cancel := context.WithCancel(context.Background())
149+
g, err := r.RunAll(ctx)
150+
if err != nil {
151+
t.Fatalf("RunAll failed: %v", err)
152+
}
153+
154+
// Cancel the context to simulate shutdown signal
155+
cancel()
156+
157+
err = g.Wait()
158+
if err == nil {
159+
t.Fatal("expected error from Wait after cancellation, got nil")
92160
}
93161
}

0 commit comments

Comments
 (0)