From 0924f08b9caaf473fa27c7e49a67659ce5172506 Mon Sep 17 00:00:00 2001 From: Sefi Tufan Date: Wed, 25 Feb 2026 21:00:21 +0000 Subject: [PATCH] feat: improve error handling for gadget goroutine failures Use errgroup.WithContext to manage gadget goroutine lifecycle instead of fire-and-forget goroutines. RunAll now returns an *errgroup.Group so the caller can Wait() on gadget completion and receive errors. - If any gadget fails, the errgroup context is canceled, stopping other gadgets - Caller distinguishes graceful shutdown from unexpected failures - Added tests for error propagation and context cancellation Closes #53 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Sefi Tufan --- cmd/micromize/root.go | 16 +++++-- internal/gadget/registry.go | 39 +++++++++++---- internal/gadget/registry_test.go | 81 ++++++++++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 17 deletions(-) diff --git a/cmd/micromize/root.go b/cmd/micromize/root.go index 723d061..8b712c2 100644 --- a/cmd/micromize/root.go +++ b/cmd/micromize/root.go @@ -17,6 +17,7 @@ package main import ( "context" _ "embed" + "errors" "fmt" "log/slog" "os" @@ -168,12 +169,21 @@ func run(ctx context.Context) error { }) // Run all gadgets - if err := registry.RunAll(ctx); err != nil { + g, err := registry.RunAll(ctx) + if err != nil { return fmt.Errorf("running gadgets: %w", err) } - // Wait for context to be done (which happens on signal) - <-ctx.Done() + // Wait for all gadgets to complete. + // On graceful shutdown (signal), context is canceled and gadgets stop. + // If a gadget fails unexpectedly, errgroup cancels the context, stopping the rest. + if err := g.Wait(); err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return nil + } + return fmt.Errorf("gadget error: %w", err) + } + return nil } diff --git a/internal/gadget/registry.go b/internal/gadget/registry.go index 3569704..3c5f4eb 100644 --- a/internal/gadget/registry.go +++ b/internal/gadget/registry.go @@ -17,9 +17,9 @@ package gadget import ( "context" "fmt" - "log/slog" gadgetcontext "github.com/inspektor-gadget/inspektor-gadget/pkg/gadget-context" + "golang.org/x/sync/errgroup" ) // RuntimeManager defines the interface for running gadgets @@ -61,24 +61,43 @@ func (r *Registry) Register(name string, config *GadgetConfig) { r.gadgets[name] = config } -// RunAll starts all registered gadgets -func (r *Registry) RunAll(ctx context.Context) error { +// RunAll starts all registered gadgets and returns an errgroup that the caller +// can Wait() on. If any gadget fails, the errgroup's context is canceled, +// signaling other gadgets to stop. +func (r *Registry) RunAll(ctx context.Context) (*errgroup.Group, error) { + g, gCtx := errgroup.WithContext(ctx) + + // Pre-pass: create all gadget contexts before starting any goroutines. + // This ensures that if CreateContext fails, no goroutines have been started. + type gadgetEntry struct { + name string + config *GadgetConfig + gadgetCtx *gadgetcontext.GadgetContext + } + + entries := make([]gadgetEntry, 0, len(r.gadgets)) for name, config := range r.gadgets { contextManager := r.defaultContextManager if config.Context != nil { contextManager = config.Context } - gadgetCtx, err := contextManager.CreateContext(ctx, config.Bytes, config.ImageName) + gadgetCtx, err := contextManager.CreateContext(gCtx, config.Bytes, config.ImageName) if err != nil { - return fmt.Errorf("creating context for gadget %s: %w", name, err) + return nil, fmt.Errorf("creating context for gadget %s: %w", name, err) } - go func(name string, config *GadgetConfig, gadgetCtx *gadgetcontext.GadgetContext) { - if err := r.runtimeManager.RunGadget(gadgetCtx, config.Params); err != nil { - slog.Error("Error running gadget", "name", name, "error", err) + entries = append(entries, gadgetEntry{name: name, config: config, gadgetCtx: gadgetCtx}) + } + + for _, e := range entries { + g.Go(func() error { + if err := r.runtimeManager.RunGadget(e.gadgetCtx, e.config.Params); err != nil { + return fmt.Errorf("running gadget %s: %w", e.name, err) } - }(name, config, gadgetCtx) + return nil + }) } - return nil + + return g, nil } diff --git a/internal/gadget/registry_test.go b/internal/gadget/registry_test.go index 978a923..f637ab2 100644 --- a/internal/gadget/registry_test.go +++ b/internal/gadget/registry_test.go @@ -16,9 +16,9 @@ package gadget import ( "context" + "fmt" "sync" "testing" - "time" gadgetcontext "github.com/inspektor-gadget/inspektor-gadget/pkg/gadget-context" ) @@ -80,14 +80,87 @@ func TestRegistry_RunAll(t *testing.T) { Params: map[string]string{"foo": "bar"}, }) - if err := r.RunAll(context.Background()); err != nil { + g, err := r.RunAll(context.Background()) + if err != nil { t.Fatalf("RunAll failed: %v", err) } + if err := g.Wait(); err != nil { + t.Fatalf("Wait returned error: %v", err) + } + select { case <-done: // success - case <-time.After(2 * time.Millisecond): - t.Fatal("timeout waiting for RunGadget") + default: + t.Fatal("RunGadget was never called") + } +} + +func TestRegistry_RunAll_ErrorPropagation(t *testing.T) { + mockRuntime := &mockRuntimeManager{ + runGadgetFunc: func(gadgetCtx *gadgetcontext.GadgetContext, params map[string]string) error { + return fmt.Errorf("gadget startup failed") + }, + } + + mockContext := &mockContextCreator{} + r := NewRegistry(mockContext, mockRuntime) + + r.Register("failing-gadget", &GadgetConfig{ + ImageName: "test-image", + Params: map[string]string{}, + }) + + g, err := r.RunAll(context.Background()) + if err != nil { + t.Fatalf("RunAll failed: %v", err) + } + + err = g.Wait() + if err == nil { + t.Fatal("expected error from Wait, got nil") + } + + expected := "running gadget failing-gadget: gadget startup failed" + if err.Error() != expected { + t.Errorf("expected error %q, got %q", expected, err.Error()) + } +} + +func TestRegistry_RunAll_ContextCancellation(t *testing.T) { + mockRuntime := &mockRuntimeManager{ + runGadgetFunc: func(gadgetCtx *gadgetcontext.GadgetContext, params map[string]string) error { + // Simulate a long-running gadget that respects context cancellation + <-gadgetCtx.Context().Done() + return gadgetCtx.Context().Err() + }, + } + + mockContext := &mockContextCreator{} + r := NewRegistry(mockContext, mockRuntime) + + r.Register("long-running", &GadgetConfig{ + ImageName: "test-image", + Params: map[string]string{}, + }) + + r.Register("another", &GadgetConfig{ + ImageName: "test-image", + Params: map[string]string{}, + }) + + ctx, cancel := context.WithCancel(context.Background()) + g, err := r.RunAll(ctx) + if err != nil { + t.Fatalf("RunAll failed: %v", err) + } + + // Cancel the context to simulate shutdown signal + cancel() + + err = g.Wait() + if err == nil { + t.Fatal("expected error from Wait after cancellation, got nil") } }