From efb9eaf5e03af8a18ac518a4633a5cee3a846eae Mon Sep 17 00:00:00 2001 From: Zaq? Question Date: Wed, 4 Mar 2026 22:45:02 -0800 Subject: [PATCH 1/6] feat: Support configuring custom span counters to aid trace insights The goal of this feature is to enrich traces leaving refinery with more information about what happened in the trace and ease querying on that information in honeycomb. Scenario: Finding endpoints with lots of database interactions Challenge: In honeycomb (and most trace providers) querying is done on span events themselves, and there's only a single layer of aggregation. You can query for a count of events where db.statement is set and group that by resolver name. You'll end up with the total number of database interactions per endpoint. Useful, but you can't take the avg or 99th% sum of database interactions and see what endpoints are having issues. I'll note that it is possible to create a calculated field to count the interactions, compute a sum of that, and group by `trace.trace_id` to see what endpoints are the worst offenders. If you export this table, you can throw it into a sheet and do a second aggregation to get the avg of those sums. That said, it's really expensive to do these querys and I've found them to be really slow. Thus the goal of these changes is to make it dramatically easier to query and monitor these values in widgets. For example: I might setup a filter for spans with `db.statement` to count the number of database interactions in a trace. Then in honeycomb I can take the avg of a new database_interactions attribute now set on root spans. --- collect/collect.go | 68 ++++++- collect/collect_test.go | 214 ++++++++++++++++++++ config/config.go | 2 + config/file_config.go | 16 +- config/mock.go | 8 + config/span_counter_config.go | 217 +++++++++++++++++++++ config/span_counter_config_test.go | 302 +++++++++++++++++++++++++++++ sample/rules.go | 156 +-------------- 8 files changed, 825 insertions(+), 158 deletions(-) create mode 100644 config/span_counter_config.go create mode 100644 config/span_counter_config_test.go diff --git a/collect/collect.go b/collect/collect.go index ea430892f1..da7f3a282d 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -113,7 +113,8 @@ type InMemCollector struct { hostname string - memMetricSample []rtmetrics.Sample // Memory monitoring using runtime/metrics + memMetricSample []rtmetrics.Sample // Memory monitoring using runtime/metrics + spanCounterConfigs []config.SpanCounterConfig } // These are the names of the metrics we use to track the number of events sent to peers through the router. @@ -171,6 +172,7 @@ func (i *InMemCollector) Start() error { i.Logger.Info().WithField("num_workers", numWorkers).Logf("Starting InMemCollector with %d workers", numWorkers) i.StressRelief.UpdateFromConfig() + i.initSpanCounterConfigs() // Set queue capacity metrics for stress relief calculations i.Metrics.Store(DENOMINATOR_INCOMING_CAP, float64(imcConfig.IncomingQueueSize)) i.Metrics.Store(DENOMINATOR_PEER_CAP, float64(imcConfig.PeerQueueSize)) @@ -240,6 +242,7 @@ func (i *InMemCollector) reloadConfigs() { i.SamplerFactory.ClearDynsamplers() i.StressRelief.UpdateFromConfig() + i.initSpanCounterConfigs() // Send reload signals to all workers to clear their local samplers // so that the new configuration will be propagated @@ -691,6 +694,60 @@ func (i *InMemCollector) addAdditionalAttributes(sp *types.Span) { } } +// initSpanCounterConfigs loads and initializes span counter configs from the current config. +// Must be called at startup and on config reload. +func (i *InMemCollector) initSpanCounterConfigs() { + cfgs := i.Config.GetSpanCounterConfig() + for j := range cfgs { + if err := cfgs[j].Init(); err != nil { + i.Logger.Error().WithField("error", err).Logf("failed to initialize span counter config entry %q", cfgs[j].Key) + } + } + i.mutex.Lock() + i.spanCounterConfigs = cfgs + i.mutex.Unlock() +} + +// computeCustomCounts computes each counter's value by iterating all spans in the trace +// and attaches the results to the root span. +// Returns nil, nil if there are no counters configured or no root span. +// +// Stress relief note: this runs inside sendTraces(), the sole consumer of the +// tracesToSend channel. Work is O(N×M) — N spans × M counters — so large +// traces with many counters slow the consumer, which deepens the outgoing +// queue. The stress relief system monitors queue depth as one of its stress +// inputs, so heavy custom-count configurations can raise the measured stress +// level and trigger earlier activation of stress relief. Additionally, spans +// processed via ProcessSpanImmediately (the stress-relief fast path) bypass the +// trace buffer entirely and never reach sendTraces, so custom counts are not +// computed or attached to stress-sampled traces. +func (i *InMemCollector) computeCustomCounts(t sendableTrace) (*types.Span, map[string]int64) { + i.mutex.RLock() + counters := i.spanCounterConfigs + i.mutex.RUnlock() + + if len(counters) == 0 { + return nil, nil + } + + targetSpan := t.RootSpan + if targetSpan == nil { + return nil, nil + } + + var rootData config.SpanData = &targetSpan.Data + counts := make(map[string]int64, len(counters)) + for _, sp := range t.GetSpans() { + for _, counter := range counters { + if counter.MatchesSpan(&sp.Data, rootData) { + counts[counter.Key]++ + } + } + } + + return targetSpan, counts +} + func (i *InMemCollector) sendTraces() { defer i.sendTracesWG.Done() @@ -698,6 +755,8 @@ func (i *InMemCollector) sendTraces() { i.Metrics.Histogram("collector_outgoing_queue", float64(len(i.tracesToSend))) _, span := otelutil.StartSpanMulti(context.Background(), i.Tracer, "sendTrace", map[string]interface{}{"num_spans": t.DescendantCount(), "tracesToSend_size": len(i.tracesToSend)}) + customCountTarget, customCounts := i.computeCustomCounts(t) + for _, sp := range t.GetSpans() { if i.Config.GetAddRuleReasonToTrace() { @@ -721,6 +780,13 @@ func (i *InMemCollector) sendTraces() { } } + // set custom span counts on the target span (root if present, else best fallback) + if sp == customCountTarget { + for k, v := range customCounts { + sp.Data.Set(k, v) + } + } + isDryRun := i.Config.GetIsDryRun() if isDryRun { sp.Data.Set(config.DryRunFieldName, t.shouldSend) diff --git a/collect/collect_test.go b/collect/collect_test.go index ffa97cab8b..fcc950a189 100644 --- a/collect/collect_test.go +++ b/collect/collect_test.go @@ -1901,6 +1901,220 @@ func TestWorkerHealthReporting(t *testing.T) { }, 2*time.Second, 50*time.Millisecond, "InMemCollector should be healthy again after worker resumes") } +// customCountConf returns a base MockConfig suitable for custom span count tests. +func customCountConf(counters []config.SpanCounterConfig) *config.MockConfig { + return &config.MockConfig{ + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, + SampleCache: config.SampleCacheConfig{ + KeptSize: 100, + DroppedSize: 100, + SizeCheckInterval: config.Duration(1 * time.Second), + }, + GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, + TraceIdFieldNames: []string{"trace.trace_id", "traceId"}, + ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, + GetCollectionConfigVal: config.CollectionConfig{ + WorkerCount: 2, + ShutdownDelay: config.Duration(1 * time.Millisecond), + IncomingQueueSize: 10, + PeerQueueSize: 10, + }, + SpanCounterConfigs: counters, + } +} + +// TestCustomSpanCounts_NoCounters verifies that when no counters are configured +// no custom fields are added to any span. +func TestCustomSpanCounts_NoCounters(t *testing.T) { + coll := newTestCollector(t, customCountConf(nil)) + transmission := coll.Transmission.(*transmit.MockTransmission) + + traceID := "no-counters" + coll.AddSpanFromPeer(&types.Span{ + TraceID: traceID, + Event: &types.Event{ + Dataset: "test", + Data: types.NewPayload(coll.Config, map[string]interface{}{"trace.parent_id": "x"}), + APIKey: legacyAPIKey, + }, + }) + coll.AddSpan(&types.Span{ + TraceID: traceID, + IsRoot: true, + Event: &types.Event{Dataset: "test", Data: types.NewPayload(coll.Config, nil), APIKey: legacyAPIKey}, + }) + + events := transmission.GetBlock(2) + for _, ev := range events { + assert.Nil(t, ev.Data.Get("my.count"), "no custom count fields should be set when no counters are configured") + } +} + +// TestCustomSpanCounts_CountsLandOnRoot verifies that a counter with no +// conditions counts all spans and attaches the result to the root span only. +func TestCustomSpanCounts_CountsLandOnRoot(t *testing.T) { + counters := []config.SpanCounterConfig{ + {Key: "all_spans"}, + } + coll := newTestCollector(t, customCountConf(counters)) + transmission := coll.Transmission.(*transmit.MockTransmission) + + traceID := "root-target" + for i := 0; i < 3; i++ { + coll.AddSpanFromPeer(&types.Span{ + TraceID: traceID, + Event: &types.Event{ + Dataset: "test", + Data: types.NewPayload(coll.Config, map[string]interface{}{"trace.parent_id": "x"}), + APIKey: legacyAPIKey, + }, + }) + } + coll.AddSpan(&types.Span{ + TraceID: traceID, + IsRoot: true, + Event: &types.Event{Dataset: "test", Data: types.NewPayload(coll.Config, nil), APIKey: legacyAPIKey}, + }) + + events := transmission.GetBlock(4) + require.Equal(t, 4, len(events)) + + var rootEvent *types.Event + var childEvents []*types.Event + for _, ev := range events { + if ev.Data.Get("trace.parent_id") == nil { + rootEvent = ev + } else { + childEvents = append(childEvents, ev) + } + } + + require.NotNil(t, rootEvent) + // all 4 spans counted (3 children + root) + assert.Equal(t, int64(4), rootEvent.Data.Get("all_spans")) + for _, child := range childEvents { + assert.Nil(t, child.Data.Get("all_spans"), "custom count should not be set on child spans") + } +} + +// TestCustomSpanCounts_ConditionalCounting verifies that only spans matching +// a condition are counted. +func TestCustomSpanCounts_ConditionalCounting(t *testing.T) { + counters := []config.SpanCounterConfig{ + { + Key: "error_spans", + Conditions: []*config.RulesBasedSamplerCondition{ + {Field: "error", Operator: config.EQ, Value: true}, + }, + }, + } + coll := newTestCollector(t, customCountConf(counters)) + transmission := coll.Transmission.(*transmit.MockTransmission) + + traceID := "conditional" + // 2 error spans + for i := 0; i < 2; i++ { + coll.AddSpanFromPeer(&types.Span{ + TraceID: traceID, + Event: &types.Event{ + Dataset: "test", + Data: types.NewPayload(coll.Config, map[string]interface{}{"trace.parent_id": "x", "error": true}), + APIKey: legacyAPIKey, + }, + }) + } + // 2 non-error spans + for i := 0; i < 2; i++ { + coll.AddSpanFromPeer(&types.Span{ + TraceID: traceID, + Event: &types.Event{ + Dataset: "test", + Data: types.NewPayload(coll.Config, map[string]interface{}{"trace.parent_id": "x"}), + APIKey: legacyAPIKey, + }, + }) + } + coll.AddSpan(&types.Span{ + TraceID: traceID, + IsRoot: true, + Event: &types.Event{Dataset: "test", Data: types.NewPayload(coll.Config, nil), APIKey: legacyAPIKey}, + }) + + events := transmission.GetBlock(5) + require.Equal(t, 5, len(events)) + + var rootEvent *types.Event + for _, ev := range events { + if ev.Data.Get("trace.parent_id") == nil { + rootEvent = ev + } + } + require.NotNil(t, rootEvent) + assert.Equal(t, int64(2), rootEvent.Data.Get("error_spans")) +} + +// TestCustomSpanCounts_MultipleCounters verifies that multiple counters with +// different conditions produce independent counts on the root span. +func TestCustomSpanCounts_MultipleCounters(t *testing.T) { + counters := []config.SpanCounterConfig{ + { + Key: "db_spans", + Conditions: []*config.RulesBasedSamplerCondition{ + {Field: "db.system", Operator: config.Exists}, + }, + }, + { + Key: "error_spans", + Conditions: []*config.RulesBasedSamplerCondition{ + {Field: "error", Operator: config.EQ, Value: true}, + }, + }, + } + coll := newTestCollector(t, customCountConf(counters)) + transmission := coll.Transmission.(*transmit.MockTransmission) + + traceID := "multi-counter" + spans := []map[string]interface{}{ + {"trace.parent_id": "x", "db.system": "postgresql"}, + {"trace.parent_id": "x", "db.system": "postgresql", "error": true}, + {"trace.parent_id": "x", "error": true}, + {"trace.parent_id": "x"}, + } + for _, data := range spans { + coll.AddSpanFromPeer(&types.Span{ + TraceID: traceID, + Event: &types.Event{ + Dataset: "test", + Data: types.NewPayload(coll.Config, data), + APIKey: legacyAPIKey, + }, + }) + } + coll.AddSpan(&types.Span{ + TraceID: traceID, + IsRoot: true, + Event: &types.Event{Dataset: "test", Data: types.NewPayload(coll.Config, nil), APIKey: legacyAPIKey}, + }) + + events := transmission.GetBlock(5) + require.Equal(t, 5, len(events)) + + var rootEvent *types.Event + for _, ev := range events { + if ev.Data.Get("trace.parent_id") == nil { + rootEvent = ev + } + } + require.NotNil(t, rootEvent) + assert.Equal(t, int64(2), rootEvent.Data.Get("db_spans"), "2 spans have db.system") + assert.Equal(t, int64(2), rootEvent.Data.Get("error_spans"), "2 spans have error=true") +} + // BenchmarkCollectorWithSamplers runs benchmarks for different sampler configurations. // This is a tricky benchmark to interpret because just setting up the input data // can easily be more expensive than the collector's routing code. The goal is to diff --git a/config/config.go b/config/config.go index 224fe07d76..cbbd211265 100644 --- a/config/config.go +++ b/config/config.go @@ -151,6 +151,8 @@ type Config interface { GetAddCountsToRoot() bool + GetSpanCounterConfig() []SpanCounterConfig + GetConfigMetadata() []ConfigMetadata GetSampleCacheConfig() SampleCacheConfig diff --git a/config/file_config.go b/config/file_config.go index 43206dae90..dbcf87a7b7 100644 --- a/config/file_config.go +++ b/config/file_config.go @@ -189,10 +189,11 @@ func (dt *DefaultTrue) UnmarshalText(text []byte) error { } type RefineryTelemetryConfig struct { - AddRuleReasonToTrace bool `yaml:"AddRuleReasonToTrace"` - AddSpanCountToRoot *DefaultTrue `yaml:"AddSpanCountToRoot" default:"true"` // Avoid pointer woe on access, use GetAddSpanCountToRoot() instead. - AddCountsToRoot bool `yaml:"AddCountsToRoot"` - AddHostMetadataToTrace *DefaultTrue `yaml:"AddHostMetadataToTrace" default:"true"` // Avoid pointer woe on access, use GetAddHostMetadataToTrace() instead. + AddRuleReasonToTrace bool `yaml:"AddRuleReasonToTrace"` + AddSpanCountToRoot *DefaultTrue `yaml:"AddSpanCountToRoot" default:"true"` // Avoid pointer woe on access, use GetAddSpanCountToRoot() instead. + AddCountsToRoot bool `yaml:"AddCountsToRoot"` + AddHostMetadataToTrace *DefaultTrue `yaml:"AddHostMetadataToTrace" default:"true"` // Avoid pointer woe on access, use GetAddHostMetadataToTrace() instead. + CustomSpanCounts []SpanCounterConfig `yaml:"CustomSpanCounts,omitempty"` } type TracesConfig struct { @@ -1116,6 +1117,13 @@ func (f *fileConfig) GetAddCountsToRoot() bool { return f.mainConfig.Telemetry.AddCountsToRoot } +func (f *fileConfig) GetSpanCounterConfig() []SpanCounterConfig { + f.mux.RLock() + defer f.mux.RUnlock() + + return f.mainConfig.Telemetry.CustomSpanCounts +} + func (f *fileConfig) GetSampleCacheConfig() SampleCacheConfig { f.mux.RLock() defer f.mux.RUnlock() diff --git a/config/mock.go b/config/mock.go index 785197a795..57eef3af73 100644 --- a/config/mock.go +++ b/config/mock.go @@ -52,6 +52,7 @@ type MockConfig struct { AdditionalErrorFields []string AddSpanCountToRoot bool AddCountsToRoot bool + SpanCounterConfigs []SpanCounterConfig CacheOverrunStrategy string SampleCache SampleCacheConfig StressRelief StressReliefConfig @@ -415,6 +416,13 @@ func (f *MockConfig) GetAddCountsToRoot() bool { return f.AddSpanCountToRoot } +func (f *MockConfig) GetSpanCounterConfig() []SpanCounterConfig { + f.Mux.RLock() + defer f.Mux.RUnlock() + + return f.SpanCounterConfigs +} + func (f *MockConfig) GetSampleCacheConfig() SampleCacheConfig { f.Mux.RLock() defer f.Mux.RUnlock() diff --git a/config/span_counter_config.go b/config/span_counter_config.go new file mode 100644 index 0000000000..6985e9203d --- /dev/null +++ b/config/span_counter_config.go @@ -0,0 +1,217 @@ +package config + +import "strings" + +// SpanData is the interface required for matching span fields in a SpanCounterConfig. +// It is satisfied by *types.Payload. +type SpanData interface { + Get(key string) any + Exists(key string) bool +} + +// SpanCounterConfig defines a custom span count to be computed and added to +// the root span under Key. Spans are counted if they satisfy all Conditions. +type SpanCounterConfig struct { + Key string `yaml:"Key"` + Conditions []*RulesBasedSamplerCondition `yaml:"Conditions,omitempty"` +} + +// Init initializes all conditions. Must be called before MatchesSpan. +func (c *SpanCounterConfig) Init() error { + for _, cond := range c.Conditions { + if err := cond.Init(); err != nil { + return err + } + } + return nil +} + +// MatchesSpan returns true if the span satisfies all conditions. +// span is the span being tested; root is the root span's data (may be nil). +func (c *SpanCounterConfig) MatchesSpan(span SpanData, root SpanData) bool { + for _, cond := range c.Conditions { + var value any + var exists bool + for _, field := range cond.Fields { + if strings.HasPrefix(field, RootPrefix) { + if root != nil { + f := field[len(RootPrefix):] + if root.Exists(f) { + value = root.Get(f) + exists = true + break + } + } + } else { + if span.Exists(field) { + value = span.Get(field) + exists = true + break + } + } + } + + if cond.Matches != nil { + if !cond.Matches(value, exists) { + return false + } + } else { + if !ConditionMatchesValue(cond, value, exists) { + return false + } + } + } + return true +} + +// ConditionMatchesValue evaluates a condition against a value when the +// condition's Matches function has not been set (i.e. Datatype is unspecified). +// This is exported so that sample/rules.go can share the implementation. +func ConditionMatchesValue(condition *RulesBasedSamplerCondition, value interface{}, exists bool) bool { + var match bool + switch exists { + case true: + switch condition.Operator { + case Exists: + match = exists + case NEQ: + if comparison, ok := compareValues(value, condition.Value); ok { + match = comparison != equal + } + case EQ: + if comparison, ok := compareValues(value, condition.Value); ok { + match = comparison == equal + } + case GT: + if comparison, ok := compareValues(value, condition.Value); ok { + match = comparison == more + } + case GTE: + if comparison, ok := compareValues(value, condition.Value); ok { + match = comparison == more || comparison == equal + } + case LT: + if comparison, ok := compareValues(value, condition.Value); ok { + match = comparison == less + } + case LTE: + if comparison, ok := compareValues(value, condition.Value); ok { + match = comparison == less || comparison == equal + } + } + case false: + switch condition.Operator { + case NotExists: + match = !exists + } + } + return match +} + +const ( + less = -1 + equal = 0 + more = 1 +) + +// compareValues compares two values of potentially mixed numeric types. +// a is the span field value (float64, int64, bool, or string). +// b is the condition value (float64, int64, int, bool, or string). +func compareValues(a, b interface{}) (int, bool) { + if a == nil { + if b == nil { + return equal, true + } + return less, true + } + + if b == nil { + return more, true + } + + switch at := a.(type) { + case int64: + switch bt := b.(type) { + case int: + i := int(at) + switch { + case i < bt: + return less, true + case i > bt: + return more, true + default: + return equal, true + } + case int64: + switch { + case at < bt: + return less, true + case at > bt: + return more, true + default: + return equal, true + } + case float64: + f := float64(at) + switch { + case f < bt: + return less, true + case f > bt: + return more, true + default: + return equal, true + } + } + case float64: + switch bt := b.(type) { + case int: + f := float64(bt) + switch { + case at < f: + return less, true + case at > f: + return more, true + default: + return equal, true + } + case int64: + f := float64(bt) + switch { + case at < f: + return less, true + case at > f: + return more, true + default: + return equal, true + } + case float64: + switch { + case at < bt: + return less, true + case at > bt: + return more, true + default: + return equal, true + } + } + case bool: + switch bt := b.(type) { + case bool: + switch { + case !at && bt: + return less, true + case at && !bt: + return more, true + default: + return equal, true + } + } + case string: + switch bt := b.(type) { + case string: + return strings.Compare(at, bt), true + } + } + + return equal, false +} diff --git a/config/span_counter_config_test.go b/config/span_counter_config_test.go new file mode 100644 index 0000000000..366a95803b --- /dev/null +++ b/config/span_counter_config_test.go @@ -0,0 +1,302 @@ +package config + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// spanData is a simple map-backed implementation of SpanData for tests. +type spanData map[string]any + +func (s spanData) Get(key string) any { return s[key] } +func (s spanData) Exists(key string) bool { _, ok := s[key]; return ok } + +// cond builds an initialized RulesBasedSamplerCondition from a field name, +// operator, and optional value. It calls Init() so that the Matches function +// is set when Datatype is empty (the ConditionMatchesValue path). +func cond(field, operator string, value any) *RulesBasedSamplerCondition { + c := &RulesBasedSamplerCondition{ + Field: field, + Operator: operator, + Value: value, + } + if err := c.Init(); err != nil { + panic("cond Init: " + err.Error()) + } + return c +} + +// condTyped builds an initialized condition with an explicit Datatype, which +// causes Init to set a type-coercing Matches function instead of falling +// through to ConditionMatchesValue. +func condTyped(field, operator string, value any, datatype string) *RulesBasedSamplerCondition { + c := &RulesBasedSamplerCondition{ + Field: field, + Operator: operator, + Value: value, + Datatype: datatype, + } + if err := c.Init(); err != nil { + panic("condTyped Init: " + err.Error()) + } + return c +} + +// ---------------------------------------------------------------------------- +// compareValues +// ---------------------------------------------------------------------------- + +func TestCompareValues(t *testing.T) { + tests := []struct { + name string + a, b any + want int + wantOK bool + }{ + // nil handling + {"nil==nil", nil, nil, equal, true}, + {"nilnil", int64(1), nil, more, true}, + + // int64 vs int64 + {"i64 less", int64(1), int64(2), less, true}, + {"i64 equal", int64(3), int64(3), equal, true}, + {"i64 more", int64(5), int64(4), more, true}, + + // int64 vs int + {"i64 vs int less", int64(1), int(2), less, true}, + {"i64 vs int equal", int64(3), int(3), equal, true}, + {"i64 vs int more", int64(5), int(4), more, true}, + + // int64 vs float64 + {"i64 vs f64 less", int64(1), float64(1.5), less, true}, + {"i64 vs f64 equal", int64(2), float64(2.0), equal, true}, + {"i64 vs f64 more", int64(3), float64(2.9), more, true}, + + // float64 vs float64 + {"f64 less", float64(1.1), float64(1.2), less, true}, + {"f64 equal", float64(2.5), float64(2.5), equal, true}, + {"f64 more", float64(3.0), float64(2.0), more, true}, + + // float64 vs int + {"f64 vs int less", float64(0.5), int(1), less, true}, + {"f64 vs int equal", float64(2.0), int(2), equal, true}, + {"f64 vs int more", float64(2.1), int(2), more, true}, + + // float64 vs int64 + {"f64 vs i64 less", float64(0.5), int64(1), less, true}, + {"f64 vs i64 equal", float64(2.0), int64(2), equal, true}, + {"f64 vs i64 more", float64(3.0), int64(2), more, true}, + + // bool + {"bool falsefalse", true, false, more, true}, + {"bool equal", true, true, equal, true}, + + // string + {"str less", "apple", "banana", less, true}, + {"str equal", "foo", "foo", equal, true}, + {"str more", "zoo", "ant", more, true}, + + // type mismatch → ok=false + {"mismatch int64 str", int64(1), "1", equal, false}, + {"mismatch f64 str", float64(1.0), "1.0", equal, false}, + {"mismatch bool str", true, "true", equal, false}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, ok := compareValues(tc.a, tc.b) + assert.Equal(t, tc.wantOK, ok, "ok") + if tc.wantOK { + assert.Equal(t, tc.want, got, "comparison result") + } + }) + } +} + +// ---------------------------------------------------------------------------- +// ConditionMatchesValue +// ---------------------------------------------------------------------------- + +func TestConditionMatchesValue(t *testing.T) { + tests := []struct { + name string + operator string + condVal any + spanVal any + exists bool + want bool + }{ + // Exists / NotExists + {"exists true", Exists, nil, "anything", true, true}, + {"exists false", Exists, nil, nil, false, false}, + {"not-exists true", NotExists, nil, nil, false, true}, + {"not-exists false", NotExists, nil, "x", true, false}, + + // EQ + {"eq string match", EQ, "foo", "foo", true, true}, + {"eq string no-match", EQ, "foo", "bar", true, false}, + {"eq int64 match", EQ, int64(42), int64(42), true, true}, + {"eq int64 no-match", EQ, int64(42), int64(0), true, false}, + {"eq type mismatch", EQ, "1", int64(1), true, false}, // compareValues returns ok=false → no match + + // NEQ + {"neq match", NEQ, "foo", "bar", true, true}, + {"neq no-match", NEQ, "foo", "foo", true, false}, + + // GT / GTE / LT / LTE + {"gt true", GT, int64(1), int64(2), true, true}, + {"gt false eq", GT, int64(1), int64(1), true, false}, + {"gte equal", GTE, int64(1), int64(1), true, true}, + {"gte more", GTE, int64(1), int64(2), true, true}, + {"gte less", GTE, int64(2), int64(1), true, false}, + {"lt true", LT, int64(2), int64(1), true, true}, + {"lt false", LT, int64(1), int64(2), true, false}, + {"lte equal", LTE, int64(2), int64(2), true, true}, + {"lte less", LTE, int64(3), int64(2), true, true}, + {"lte more", LTE, int64(1), int64(2), true, false}, + + // field does not exist with non-NotExists operator → no match + {"eq field missing", EQ, "foo", nil, false, false}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + c := &RulesBasedSamplerCondition{ + Operator: tc.operator, + Value: tc.condVal, + } + got := ConditionMatchesValue(c, tc.spanVal, tc.exists) + assert.Equal(t, tc.want, got) + }) + } +} + +// ---------------------------------------------------------------------------- +// SpanCounterConfig.MatchesSpan +// ---------------------------------------------------------------------------- + +func TestMatchesSpan_NoConditions(t *testing.T) { + // A counter with no conditions matches every span. + counter := SpanCounterConfig{Key: "all"} + assert.True(t, counter.MatchesSpan(spanData{"foo": "bar"}, nil)) + assert.True(t, counter.MatchesSpan(spanData{}, nil)) +} + +func TestMatchesSpan_SingleCondition(t *testing.T) { + counter := SpanCounterConfig{ + Key: "errors", + Conditions: []*RulesBasedSamplerCondition{cond("error", EQ, true)}, + } + + assert.True(t, counter.MatchesSpan(spanData{"error": true}, nil)) + assert.False(t, counter.MatchesSpan(spanData{"error": false}, nil)) + assert.False(t, counter.MatchesSpan(spanData{}, nil)) +} + +func TestMatchesSpan_MultipleConditionsAllMustMatch(t *testing.T) { + counter := SpanCounterConfig{ + Key: "slow-errors", + Conditions: []*RulesBasedSamplerCondition{ + cond("error", EQ, true), + cond("duration_ms", GT, int64(500)), + }, + } + + assert.True(t, counter.MatchesSpan(spanData{"error": true, "duration_ms": int64(1000)}, nil)) + assert.False(t, counter.MatchesSpan(spanData{"error": true, "duration_ms": int64(100)}, nil)) + assert.False(t, counter.MatchesSpan(spanData{"error": false, "duration_ms": int64(1000)}, nil)) + assert.False(t, counter.MatchesSpan(spanData{}, nil)) +} + +func TestMatchesSpan_RootPrefixedField(t *testing.T) { + // "root.service.name" reads from the root span data, not the span itself. + counter := SpanCounterConfig{ + Key: "svc-db", + Conditions: []*RulesBasedSamplerCondition{cond("root.service.name", EQ, "database")}, + } + + root := spanData{"service.name": "database"} + span := spanData{"duration_ms": int64(5)} + + assert.True(t, counter.MatchesSpan(span, root)) + assert.False(t, counter.MatchesSpan(span, spanData{"service.name": "api"})) +} + +func TestMatchesSpan_RootPrefixedField_NilRoot(t *testing.T) { + // When root is nil a root-prefixed field is never found → field is absent. + counter := SpanCounterConfig{ + Key: "svc", + Conditions: []*RulesBasedSamplerCondition{cond("root.service.name", EQ, "database")}, + } + assert.False(t, counter.MatchesSpan(spanData{}, nil)) +} + +func TestMatchesSpan_MultiFieldFallback(t *testing.T) { + // When multiple fields are listed, the first one found is used. + c := &RulesBasedSamplerCondition{ + Fields: []string{"trace.trace_id", "traceId"}, + Operator: Exists, + } + if err := c.Init(); err != nil { + t.Fatal(err) + } + counter := SpanCounterConfig{Key: "has-trace", Conditions: []*RulesBasedSamplerCondition{c}} + + assert.True(t, counter.MatchesSpan(spanData{"trace.trace_id": "abc"}, nil)) + assert.True(t, counter.MatchesSpan(spanData{"traceId": "abc"}, nil)) + assert.False(t, counter.MatchesSpan(spanData{}, nil)) +} + +func TestMatchesSpan_MultiFieldFallback_FirstWins(t *testing.T) { + // If the first field exists but evaluates to a non-match, the second field + // is not consulted — only the first found field is used. + c := &RulesBasedSamplerCondition{ + Fields: []string{"a", "b"}, + Operator: EQ, + Value: "yes", + } + if err := c.Init(); err != nil { + t.Fatal(err) + } + counter := SpanCounterConfig{Key: "k", Conditions: []*RulesBasedSamplerCondition{c}} + + // "a" is found with wrong value; "b" has the right value but is not checked. + assert.False(t, counter.MatchesSpan(spanData{"a": "no", "b": "yes"}, nil)) + // Only "b" exists → fallback to "b" → match. + assert.True(t, counter.MatchesSpan(spanData{"b": "yes"}, nil)) +} + +func TestMatchesSpan_TypedCondition(t *testing.T) { + // When Datatype is set, Init wires up a type-coercing Matches function. + // Verify that MatchesSpan delegates to it correctly. + counter := SpanCounterConfig{ + Key: "count-int", + Conditions: []*RulesBasedSamplerCondition{condTyped("code", EQ, 200, "int")}, + } + + // span value arrives as string "200"; the typed matcher coerces it. + assert.True(t, counter.MatchesSpan(spanData{"code": "200"}, nil)) + assert.False(t, counter.MatchesSpan(spanData{"code": "404"}, nil)) +} + +func TestMatchesSpan_ExistsAndNotExists(t *testing.T) { + exists := SpanCounterConfig{ + Key: "has-field", + Conditions: []*RulesBasedSamplerCondition{cond("db.query", Exists, nil)}, + } + notExists := SpanCounterConfig{ + Key: "no-field", + Conditions: []*RulesBasedSamplerCondition{cond("db.query", NotExists, nil)}, + } + + withField := spanData{"db.query": "SELECT 1"} + without := spanData{} + + assert.True(t, exists.MatchesSpan(withField, nil)) + assert.False(t, exists.MatchesSpan(without, nil)) + assert.False(t, notExists.MatchesSpan(withField, nil)) + assert.True(t, notExists.MatchesSpan(without, nil)) +} diff --git a/sample/rules.go b/sample/rules.go index 3f907f3ad1..cf25d547dd 100644 --- a/sample/rules.go +++ b/sample/rules.go @@ -308,158 +308,8 @@ func extractValueFromSpan( return nil, false, false } -// This only gets called when we're using one of the basic operators, and -// there is no datatype specified (meaning that the Matches function has not -// been set). In this case, we need to do some type conversion and comparison -// to determine whether the condition matches the value. +// conditionMatchesValue delegates to config.ConditionMatchesValue. +// It is called when condition.Matches is nil (Datatype was not specified). func conditionMatchesValue(condition *config.RulesBasedSamplerCondition, value interface{}, exists bool) bool { - var match bool - switch exists { - case true: - switch condition.Operator { - case config.Exists: - match = exists - case config.NEQ: - if comparison, ok := compare(value, condition.Value); ok { - match = comparison != equal - } - case config.EQ: - if comparison, ok := compare(value, condition.Value); ok { - match = comparison == equal - } - case config.GT: - if comparison, ok := compare(value, condition.Value); ok { - match = comparison == more - } - case config.GTE: - if comparison, ok := compare(value, condition.Value); ok { - match = comparison == more || comparison == equal - } - case config.LT: - if comparison, ok := compare(value, condition.Value); ok { - match = comparison == less - } - case config.LTE: - if comparison, ok := compare(value, condition.Value); ok { - match = comparison == less || comparison == equal - } - } - case false: - switch condition.Operator { - case config.NotExists: - match = !exists - } - } - return match -} - -const ( - less = -1 - equal = 0 - more = 1 -) - -func compare(a, b interface{}) (int, bool) { - // a is the tracing data field value. This can be: float64, int64, bool, or string - // b is the Rule condition value. This can be: float64, int64, int, bool, or string - // Note: in YAML config parsing, the Value may be returned as int - // When comparing numeric values, we need to check across the 3 types: float64, int64, and int - - if a == nil { - if b == nil { - return equal, true - } - - return less, true - } - - if b == nil { - return more, true - } - - switch at := a.(type) { - case int64: - switch bt := b.(type) { - case int: - i := int(at) - switch { - case i < bt: - return less, true - case i > bt: - return more, true - default: - return equal, true - } - case int64: - switch { - case at < bt: - return less, true - case at > bt: - return more, true - default: - return equal, true - } - case float64: - f := float64(at) - switch { - case f < bt: - return less, true - case f > bt: - return more, true - default: - return equal, true - } - } - case float64: - switch bt := b.(type) { - case int: - f := float64(bt) - switch { - case at < f: - return less, true - case at > f: - return more, true - default: - return equal, true - } - case int64: - f := float64(bt) - switch { - case at < f: - return less, true - case at > f: - return more, true - default: - return equal, true - } - case float64: - switch { - case at < bt: - return less, true - case at > bt: - return more, true - default: - return equal, true - } - } - case bool: - switch bt := b.(type) { - case bool: - switch { - case !at && bt: - return less, true - case at && !bt: - return more, true - default: - return equal, true - } - } - case string: - switch bt := b.(type) { - case string: - return strings.Compare(at, bt), true - } - } - - return equal, false + return config.ConditionMatchesValue(condition, value, exists) } From d46a60fc2da4d968994c01fc8eaac8ac16c65228 Mon Sep 17 00:00:00 2001 From: Zaq? Question Date: Mon, 9 Mar 2026 13:59:12 -0700 Subject: [PATCH 2/6] update docker builder to support khan gcp --- build-docker.sh | 117 ++++++++++++++---------------------------------- 1 file changed, 34 insertions(+), 83 deletions(-) diff --git a/build-docker.sh b/build-docker.sh index e47c388573..8a5cfb2631 100755 --- a/build-docker.sh +++ b/build-docker.sh @@ -3,101 +3,52 @@ set -o nounset set -o pipefail set -o xtrace -### Versioning and image tagging ### -# -# Three build scenarios: -# 1. CI release build: triggered by git tag -# - Stable (vX.Y.Z): tagged with major, minor, patch, and "latest" -# - Pre-release (vX.Y.Z-suffix): tagged only with exact version -# 2. CI branch build: version + CI job ID, tagged with branch name (+ "latest" if main) -# 3. Local build: version from git describe, tagged with that version - -# Get version info from git (used by branch and local builds) -# --tags: use any tag, not just annotated ones -# --match='v[0-9]*': only version tags (starts with v and a digit) -# --always: fall back to commit ID if no tag found -# e.g., v2.1.1-45-ga1b2c3d means commit a1b2c3d, 45 commits ahead of tag v2.1.1 -VERSION_FROM_GIT=$(git describe --tags --match='v[0-9]*' --always) - -if [[ -n "${CIRCLE_TAG:-}" ]]; then - # Release build (triggered by git tag) - VERSION=${CIRCLE_TAG#"v"} - - if [[ "${CIRCLE_TAG}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then - # Stable release: tag with major, minor, patch, and latest - # e.g., v2.1.1 -> "2", "2.1", "2.1.1", "latest" - MAJOR_VERSION=${VERSION%%.*} - MINOR_VERSION=${VERSION%.*} - TAGS="$MAJOR_VERSION,$MINOR_VERSION,$VERSION,latest" - else - # Pre-release: only the exact version tag - # e.g., v3.0.0-rc1 -> "3.0.0-rc1" - TAGS="$VERSION" - fi - -elif [[ -n "${CIRCLE_BRANCH:-}" ]]; then - # CI branch build - # Version from git describe + CI job ID - # e.g., 2.1.1-45-ga1b2c3d-ci8675309 - VERSION="${VERSION_FROM_GIT#'v'}-ci${CIRCLE_BUILD_NUM}" - BRANCH_TAG=${CIRCLE_BRANCH//\//-} - TAGS="${VERSION},branch-${BRANCH_TAG}" - - # Main branch builds are tagged "latest" in the private registry - if [[ "${CIRCLE_BRANCH}" == "main" ]]; then - TAGS+=",latest" - fi - -else - # Local build - # Version from git describe only - # e.g., 2.1.1-45-ga1b2c3d - VERSION=${VERSION_FROM_GIT#'v'} - TAGS="${VERSION}" -fi - -GIT_COMMIT=${CIRCLE_SHA1:-$(git rev-parse HEAD)} +GCLOUD_REGISTRY="us-central1-docker.pkg.dev/khan-internal-services/refinery" + +# Parse flags +PUSH=false + +while [[ $# -gt 0 ]]; do + case "$1" in + --push) + PUSH=true + shift + ;; + *) + echo "Usage: $0 [--push]" + echo " --push Build and push to ${GCLOUD_REGISTRY}/refinery" + echo " (default) Build locally only" + exit 1 + ;; + esac +done + +VERSION=$(git describe --tags --match='v[0-9]*' --always) +VERSION=${VERSION#v} +GIT_COMMIT=$(git rev-parse HEAD) unset GOOS unset GOARCH export GOFLAGS="-ldflags=-X=main.BuildID=$VERSION" export SOURCE_DATE_EPOCH=${SOURCE_DATE_EPOCH:-$(make latest_modification_time)} -# Build the image once, either to a remote registry designated by PRIMARY_DOCKER_REPO -# or to the local repository as "ko.local/refinery:" if PRIMARY_DOCKER_REPO is not set. -export KO_DOCKER_REPO="${PRIMARY_DOCKER_REPO:-ko.local}" +# Force IPv4 to avoid IPv6 connectivity issues when pulling base image layers +export GODEBUG=preferIPv4=1 + +if [[ "$PUSH" == "true" ]]; then + export KO_DOCKER_REPO="$GCLOUD_REGISTRY" +else + export KO_DOCKER_REPO="ko.local" +fi -echo "Building image locally with ko for multi-registry push..." # shellcheck disable=SC2086 -IMAGE_REF=$(./ko publish \ - --tags "${TAGS}" \ +IMAGE_REF=$(ko publish \ + --tags "${VERSION}" \ --base-import-paths \ --platform "linux/amd64,linux/arm64" \ - --image-label org.opencontainers.image.source=https://github.com/honeycombio/refinery \ + --image-label org.opencontainers.image.source=https://github.com/khan/refinery \ --image-label org.opencontainers.image.licenses=Apache-2.0 \ --image-label org.opencontainers.image.revision=${GIT_COMMIT} \ ./cmd/refinery) echo "Built image: ${IMAGE_REF}" - -# If COPY_DOCKER_REPOS is set, copy the built image to each of the listed registries. -# This is a comma-separated list of registry/repo names, e.g. -# "public.ecr.aws/honeycombio,ghcr.io/honeycombio/refinery" -if [[ -n "${COPY_DOCKER_REPOS:-}" ]]; then - echo "Pushing to multiple registries: ${COPY_DOCKER_REPOS}" - - IFS=',' read -ra REPOS <<< "$COPY_DOCKER_REPOS" - for REPO in "${REPOS[@]}"; do - REPO=$(echo "$REPO" | xargs) # trim whitespace - echo "Tagging and pushing to: $REPO" - - # Tag for each tag in the TAGS list - IFS=',' read -ra TAG_LIST <<< "$TAGS" - for TAG in "${TAG_LIST[@]}"; do - TAG=$(echo "$TAG" | xargs) # trim whitespace - TARGET_IMAGE="$REPO/refinery:$TAG" - echo "Copying $IMAGE_REF to $TARGET_IMAGE" - ./crane copy "$IMAGE_REF" "$TARGET_IMAGE" - done - done -fi From 42f0b8b84f99e7c077ad87882a1856bbd6a84da0 Mon Sep 17 00:00:00 2001 From: Zaq? Question Date: Mon, 9 Mar 2026 14:10:48 -0700 Subject: [PATCH 3/6] detect suitable root span when root is absent Extracts findSuitableRootSpan: prefers the trace's RootSpan, falls back to the first non-annotation span (not a span event or link). Updates computeCustomCounts to use it so custom counts land on the best available span even when no root has arrived before the trace times out. --- collect/collect.go | 20 ++++++++++++++++-- collect/collect_test.go | 47 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index da7f3a282d..9e28a946fb 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -708,9 +708,25 @@ func (i *InMemCollector) initSpanCounterConfigs() { i.mutex.Unlock() } +// findSuitableRootSpan returns the root span of the trace if one is present. +// If no root span has been identified, it falls back to the first non-annotation +// span (i.e. not a span event or link). Returns nil if no suitable span exists. +func findSuitableRootSpan(t sendableTrace) *types.Span { + if t.RootSpan != nil { + return t.RootSpan + } + for _, sp := range t.GetSpans() { + if sp.AnnotationType() != types.SpanAnnotationTypeSpanEvent && + sp.AnnotationType() != types.SpanAnnotationTypeLink { + return sp + } + } + return nil +} + // computeCustomCounts computes each counter's value by iterating all spans in the trace // and attaches the results to the root span. -// Returns nil, nil if there are no counters configured or no root span. +// Returns nil, nil if there are no counters configured or no suitable target span. // // Stress relief note: this runs inside sendTraces(), the sole consumer of the // tracesToSend channel. Work is O(N×M) — N spans × M counters — so large @@ -730,7 +746,7 @@ func (i *InMemCollector) computeCustomCounts(t sendableTrace) (*types.Span, map[ return nil, nil } - targetSpan := t.RootSpan + targetSpan := findSuitableRootSpan(t) if targetSpan == nil { return nil, nil } diff --git a/collect/collect_test.go b/collect/collect_test.go index fcc950a189..ff2297faaf 100644 --- a/collect/collect_test.go +++ b/collect/collect_test.go @@ -2115,6 +2115,53 @@ func TestCustomSpanCounts_MultipleCounters(t *testing.T) { assert.Equal(t, int64(2), rootEvent.Data.Get("error_spans"), "2 spans have error=true") } +// TestCustomSpanCounts_NoRootSpan verifies that when a trace times out without +// a root span, custom counts land on the first non-annotation span instead. +func TestCustomSpanCounts_NoRootSpan(t *testing.T) { + conf := customCountConf([]config.SpanCounterConfig{{Key: "all_spans"}}) + conf.GetTracesConfigVal.TraceTimeout = config.Duration(5 * time.Millisecond) + + coll := newTestCollector(t, conf) + transmission := coll.Transmission.(*transmit.MockTransmission) + + traceID := "no-root" + // annotation span: should not be the target + coll.AddSpanFromPeer(&types.Span{ + TraceID: traceID, + Event: &types.Event{ + Dataset: "test", + Data: func() types.Payload { + p := types.NewPayload(coll.Config, map[string]interface{}{"trace.parent_id": "x"}) + p.MetaAnnotationType = "span_event" + return p + }(), + APIKey: legacyAPIKey, + }, + }) + // regular span: should be the target + coll.AddSpanFromPeer(&types.Span{ + TraceID: traceID, + Event: &types.Event{ + Dataset: "test", + Data: types.NewPayload(coll.Config, map[string]interface{}{"trace.parent_id": "x"}), + APIKey: legacyAPIKey, + }, + }) + + events := transmission.GetBlock(2) + require.Equal(t, 2, len(events)) + + // Exactly one span should carry the custom count (the first real span). + var counted []*types.Event + for _, ev := range events { + if ev.Data.Get("all_spans") != nil { + counted = append(counted, ev) + } + } + require.Equal(t, 1, len(counted), "custom count should appear on exactly one span when there is no root") + assert.Equal(t, int64(2), counted[0].Data.Get("all_spans"), "both spans should be counted") +} + // BenchmarkCollectorWithSamplers runs benchmarks for different sampler configurations. // This is a tricky benchmark to interpret because just setting up the input data // can easily be more expensive than the collector's routing code. The goal is to From 7fe57f6ad4710f6d9e4426441017ead12d4ecfef Mon Sep 17 00:00:00 2001 From: Zaq? Question Date: Fri, 13 Mar 2026 13:06:23 -0700 Subject: [PATCH 4/6] fix: move span counters to rules, update config meta --- collect/collect.go | 22 ++++++++++---------- collect/collect_test.go | 12 +++++------ config.md | 2 +- config/config.go | 2 +- config/file_config.go | 5 ++--- config/metadata/rulesMeta.yaml | 29 ++++++++++++++++++++++++++ config/mock.go | 6 +++--- config/sampler_config.go | 5 +++-- config/span_counter_config.go | 10 ++++----- config/span_counter_config_test.go | 22 ++++++++++---------- config/validate.go | 25 ++++++++++++++++++++++ config_complete.yaml | 2 +- metrics.md | 2 +- refinery_rules.md | 24 +++++++++++++++++++++ rules.md | 30 ++++++++++++++++++++++++++- tools/convert/configDataNames.txt | 2 +- tools/convert/minimal_config.yaml | 2 +- tools/convert/templates/configV2.tmpl | 2 +- 18 files changed, 155 insertions(+), 49 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index 9e28a946fb..04772c51ac 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -114,7 +114,7 @@ type InMemCollector struct { hostname string memMetricSample []rtmetrics.Sample // Memory monitoring using runtime/metrics - spanCounterConfigs []config.SpanCounterConfig + spanCounters []config.SpanCounter } // These are the names of the metrics we use to track the number of events sent to peers through the router. @@ -172,7 +172,7 @@ func (i *InMemCollector) Start() error { i.Logger.Info().WithField("num_workers", numWorkers).Logf("Starting InMemCollector with %d workers", numWorkers) i.StressRelief.UpdateFromConfig() - i.initSpanCounterConfigs() + i.initSpanCounters() // Set queue capacity metrics for stress relief calculations i.Metrics.Store(DENOMINATOR_INCOMING_CAP, float64(imcConfig.IncomingQueueSize)) i.Metrics.Store(DENOMINATOR_PEER_CAP, float64(imcConfig.PeerQueueSize)) @@ -242,7 +242,7 @@ func (i *InMemCollector) reloadConfigs() { i.SamplerFactory.ClearDynsamplers() i.StressRelief.UpdateFromConfig() - i.initSpanCounterConfigs() + i.initSpanCounters() // Send reload signals to all workers to clear their local samplers // so that the new configuration will be propagated @@ -694,17 +694,17 @@ func (i *InMemCollector) addAdditionalAttributes(sp *types.Span) { } } -// initSpanCounterConfigs loads and initializes span counter configs from the current config. +// initSpanCounters loads and initializes span counters from the current config. // Must be called at startup and on config reload. -func (i *InMemCollector) initSpanCounterConfigs() { - cfgs := i.Config.GetSpanCounterConfig() - for j := range cfgs { - if err := cfgs[j].Init(); err != nil { - i.Logger.Error().WithField("error", err).Logf("failed to initialize span counter config entry %q", cfgs[j].Key) +func (i *InMemCollector) initSpanCounters() { + counters := i.Config.GetSpanCounters() + for j := range counters { + if err := counters[j].Init(); err != nil { + i.Logger.Error().WithField("error", err).Logf("failed to initialize span counter %q", counters[j].Key) } } i.mutex.Lock() - i.spanCounterConfigs = cfgs + i.spanCounters = counters i.mutex.Unlock() } @@ -739,7 +739,7 @@ func findSuitableRootSpan(t sendableTrace) *types.Span { // computed or attached to stress-sampled traces. func (i *InMemCollector) computeCustomCounts(t sendableTrace) (*types.Span, map[string]int64) { i.mutex.RLock() - counters := i.spanCounterConfigs + counters := i.spanCounters i.mutex.RUnlock() if len(counters) == 0 { diff --git a/collect/collect_test.go b/collect/collect_test.go index ff2297faaf..e0a7b97f34 100644 --- a/collect/collect_test.go +++ b/collect/collect_test.go @@ -1902,7 +1902,7 @@ func TestWorkerHealthReporting(t *testing.T) { } // customCountConf returns a base MockConfig suitable for custom span count tests. -func customCountConf(counters []config.SpanCounterConfig) *config.MockConfig { +func customCountConf(counters []config.SpanCounter) *config.MockConfig { return &config.MockConfig{ GetTracesConfigVal: config.TracesConfig{ SendTicker: config.Duration(2 * time.Millisecond), @@ -1924,7 +1924,7 @@ func customCountConf(counters []config.SpanCounterConfig) *config.MockConfig { IncomingQueueSize: 10, PeerQueueSize: 10, }, - SpanCounterConfigs: counters, + SpanCounters: counters, } } @@ -1958,7 +1958,7 @@ func TestCustomSpanCounts_NoCounters(t *testing.T) { // TestCustomSpanCounts_CountsLandOnRoot verifies that a counter with no // conditions counts all spans and attaches the result to the root span only. func TestCustomSpanCounts_CountsLandOnRoot(t *testing.T) { - counters := []config.SpanCounterConfig{ + counters := []config.SpanCounter{ {Key: "all_spans"}, } coll := newTestCollector(t, customCountConf(counters)) @@ -2005,7 +2005,7 @@ func TestCustomSpanCounts_CountsLandOnRoot(t *testing.T) { // TestCustomSpanCounts_ConditionalCounting verifies that only spans matching // a condition are counted. func TestCustomSpanCounts_ConditionalCounting(t *testing.T) { - counters := []config.SpanCounterConfig{ + counters := []config.SpanCounter{ { Key: "error_spans", Conditions: []*config.RulesBasedSamplerCondition{ @@ -2061,7 +2061,7 @@ func TestCustomSpanCounts_ConditionalCounting(t *testing.T) { // TestCustomSpanCounts_MultipleCounters verifies that multiple counters with // different conditions produce independent counts on the root span. func TestCustomSpanCounts_MultipleCounters(t *testing.T) { - counters := []config.SpanCounterConfig{ + counters := []config.SpanCounter{ { Key: "db_spans", Conditions: []*config.RulesBasedSamplerCondition{ @@ -2118,7 +2118,7 @@ func TestCustomSpanCounts_MultipleCounters(t *testing.T) { // TestCustomSpanCounts_NoRootSpan verifies that when a trace times out without // a root span, custom counts land on the first non-annotation span instead. func TestCustomSpanCounts_NoRootSpan(t *testing.T) { - conf := customCountConf([]config.SpanCounterConfig{{Key: "all_spans"}}) + conf := customCountConf([]config.SpanCounter{{Key: "all_spans"}}) conf.GetTracesConfigVal.TraceTimeout = config.Duration(5 * time.Millisecond) coll := newTestCollector(t, conf) diff --git a/config.md b/config.md index fb1b4ca340..20133d728c 100644 --- a/config.md +++ b/config.md @@ -3,7 +3,7 @@ # Honeycomb Refinery Configuration Documentation This is the documentation for the configuration file for Honeycomb's Refinery. -It was automatically generated on 2026-02-25 at 20:49:27 UTC. +It was automatically generated on 2026-03-13 at 20:03:54 UTC. ## The Config file diff --git a/config/config.go b/config/config.go index cbbd211265..a709cf1b41 100644 --- a/config/config.go +++ b/config/config.go @@ -151,7 +151,7 @@ type Config interface { GetAddCountsToRoot() bool - GetSpanCounterConfig() []SpanCounterConfig + GetSpanCounters() []SpanCounter GetConfigMetadata() []ConfigMetadata diff --git a/config/file_config.go b/config/file_config.go index dbcf87a7b7..9eae575ea5 100644 --- a/config/file_config.go +++ b/config/file_config.go @@ -193,7 +193,6 @@ type RefineryTelemetryConfig struct { AddSpanCountToRoot *DefaultTrue `yaml:"AddSpanCountToRoot" default:"true"` // Avoid pointer woe on access, use GetAddSpanCountToRoot() instead. AddCountsToRoot bool `yaml:"AddCountsToRoot"` AddHostMetadataToTrace *DefaultTrue `yaml:"AddHostMetadataToTrace" default:"true"` // Avoid pointer woe on access, use GetAddHostMetadataToTrace() instead. - CustomSpanCounts []SpanCounterConfig `yaml:"CustomSpanCounts,omitempty"` } type TracesConfig struct { @@ -1117,11 +1116,11 @@ func (f *fileConfig) GetAddCountsToRoot() bool { return f.mainConfig.Telemetry.AddCountsToRoot } -func (f *fileConfig) GetSpanCounterConfig() []SpanCounterConfig { +func (f *fileConfig) GetSpanCounters() []SpanCounter { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.Telemetry.CustomSpanCounts + return f.rulesConfig.SpanCounters } func (f *fileConfig) GetSampleCacheConfig() SampleCacheConfig { diff --git a/config/metadata/rulesMeta.yaml b/config/metadata/rulesMeta.yaml index b80e4ee983..a4a9a5c823 100644 --- a/config/metadata/rulesMeta.yaml +++ b/config/metadata/rulesMeta.yaml @@ -738,3 +738,32 @@ groups: The best practice is to always specify `Datatype`; this avoids ambiguity, allows for more accurate comparisons, and offers a minor performance improvement. + + - name: SpanCounters + title: "Custom Span Count Configuration" + sortorder: 80 + description: > + Defines a single custom span counter. Each counter has a Key that names + the field written to the root span, and an optional list of Conditions + that must all match for a span to be counted. Spans are counted when + all of the entry's Conditions match. If Conditions is empty, every span + in the trace is counted. The counter value is written to the root span + under the key specified by `Key`. If no root span exists when the trace + is sent, the counter is written to the first non-annotation span instead. + fields: + - name: Key + type: string + validations: + - type: notempty + summary: is the field name written to the root span with the counter value. + description: > + The name of the field that will be added to the root span. Must not + be empty. + + - name: Conditions + type: objectarray + summary: is the list of conditions a span must satisfy to be counted. + description: > + All conditions must match for a span to be counted. If empty, every + span in the trace is counted. Uses the same condition format as + rules-based sampler conditions. diff --git a/config/mock.go b/config/mock.go index 57eef3af73..58660a281f 100644 --- a/config/mock.go +++ b/config/mock.go @@ -52,7 +52,7 @@ type MockConfig struct { AdditionalErrorFields []string AddSpanCountToRoot bool AddCountsToRoot bool - SpanCounterConfigs []SpanCounterConfig + SpanCounters []SpanCounter CacheOverrunStrategy string SampleCache SampleCacheConfig StressRelief StressReliefConfig @@ -416,11 +416,11 @@ func (f *MockConfig) GetAddCountsToRoot() bool { return f.AddSpanCountToRoot } -func (f *MockConfig) GetSpanCounterConfig() []SpanCounterConfig { +func (f *MockConfig) GetSpanCounters() []SpanCounter { f.Mux.RLock() defer f.Mux.RUnlock() - return f.SpanCounterConfigs + return f.SpanCounters } func (f *MockConfig) GetSampleCacheConfig() SampleCacheConfig { diff --git a/config/sampler_config.go b/config/sampler_config.go index 2560d322e8..fa8f39fbc9 100644 --- a/config/sampler_config.go +++ b/config/sampler_config.go @@ -172,8 +172,9 @@ func (v *RulesBasedDownstreamSampler) NameMeaningfulRate() string { } type V2SamplerConfig struct { - RulesVersion int `json:"rulesversion" yaml:"RulesVersion" validate:"required,ge=2"` - Samplers map[string]*V2SamplerChoice `json:"samplers" yaml:"Samplers,omitempty" validate:"required"` + RulesVersion int `json:"rulesversion" yaml:"RulesVersion" validate:"required,ge=2"` + Samplers map[string]*V2SamplerChoice `json:"samplers" yaml:"Samplers,omitempty" validate:"required"` + SpanCounters []SpanCounter `json:"spancounters" yaml:"SpanCounters,omitempty"` } type GetSamplingFielder interface { diff --git a/config/span_counter_config.go b/config/span_counter_config.go index 6985e9203d..1eb6c1130d 100644 --- a/config/span_counter_config.go +++ b/config/span_counter_config.go @@ -2,22 +2,22 @@ package config import "strings" -// SpanData is the interface required for matching span fields in a SpanCounterConfig. +// SpanData is the interface required for matching span fields in a SpanCounter. // It is satisfied by *types.Payload. type SpanData interface { Get(key string) any Exists(key string) bool } -// SpanCounterConfig defines a custom span count to be computed and added to +// SpanCounter defines a custom span count to be computed and added to // the root span under Key. Spans are counted if they satisfy all Conditions. -type SpanCounterConfig struct { +type SpanCounter struct { Key string `yaml:"Key"` Conditions []*RulesBasedSamplerCondition `yaml:"Conditions,omitempty"` } // Init initializes all conditions. Must be called before MatchesSpan. -func (c *SpanCounterConfig) Init() error { +func (c *SpanCounter) Init() error { for _, cond := range c.Conditions { if err := cond.Init(); err != nil { return err @@ -28,7 +28,7 @@ func (c *SpanCounterConfig) Init() error { // MatchesSpan returns true if the span satisfies all conditions. // span is the span being tested; root is the root span's data (may be nil). -func (c *SpanCounterConfig) MatchesSpan(span SpanData, root SpanData) bool { +func (c *SpanCounter) MatchesSpan(span SpanData, root SpanData) bool { for _, cond := range c.Conditions { var value any var exists bool diff --git a/config/span_counter_config_test.go b/config/span_counter_config_test.go index 366a95803b..287e438aa2 100644 --- a/config/span_counter_config_test.go +++ b/config/span_counter_config_test.go @@ -175,18 +175,18 @@ func TestConditionMatchesValue(t *testing.T) { } // ---------------------------------------------------------------------------- -// SpanCounterConfig.MatchesSpan +// SpanCounter.MatchesSpan // ---------------------------------------------------------------------------- func TestMatchesSpan_NoConditions(t *testing.T) { // A counter with no conditions matches every span. - counter := SpanCounterConfig{Key: "all"} + counter := SpanCounter{Key: "all"} assert.True(t, counter.MatchesSpan(spanData{"foo": "bar"}, nil)) assert.True(t, counter.MatchesSpan(spanData{}, nil)) } func TestMatchesSpan_SingleCondition(t *testing.T) { - counter := SpanCounterConfig{ + counter := SpanCounter{ Key: "errors", Conditions: []*RulesBasedSamplerCondition{cond("error", EQ, true)}, } @@ -197,7 +197,7 @@ func TestMatchesSpan_SingleCondition(t *testing.T) { } func TestMatchesSpan_MultipleConditionsAllMustMatch(t *testing.T) { - counter := SpanCounterConfig{ + counter := SpanCounter{ Key: "slow-errors", Conditions: []*RulesBasedSamplerCondition{ cond("error", EQ, true), @@ -213,7 +213,7 @@ func TestMatchesSpan_MultipleConditionsAllMustMatch(t *testing.T) { func TestMatchesSpan_RootPrefixedField(t *testing.T) { // "root.service.name" reads from the root span data, not the span itself. - counter := SpanCounterConfig{ + counter := SpanCounter{ Key: "svc-db", Conditions: []*RulesBasedSamplerCondition{cond("root.service.name", EQ, "database")}, } @@ -227,7 +227,7 @@ func TestMatchesSpan_RootPrefixedField(t *testing.T) { func TestMatchesSpan_RootPrefixedField_NilRoot(t *testing.T) { // When root is nil a root-prefixed field is never found → field is absent. - counter := SpanCounterConfig{ + counter := SpanCounter{ Key: "svc", Conditions: []*RulesBasedSamplerCondition{cond("root.service.name", EQ, "database")}, } @@ -243,7 +243,7 @@ func TestMatchesSpan_MultiFieldFallback(t *testing.T) { if err := c.Init(); err != nil { t.Fatal(err) } - counter := SpanCounterConfig{Key: "has-trace", Conditions: []*RulesBasedSamplerCondition{c}} + counter := SpanCounter{Key: "has-trace", Conditions: []*RulesBasedSamplerCondition{c}} assert.True(t, counter.MatchesSpan(spanData{"trace.trace_id": "abc"}, nil)) assert.True(t, counter.MatchesSpan(spanData{"traceId": "abc"}, nil)) @@ -261,7 +261,7 @@ func TestMatchesSpan_MultiFieldFallback_FirstWins(t *testing.T) { if err := c.Init(); err != nil { t.Fatal(err) } - counter := SpanCounterConfig{Key: "k", Conditions: []*RulesBasedSamplerCondition{c}} + counter := SpanCounter{Key: "k", Conditions: []*RulesBasedSamplerCondition{c}} // "a" is found with wrong value; "b" has the right value but is not checked. assert.False(t, counter.MatchesSpan(spanData{"a": "no", "b": "yes"}, nil)) @@ -272,7 +272,7 @@ func TestMatchesSpan_MultiFieldFallback_FirstWins(t *testing.T) { func TestMatchesSpan_TypedCondition(t *testing.T) { // When Datatype is set, Init wires up a type-coercing Matches function. // Verify that MatchesSpan delegates to it correctly. - counter := SpanCounterConfig{ + counter := SpanCounter{ Key: "count-int", Conditions: []*RulesBasedSamplerCondition{condTyped("code", EQ, 200, "int")}, } @@ -283,11 +283,11 @@ func TestMatchesSpan_TypedCondition(t *testing.T) { } func TestMatchesSpan_ExistsAndNotExists(t *testing.T) { - exists := SpanCounterConfig{ + exists := SpanCounter{ Key: "has-field", Conditions: []*RulesBasedSamplerCondition{cond("db.query", Exists, nil)}, } - notExists := SpanCounterConfig{ + notExists := SpanCounter{ Key: "no-field", Conditions: []*RulesBasedSamplerCondition{cond("db.query", NotExists, nil)}, } diff --git a/config/validate.go b/config/validate.go index 41b0c135b7..92f89f5c33 100644 --- a/config/validate.go +++ b/config/validate.go @@ -653,6 +653,31 @@ func (m *Metadata) ValidateRules(data map[string]any) ValidationResults { } } hasSamplers = true + case "SpanCounters": + if arr, ok := v.([]any); !ok { + results = append(results, ValidationResult{ + Message: fmt.Sprintf("SpanCounters must be an array, but %v is %T", v, v), + Severity: Error, + }) + } else { + for i, entry := range arr { + if entryMap, ok := entry.(map[string]any); ok { + rulesmap := map[string]any{"SpanCounters": entryMap} + subresults := m.Validate(rulesmap) + for _, result := range subresults { + results = append(results, ValidationResult{ + Message: fmt.Sprintf("Within SpanCounters[%d]: %s", i, result.Message), + Severity: result.Severity, + }) + } + } else { + results = append(results, ValidationResult{ + Message: fmt.Sprintf("SpanCounters[%d] must be an object, but %v is %T", i, entry, entry), + Severity: Error, + }) + } + } + } default: results = append(results, ValidationResult{ Message: fmt.Sprintf("unknown top-level key %s", k), diff --git a/config_complete.yaml b/config_complete.yaml index beb7eaf4ba..290572bca6 100644 --- a/config_complete.yaml +++ b/config_complete.yaml @@ -2,7 +2,7 @@ ## Honeycomb Refinery Configuration ## ###################################### # -# created on 2026-02-25 at 20:49:27 UTC from ../../config.yaml using a template generated on 2026-02-25 at 20:49:24 UTC +# created on 2026-03-13 at 20:03:53 UTC from ../../config.yaml using a template generated on 2026-03-13 at 20:03:51 UTC # This file contains a configuration for the Honeycomb Refinery. It is in YAML # format, organized into named groups, each of which contains a set of diff --git a/metrics.md b/metrics.md index 4be5fcf988..bcee93bd04 100644 --- a/metrics.md +++ b/metrics.md @@ -3,7 +3,7 @@ # Honeycomb Refinery Metrics Documentation This document contains the description of various metrics used in Refinery. -It was automatically generated on 2026-02-25 at 20:49:27 UTC. +It was automatically generated on 2026-03-13 at 20:03:53 UTC. Note: This document does not include metrics defined in the dynsampler-go dependency, as those metrics are generated dynamically at runtime. As a result, certain metrics may be missing or incomplete in this document, but they will still be available during execution with their full names. diff --git a/refinery_rules.md b/refinery_rules.md index 5fcd2b9592..6a22a5d98c 100644 --- a/refinery_rules.md +++ b/refinery_rules.md @@ -671,3 +671,27 @@ If your traces are consistent lengths and changes in trace length is a useful in - Type: `bool` +## Custom Span Count Configuration + +Defines a single custom span counter. +Each counter has a Key that names the field written to the root span, and an optional list of Conditions that must all match for a span to be counted. +Spans are counted when all of the entry's Conditions match. +If Conditions is empty, every span in the trace is counted. +The counter value is written to the root span under the key specified by `Key`. +If no root span exists when the trace is sent, the counter is written to the first non-annotation span instead. + +### `Key` + +The name of the field that will be added to the root span. +Must not be empty. + +- Type: `string` + +### `Conditions` + +All conditions must match for a span to be counted. +If empty, every span in the trace is counted. +Uses the same condition format as rules-based sampler conditions. + +- Type: `objectarray` + diff --git a/rules.md b/rules.md index ee6023bc0e..e3059099d2 100644 --- a/rules.md +++ b/rules.md @@ -3,7 +3,7 @@ # Honeycomb Refinery Rules Documentation This is the documentation for the rules configuration for Honeycomb's Refinery. -It was automatically generated on 2026-02-25 at 20:49:27 UTC. +It was automatically generated on 2026-03-13 at 20:03:54 UTC. ## The Rules file @@ -55,6 +55,7 @@ The remainder of this document describes the samplers that can be used within th - [Rules for Rules-based Samplers](#rules-for-rules-based-samplers) - [Conditions for the Rules in Rules-based Samplers](#conditions-for-the-rules-in-rules-based-samplers) - [Total Throughput Sampler](#total-throughput-sampler) +- [Custom Span Count Configuration](#custom-span-count-configuration) --- ## Deterministic Sampler @@ -715,3 +716,30 @@ If your traces are consistent lengths and changes in trace length is a useful in Type: `bool` +--- +## Custom Span Count Configuration + +### Name: `SpanCounters` + +Defines a single custom span counter. +Each counter has a Key that names the field written to the root span, and an optional list of Conditions that must all match for a span to be counted. +Spans are counted when all of the entry's Conditions match. +If Conditions is empty, every span in the trace is counted. +The counter value is written to the root span under the key specified by `Key`. +If no root span exists when the trace is sent, the counter is written to the first non-annotation span instead. + +### `Key` + +The name of the field that will be added to the root span. +Must not be empty. + +Type: `string` + +### `Conditions` + +All conditions must match for a span to be counted. +If empty, every span in the trace is counted. +Uses the same condition format as rules-based sampler conditions. + +Type: `objectarray` + diff --git a/tools/convert/configDataNames.txt b/tools/convert/configDataNames.txt index 793d4bb1e7..5cc84b2595 100644 --- a/tools/convert/configDataNames.txt +++ b/tools/convert/configDataNames.txt @@ -1,5 +1,5 @@ # Names of groups and fields in the new config file format. -# Automatically generated on 2026-02-25 at 20:49:25 UTC. +# Automatically generated on 2026-03-13 at 20:03:51 UTC. General: - ConfigurationVersion diff --git a/tools/convert/minimal_config.yaml b/tools/convert/minimal_config.yaml index eb49635629..3c666d0511 100644 --- a/tools/convert/minimal_config.yaml +++ b/tools/convert/minimal_config.yaml @@ -1,5 +1,5 @@ # sample uncommented config file containing all possible fields -# automatically generated on 2026-02-25 at 20:49:25 UTC +# automatically generated on 2026-03-13 at 20:03:52 UTC General: ConfigurationVersion: 2 MinRefineryVersion: "v2.0" diff --git a/tools/convert/templates/configV2.tmpl b/tools/convert/templates/configV2.tmpl index 7f91a2f2a5..b2577f70b1 100644 --- a/tools/convert/templates/configV2.tmpl +++ b/tools/convert/templates/configV2.tmpl @@ -2,7 +2,7 @@ ## Honeycomb Refinery Configuration ## ###################################### # -# created {{ now }} from {{ .Input }} using a template generated on 2026-02-25 at 20:49:24 UTC +# created {{ now }} from {{ .Input }} using a template generated on 2026-03-13 at 20:03:51 UTC # This file contains a configuration for the Honeycomb Refinery. It is in YAML # format, organized into named groups, each of which contains a set of From 903d73974296c54217f3011632996400dcf5bd2f Mon Sep 17 00:00:00 2001 From: Zaq? Question Date: Mon, 16 Mar 2026 12:44:44 -0700 Subject: [PATCH 5/6] point to the sre-team project when building docker --- build-docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-docker.sh b/build-docker.sh index 8a5cfb2631..f1b9f8137d 100755 --- a/build-docker.sh +++ b/build-docker.sh @@ -3,7 +3,7 @@ set -o nounset set -o pipefail set -o xtrace -GCLOUD_REGISTRY="us-central1-docker.pkg.dev/khan-internal-services/refinery" +GCLOUD_REGISTRY="gcr.io/sre-team-418623" # Parse flags PUSH=false From d5e35cf57aa2855a37f6bb389c59e8aaa7c28fd4 Mon Sep 17 00:00:00 2001 From: Zaq? Question Date: Mon, 16 Mar 2026 16:04:04 -0700 Subject: [PATCH 6/6] fix(root): select the earliest span as the root --- collect/collect.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index 04772c51ac..be24512518 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -709,19 +709,23 @@ func (i *InMemCollector) initSpanCounters() { } // findSuitableRootSpan returns the root span of the trace if one is present. -// If no root span has been identified, it falls back to the first non-annotation -// span (i.e. not a span event or link). Returns nil if no suitable span exists. +// If no root span has been identified, it falls back to the non-annotation +// span (i.e. not a span event or link) with the earliest timestamp, which is +// the most likely root. Returns nil if no suitable span exists. func findSuitableRootSpan(t sendableTrace) *types.Span { if t.RootSpan != nil { return t.RootSpan } + var best *types.Span for _, sp := range t.GetSpans() { if sp.AnnotationType() != types.SpanAnnotationTypeSpanEvent && sp.AnnotationType() != types.SpanAnnotationTypeLink { - return sp + if best == nil || sp.Timestamp.Before(best.Timestamp) { + best = sp + } } } - return nil + return best } // computeCustomCounts computes each counter's value by iterating all spans in the trace