From 833c65b4c9564aa9faa8347de1c74ea13d56e05a Mon Sep 17 00:00:00 2001 From: Yu Long Date: Fri, 18 Apr 2025 18:53:55 +0000 Subject: [PATCH 01/40] design v2 --- pkg/query/aggregation_label_rewriter.go | 4 ++-- pkg/query/aggregation_label_rewriter_test.go | 21 ++++++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/pkg/query/aggregation_label_rewriter.go b/pkg/query/aggregation_label_rewriter.go index b856f58e42f..d5f6edeab40 100644 --- a/pkg/query/aggregation_label_rewriter.go +++ b/pkg/query/aggregation_label_rewriter.go @@ -15,7 +15,7 @@ import ( ) const ( - aggregationLabelName = "__rollup__" + aggregationLabelName = "__agg_rule_type__" ) type AggregationLabelRewriter struct { @@ -128,7 +128,7 @@ func (a *AggregationLabelRewriter) Rewrite(ms []*labels.Matcher) []*labels.Match if needsRewrite { newMatcher := &labels.Matcher{ Name: aggregationLabelName, - Type: labels.MatchEqual, + Type: labels.MatchRegexp, Value: a.desiredLabelValue, } if aggregationLabelMatcher != nil { diff --git a/pkg/query/aggregation_label_rewriter_test.go b/pkg/query/aggregation_label_rewriter_test.go index 038c3239a2e..7cdc2efec3f 100644 --- a/pkg/query/aggregation_label_rewriter_test.go +++ b/pkg/query/aggregation_label_rewriter_test.go @@ -41,20 +41,33 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { }, expectedMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), - labels.MustNewMatcher(labels.MatchEqual, "__rollup__", "5m"), + labels.MustNewMatcher(labels.MatchRegexp, aggregationLabelName, "5m"), }, expectedAddCount: 1, }, { - name: "should rewrite existing aggregation label for aggregated metric", + name: "should rewrite existing equal aggregation label for aggregated metric", desiredLabelValue: "5m", inputMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), - labels.MustNewMatcher(labels.MatchEqual, "__rollup__", "1h"), + labels.MustNewMatcher(labels.MatchEqual, aggregationLabelName, "1h"), }, expectedMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), - labels.MustNewMatcher(labels.MatchEqual, "__rollup__", "5m"), + labels.MustNewMatcher(labels.MatchRegexp, aggregationLabelName, "5m"), + }, + expectedRewriteMap: map[string]float64{"1h": 1}, + }, + { + name: "should rewrite existing regex aggregation label for aggregated metric", + desiredLabelValue: "5m", + inputMatchers: []*labels.Matcher{ + labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), + labels.MustNewMatcher(labels.MatchRegexp, aggregationLabelName, "1h"), + }, + expectedMatchers: []*labels.Matcher{ + labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), + labels.MustNewMatcher(labels.MatchRegexp, aggregationLabelName, "5m"), }, expectedRewriteMap: map[string]float64{"1h": 1}, }, From 289f48c24d233a5ebdf81a89bbad042be116e70a Mon Sep 17 00:00:00 2001 From: Yu Long Date: Fri, 18 Apr 2025 19:24:51 +0000 Subject: [PATCH 02/40] design v2 --- cmd/thanos/query.go | 10 +++++++++- pkg/query/aggregation_label_rewriter.go | 20 ++++++++++++------- pkg/query/aggregation_label_rewriter_test.go | 21 ++++++++++++++++++++ pkg/query/querier.go | 4 ++++ 4 files changed, 47 insertions(+), 8 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index c67d25ccb8e..51ec042154c 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -245,7 +245,9 @@ func registerQuery(app *extkingpin.App) { enforceTenancy := cmd.Flag("query.enforce-tenancy", "Enforce tenancy on 
Query APIs. Responses are returned only if the label value of the configured tenant-label-name and the value of the tenant header matches.").Default("false").Bool() tenantLabel := cmd.Flag("query.tenant-label-name", "Label name to use when enforcing tenancy (if --query.enforce-tenancy is enabled).").Default(tenancy.DefaultTenantLabel).String() - rewriteAggregationLabelTo := cmd.Flag("query.aggregation-label-value-override", "The value override for __rollup__ label for aggregated metrics. If set to x, all queries on aggregated metrics will have a __rollup__=x matcher. Leave empty to disable this behavior. Default is empty.").Default("").String() + rewriteAggregationLabelName := cmd.Flag("query.aggregation-label-name", "The name for aggregation label. See query.aggregation-label-value-override for details. Default is empty.").Default("").String() + rewriteAggregationLabelTo := cmd.Flag("query.aggregation-label-value-override", "The value override for aggregation label. If set to x, all queries on aggregated metrics will have a `rewriteAggregationLabelName=x` matcher. Leave either flag empty to disable this behavior. Default is empty.").Default("").String() + rewriteAggregationInsertOnly := cmd.Flag("query.aggregation-label-insert-only", "If enabled, the above rewriter only insert missing rewriteAggregationLabelName=x and does not modify existing aggregation labels if any. Default is false.").Default("false").Bool() var storeRateLimits store.SeriesSelectLimits storeRateLimits.RegisterFlags(cmd) @@ -386,7 +388,9 @@ func registerQuery(app *extkingpin.App) { *enforceTenancy, *tenantLabel, *enableGroupReplicaPartialStrategy, + *rewriteAggregationLabelName, *rewriteAggregationLabelTo, + *rewriteAggregationInsertOnly, ) }) } @@ -471,7 +475,9 @@ func runQuery( enforceTenancy bool, tenantLabel string, groupReplicaPartialResponseStrategy bool, + rewriteAggregationLabelName string, rewriteAggregationLabelTo string, + rewriteAggregationInsertOnly bool, ) error { comp := component.Query if alertQueryURL == "" { @@ -601,6 +607,8 @@ func runQuery( opts := query.Options{ GroupReplicaPartialResponseStrategy: groupReplicaPartialResponseStrategy, DeduplicationFunc: queryDeduplicationFunc, + RewriteAggregationInsertOnly: rewriteAggregationInsertOnly, + RewriteAggregationLabelName: rewriteAggregationLabelName, RewriteAggregationLabelTo: rewriteAggregationLabelTo, } level.Info(logger).Log("msg", "databricks querier features", "opts", fmt.Sprintf("%+v", opts)) diff --git a/pkg/query/aggregation_label_rewriter.go b/pkg/query/aggregation_label_rewriter.go index d5f6edeab40..76d4659fe6c 100644 --- a/pkg/query/aggregation_label_rewriter.go +++ b/pkg/query/aggregation_label_rewriter.go @@ -14,15 +14,13 @@ import ( "github.com/prometheus/prometheus/model/labels" ) -const ( - aggregationLabelName = "__agg_rule_type__" -) - type AggregationLabelRewriter struct { logger log.Logger metrics *aggregationLabelRewriterMetrics enabled bool + insertOnly bool + labelKey string desiredLabelValue string } @@ -74,7 +72,7 @@ func NewNopAggregationLabelRewriter() *AggregationLabelRewriter { } } -func NewAggregationLabelRewriter(logger log.Logger, reg prometheus.Registerer, desiredLabelValue string) *AggregationLabelRewriter { +func NewAggregationLabelRewriter(logger log.Logger, reg prometheus.Registerer, labelKey string, desiredLabelValue string, insertOnly bool) *AggregationLabelRewriter { if logger == nil { logger = log.NewNopLogger() } @@ -82,6 +80,8 @@ func NewAggregationLabelRewriter(logger log.Logger, reg prometheus.Registerer, d 
enabled: desiredLabelValue != "", logger: logger, metrics: newAggregationLabelRewriterMetrics(reg, desiredLabelValue), + insertOnly: insertOnly, + labelKey: labelKey, desiredLabelValue: desiredLabelValue, } } @@ -118,16 +118,22 @@ func (a *AggregationLabelRewriter) Rewrite(ms []*labels.Matcher) []*labels.Match break } // In any case, if we see an aggregation label, we store that for later use - } else if m.Name == aggregationLabelName { + } else if m.Name == a.labelKey { aggregationLabelMatcher = m aggregationLabelIndex = i } } + + if aggregationLabelMatcher != nil && a.insertOnly { + needsRewrite = false + skipReason = "insert-only" + } + // After the for loop, if needsRewrite is false, no need to do anything // but if it is true, we either append or modify an aggregation label if needsRewrite { newMatcher := &labels.Matcher{ - Name: aggregationLabelName, + Name: a.labelKey, Type: labels.MatchRegexp, Value: a.desiredLabelValue, } diff --git a/pkg/query/aggregation_label_rewriter_test.go b/pkg/query/aggregation_label_rewriter_test.go index 7cdc2efec3f..37446e2f3ee 100644 --- a/pkg/query/aggregation_label_rewriter_test.go +++ b/pkg/query/aggregation_label_rewriter_test.go @@ -11,12 +11,17 @@ import ( "github.com/prometheus/prometheus/model/labels" ) +const ( + aggregationLabelName = "__agg_rule_type__" +) + func TestAggregationLabelRewriter_Rewrite(t *testing.T) { t.Parallel() for _, tc := range []struct { name string desiredLabelValue string // Empty means disabled + insertOnly bool inputMatchers []*labels.Matcher expectedMatchers []*labels.Matcher expectedSkipCount float64 @@ -104,13 +109,29 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { }, expectedSkipCount: 1, }, + { + name: "if insert only, should NOT rewrite existing aggregation label for aggregated metric", + desiredLabelValue: "5m", + insertOnly: true, + inputMatchers: []*labels.Matcher{ + labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), + labels.MustNewMatcher(labels.MatchEqual, aggregationLabelName, "1h"), + }, + expectedMatchers: []*labels.Matcher{ + labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), + labels.MustNewMatcher(labels.MatchEqual, aggregationLabelName, "1h"), + }, + expectedSkipCount: 1, + }, } { t.Run(tc.name, func(t *testing.T) { reg := prometheus.NewRegistry() rewriter := NewAggregationLabelRewriter( nil, reg, + aggregationLabelName, tc.desiredLabelValue, + tc.insertOnly, ) result := rewriter.Rewrite(tc.inputMatchers) diff --git a/pkg/query/querier.go b/pkg/query/querier.go index 953aa135beb..428ced4cb2f 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -61,7 +61,9 @@ type QueryableCreator func( type Options struct { GroupReplicaPartialResponseStrategy bool DeduplicationFunc string + RewriteAggregationLabelName string RewriteAggregationLabelTo string + RewriteAggregationInsertOnly bool } // NewQueryableCreator creates QueryableCreator. 
@@ -94,7 +96,9 @@ func NewQueryableCreatorWithOptions( aggregationLabelRewriter := NewAggregationLabelRewriter( logger, extprom.WrapRegistererWithPrefix("aggregation_label_rewriter_", reg), + opts.RewriteAggregationLabelName, opts.RewriteAggregationLabelTo, + opts.RewriteAggregationInsertOnly, ) return func( deduplicate bool, From 2b03e6bbce59da215ca622fc8df9a96633bdbf9f Mon Sep 17 00:00:00 2001 From: Yu Long Date: Fri, 18 Apr 2025 19:32:02 +0000 Subject: [PATCH 03/40] design v2 --- pkg/query/aggregation_label_rewriter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/query/aggregation_label_rewriter.go b/pkg/query/aggregation_label_rewriter.go index 76d4659fe6c..984862579f6 100644 --- a/pkg/query/aggregation_label_rewriter.go +++ b/pkg/query/aggregation_label_rewriter.go @@ -77,7 +77,7 @@ func NewAggregationLabelRewriter(logger log.Logger, reg prometheus.Registerer, l logger = log.NewNopLogger() } return &AggregationLabelRewriter{ - enabled: desiredLabelValue != "", + enabled: desiredLabelValue != "" || labelKey != "", logger: logger, metrics: newAggregationLabelRewriterMetrics(reg, desiredLabelValue), insertOnly: insertOnly, From 55a240a5dd52e44d6b5ae45966b3d628703138b6 Mon Sep 17 00:00:00 2001 From: Yu Long Date: Fri, 18 Apr 2025 19:33:43 +0000 Subject: [PATCH 04/40] design v2 --- pkg/query/aggregation_label_rewriter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/query/aggregation_label_rewriter.go b/pkg/query/aggregation_label_rewriter.go index 984862579f6..e0eec211e66 100644 --- a/pkg/query/aggregation_label_rewriter.go +++ b/pkg/query/aggregation_label_rewriter.go @@ -77,7 +77,7 @@ func NewAggregationLabelRewriter(logger log.Logger, reg prometheus.Registerer, l logger = log.NewNopLogger() } return &AggregationLabelRewriter{ - enabled: desiredLabelValue != "" || labelKey != "", + enabled: desiredLabelValue != "" && labelKey != "", logger: logger, metrics: newAggregationLabelRewriterMetrics(reg, desiredLabelValue), insertOnly: insertOnly, From 66ea257828262f7e764816e66cc0663db4516d22 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Sat, 19 Apr 2025 12:01:27 -0700 Subject: [PATCH 05/40] do not forward partial response strategy in proxy store --- pkg/store/proxy.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index 47e73a3013a..367737d268d 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -347,6 +347,11 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. ShardInfo: originalRequest.ShardInfo, WithoutReplicaLabels: originalRequest.WithoutReplicaLabels, } + if originalRequest.PartialResponseStrategy == storepb.PartialResponseStrategy_GROUP_REPLICA { + // Do not forward this field as it might cause data loss. + r.PartialResponseDisabled = true + r.PartialResponseStrategy = storepb.PartialResponseStrategy_ABORT + } storeResponses := make([]respSet, 0, len(stores)) @@ -395,14 +400,14 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. 
s.metrics.storeFailureCount.WithLabelValues(st.GroupKey(), st.ReplicaKey()).Inc() bumpCounter(st.GroupKey(), st.ReplicaKey(), failedStores) totalFailedStores++ - if r.PartialResponseStrategy == storepb.PartialResponseStrategy_GROUP_REPLICA { + if originalRequest.PartialResponseStrategy == storepb.PartialResponseStrategy_GROUP_REPLICA { if checkGroupReplicaErrors(st, err) != nil { return err } continue } level.Error(reqLogger).Log("err", err) - if !r.PartialResponseDisabled || r.PartialResponseStrategy == storepb.PartialResponseStrategy_WARN { + if !originalRequest.PartialResponseDisabled || originalRequest.PartialResponseStrategy == storepb.PartialResponseStrategy_WARN { if err := srv.Send(storepb.NewWarnSeriesResponse(err)); err != nil { return err } @@ -438,7 +443,7 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. level.Error(s.logger).Log("msg", "Store failure with warning", "warning", warning) // Don't have group/replica keys here, so we can't attribute the warning to a specific store. s.metrics.storeFailureCount.WithLabelValues("", "").Inc() - if r.PartialResponseStrategy == storepb.PartialResponseStrategy_GROUP_REPLICA { + if originalRequest.PartialResponseStrategy == storepb.PartialResponseStrategy_GROUP_REPLICA { // The first error message is from AWS S3 and the second one is from Azure Blob Storage. if strings.Contains(resp.GetWarning(), "The specified key does not exist") || strings.Contains(resp.GetWarning(), "The specified blob does not exist") { level.Warn(s.logger).Log("msg", "Ignore 'the specified key/blob does not exist' error from Store") @@ -459,7 +464,7 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. } firstWarning = &warning } - } else if r.PartialResponseDisabled || r.PartialResponseStrategy == storepb.PartialResponseStrategy_ABORT { + } else if originalRequest.PartialResponseDisabled || originalRequest.PartialResponseStrategy == storepb.PartialResponseStrategy_ABORT { return status.Error(codes.Aborted, resp.GetWarning()) } } From 8821811ec6f4b90f93f6cc55b1fd07c8c42a8452 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Sat, 19 Apr 2025 18:35:57 -0700 Subject: [PATCH 06/40] fix coroutine leak --- pkg/store/proxy_merge.go | 4 ++-- pkg/store/storepb/inprocess.go | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pkg/store/proxy_merge.go b/pkg/store/proxy_merge.go index 378ac789881..34749570662 100644 --- a/pkg/store/proxy_merge.go +++ b/pkg/store/proxy_merge.go @@ -558,13 +558,13 @@ func newAsyncRespSet( func (l *lazyRespSet) Close() { l.bufferedResponsesMtx.Lock() defer l.bufferedResponsesMtx.Unlock() + _ = l.cl.CloseSend() l.closeSeries() l.noMoreData = true l.dataOrFinishEvent.Signal() l.shardMatcher.Close() - _ = l.cl.CloseSend() } // eagerRespSet is a SeriesSet that blocks until all data is retrieved from @@ -750,11 +750,11 @@ func sortWithoutLabels(set []*storepb.SeriesResponse, labelsToRemove map[string] } func (l *eagerRespSet) Close() { + _ = l.cl.CloseSend() if l.closeSeries != nil { l.closeSeries() } l.shardMatcher.Close() - _ = l.cl.CloseSend() } func (l *eagerRespSet) At() *storepb.SeriesResponse { diff --git a/pkg/store/storepb/inprocess.go b/pkg/store/storepb/inprocess.go index 0c3e7641baa..76a0d245640 100644 --- a/pkg/store/storepb/inprocess.go +++ b/pkg/store/storepb/inprocess.go @@ -7,6 +7,7 @@ import ( "context" "io" "iter" + "sync" "google.golang.org/grpc" ) @@ -38,6 +39,7 @@ type inProcessClient struct { ctx context.Context next func() (*SeriesResponse, 
error, bool) stop func() + mu sync.Mutex } func newInProcessClient(ctx context.Context, next func() (*SeriesResponse, error, bool), stop func()) *inProcessClient { @@ -45,11 +47,14 @@ func newInProcessClient(ctx context.Context, next func() (*SeriesResponse, error ctx: ctx, next: next, stop: stop, + mu: sync.Mutex{}, } } func (c *inProcessClient) Recv() (*SeriesResponse, error) { + c.mu.Lock() resp, err, ok := c.next() + c.mu.Unlock() if err != nil { c.stop() return nil, err @@ -68,7 +73,9 @@ func (c *inProcessClient) Context() context.Context { } func (c *inProcessClient) CloseSend() error { + c.mu.Lock() c.stop() + c.mu.Unlock() return nil } From ba19b891171750d0ad0179a406ebcec5c9d52a29 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Sat, 19 Apr 2025 18:41:36 -0700 Subject: [PATCH 07/40] use defer to unlock --- pkg/store/storepb/inprocess.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/store/storepb/inprocess.go b/pkg/store/storepb/inprocess.go index 76a0d245640..d76301b42c2 100644 --- a/pkg/store/storepb/inprocess.go +++ b/pkg/store/storepb/inprocess.go @@ -53,8 +53,8 @@ func newInProcessClient(ctx context.Context, next func() (*SeriesResponse, error func (c *inProcessClient) Recv() (*SeriesResponse, error) { c.mu.Lock() + defer c.mu.Unlock() resp, err, ok := c.next() - c.mu.Unlock() if err != nil { c.stop() return nil, err @@ -74,8 +74,8 @@ func (c *inProcessClient) Context() context.Context { func (c *inProcessClient) CloseSend() error { c.mu.Lock() + defer c.mu.Unlock() c.stop() - c.mu.Unlock() return nil } From 865c60c182f9ab96c44258f7f58c066a9a5c1eb6 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Sat, 19 Apr 2025 22:20:15 -0700 Subject: [PATCH 08/40] revert close order change --- pkg/store/proxy_merge.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/store/proxy_merge.go b/pkg/store/proxy_merge.go index 34749570662..378ac789881 100644 --- a/pkg/store/proxy_merge.go +++ b/pkg/store/proxy_merge.go @@ -558,13 +558,13 @@ func newAsyncRespSet( func (l *lazyRespSet) Close() { l.bufferedResponsesMtx.Lock() defer l.bufferedResponsesMtx.Unlock() - _ = l.cl.CloseSend() l.closeSeries() l.noMoreData = true l.dataOrFinishEvent.Signal() l.shardMatcher.Close() + _ = l.cl.CloseSend() } // eagerRespSet is a SeriesSet that blocks until all data is retrieved from @@ -750,11 +750,11 @@ func sortWithoutLabels(set []*storepb.SeriesResponse, labelsToRemove map[string] } func (l *eagerRespSet) Close() { - _ = l.cl.CloseSend() if l.closeSeries != nil { l.closeSeries() } l.shardMatcher.Close() + _ = l.cl.CloseSend() } func (l *eagerRespSet) At() *storepb.SeriesResponse { From 08b1526ece8a314aa006bf8cb488f5c8bf065d0b Mon Sep 17 00:00:00 2001 From: Yu Long Date: Tue, 22 Apr 2025 20:20:25 +0000 Subject: [PATCH 09/40] address feedback --- cmd/thanos/query.go | 14 +++---- pkg/query/aggregation_label_rewriter.go | 44 +++++++++++++------- pkg/query/aggregation_label_rewriter_test.go | 29 +++++++++---- pkg/query/querier.go | 6 +-- 4 files changed, 57 insertions(+), 36 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 51ec042154c..f952138de26 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -245,9 +245,8 @@ func registerQuery(app *extkingpin.App) { enforceTenancy := cmd.Flag("query.enforce-tenancy", "Enforce tenancy on Query APIs. 
Responses are returned only if the label value of the configured tenant-label-name and the value of the tenant header matches.").Default("false").Bool() tenantLabel := cmd.Flag("query.tenant-label-name", "Label name to use when enforcing tenancy (if --query.enforce-tenancy is enabled).").Default(tenancy.DefaultTenantLabel).String() - rewriteAggregationLabelName := cmd.Flag("query.aggregation-label-name", "The name for aggregation label. See query.aggregation-label-value-override for details. Default is empty.").Default("").String() - rewriteAggregationLabelTo := cmd.Flag("query.aggregation-label-value-override", "The value override for aggregation label. If set to x, all queries on aggregated metrics will have a `rewriteAggregationLabelName=x` matcher. Leave either flag empty to disable this behavior. Default is empty.").Default("").String() - rewriteAggregationInsertOnly := cmd.Flag("query.aggregation-label-insert-only", "If enabled, the above rewriter only insert missing rewriteAggregationLabelName=x and does not modify existing aggregation labels if any. Default is false.").Default("false").Bool() + rewriteAggregationLabelStrategy := cmd.Flag("query.aggregation-label-strategy", "The strategy to use when rewriting aggregation labels. Used during aggregator migration only.").Default(string(query.NoopLabelRewriter)).Hidden().Enum(string(query.NoopLabelRewriter), string(query.UpsertLabelRewriter), string(query.InsertOnlyLabelRewriter)) + rewriteAggregationLabelTo := cmd.Flag("query.aggregation-label-value-override", "The value override for aggregation label. If set to x, all queries on aggregated metrics will have a `__agg_rule_type__=x` matcher. If empty, this behavior is disabled. Default is empty.").Hidden().Default("").String() var storeRateLimits store.SeriesSelectLimits storeRateLimits.RegisterFlags(cmd) @@ -388,9 +387,8 @@ func registerQuery(app *extkingpin.App) { *enforceTenancy, *tenantLabel, *enableGroupReplicaPartialStrategy, - *rewriteAggregationLabelName, + *rewriteAggregationLabelStrategy, *rewriteAggregationLabelTo, - *rewriteAggregationInsertOnly, ) }) } @@ -475,9 +473,8 @@ func runQuery( enforceTenancy bool, tenantLabel string, groupReplicaPartialResponseStrategy bool, - rewriteAggregationLabelName string, + rewriteAggregationLabelStrategy string, rewriteAggregationLabelTo string, - rewriteAggregationInsertOnly bool, ) error { comp := component.Query if alertQueryURL == "" { @@ -607,8 +604,7 @@ func runQuery( opts := query.Options{ GroupReplicaPartialResponseStrategy: groupReplicaPartialResponseStrategy, DeduplicationFunc: queryDeduplicationFunc, - RewriteAggregationInsertOnly: rewriteAggregationInsertOnly, - RewriteAggregationLabelName: rewriteAggregationLabelName, + RewriteAggregationLabelStrategy: rewriteAggregationLabelStrategy, RewriteAggregationLabelTo: rewriteAggregationLabelTo, } level.Info(logger).Log("msg", "databricks querier features", "opts", fmt.Sprintf("%+v", opts)) diff --git a/pkg/query/aggregation_label_rewriter.go b/pkg/query/aggregation_label_rewriter.go index e0eec211e66..e17d8d3915f 100644 --- a/pkg/query/aggregation_label_rewriter.go +++ b/pkg/query/aggregation_label_rewriter.go @@ -14,13 +14,26 @@ import ( "github.com/prometheus/prometheus/model/labels" ) -type AggregationLabelRewriter struct { - logger log.Logger - metrics *aggregationLabelRewriterMetrics +// RewriterStrategy defines the strategy used by the AggregationLabelRewriter. 
+type RewriterStrategy string + +const ( + // NoopLabelRewriter is a no-op strategy that basically disables the rewriter. + NoopLabelRewriter RewriterStrategy = "noop" + // UpsertLabelRewriter is a strategy that upserts the aggregation label. + UpsertLabelRewriter RewriterStrategy = "upsert" + // InsertOnlyLabelRewriter is a strategy that only inserts the aggregation label if it does not exist. + InsertOnlyLabelRewriter RewriterStrategy = "insert-only" +) + +const ( + aggregationLabelName = "__agg_rule_type__" +) - enabled bool - insertOnly bool - labelKey string +type AggregationLabelRewriter struct { + logger log.Logger + metrics *aggregationLabelRewriterMetrics + strategy RewriterStrategy desiredLabelValue string } @@ -68,26 +81,27 @@ func newAggregationLabelRewriterMetrics(reg prometheus.Registerer, desiredLabelV func NewNopAggregationLabelRewriter() *AggregationLabelRewriter { return &AggregationLabelRewriter{ - enabled: false, + strategy: NoopLabelRewriter, } } -func NewAggregationLabelRewriter(logger log.Logger, reg prometheus.Registerer, labelKey string, desiredLabelValue string, insertOnly bool) *AggregationLabelRewriter { +func NewAggregationLabelRewriter(logger log.Logger, reg prometheus.Registerer, strategy RewriterStrategy, desiredLabelValue string) *AggregationLabelRewriter { if logger == nil { logger = log.NewNopLogger() } + if desiredLabelValue == "" { + strategy = NoopLabelRewriter + } return &AggregationLabelRewriter{ - enabled: desiredLabelValue != "" && labelKey != "", + strategy: strategy, logger: logger, metrics: newAggregationLabelRewriterMetrics(reg, desiredLabelValue), - insertOnly: insertOnly, - labelKey: labelKey, desiredLabelValue: desiredLabelValue, } } func (a *AggregationLabelRewriter) Rewrite(ms []*labels.Matcher) []*labels.Matcher { - if !a.enabled { + if a.strategy == NoopLabelRewriter { return ms } @@ -118,13 +132,13 @@ func (a *AggregationLabelRewriter) Rewrite(ms []*labels.Matcher) []*labels.Match break } // In any case, if we see an aggregation label, we store that for later use - } else if m.Name == a.labelKey { + } else if m.Name == aggregationLabelName { aggregationLabelMatcher = m aggregationLabelIndex = i } } - if aggregationLabelMatcher != nil && a.insertOnly { + if aggregationLabelMatcher != nil && a.strategy == InsertOnlyLabelRewriter { needsRewrite = false skipReason = "insert-only" } @@ -133,7 +147,7 @@ func (a *AggregationLabelRewriter) Rewrite(ms []*labels.Matcher) []*labels.Match // but if it is true, we either append or modify an aggregation label if needsRewrite { newMatcher := &labels.Matcher{ - Name: a.labelKey, + Name: aggregationLabelName, Type: labels.MatchRegexp, Value: a.desiredLabelValue, } diff --git a/pkg/query/aggregation_label_rewriter_test.go b/pkg/query/aggregation_label_rewriter_test.go index 37446e2f3ee..d216d063e39 100644 --- a/pkg/query/aggregation_label_rewriter_test.go +++ b/pkg/query/aggregation_label_rewriter_test.go @@ -11,17 +11,13 @@ import ( "github.com/prometheus/prometheus/model/labels" ) -const ( - aggregationLabelName = "__agg_rule_type__" -) - func TestAggregationLabelRewriter_Rewrite(t *testing.T) { t.Parallel() for _, tc := range []struct { name string desiredLabelValue string // Empty means disabled - insertOnly bool + strategy RewriterStrategy inputMatchers []*labels.Matcher expectedMatchers []*labels.Matcher expectedSkipCount float64 @@ -30,7 +26,19 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { }{ { name: "disabled rewriter should not modify label matchers", + desiredLabelValue: "v1", + 
strategy: NoopLabelRewriter, + inputMatchers: []*labels.Matcher{ + labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), + }, + expectedMatchers: []*labels.Matcher{ + labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), + }, + }, + { + name: "no desired label value makes a disabled rewriter and should not modify label matchers", desiredLabelValue: "", + strategy: UpsertLabelRewriter, inputMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), }, @@ -41,6 +49,7 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { { name: "should add label for aggregated metric if no existing aggregation label", desiredLabelValue: "5m", + strategy: UpsertLabelRewriter, inputMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), }, @@ -53,6 +62,7 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { { name: "should rewrite existing equal aggregation label for aggregated metric", desiredLabelValue: "5m", + strategy: UpsertLabelRewriter, inputMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), labels.MustNewMatcher(labels.MatchEqual, aggregationLabelName, "1h"), @@ -66,6 +76,7 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { { name: "should rewrite existing regex aggregation label for aggregated metric", desiredLabelValue: "5m", + strategy: UpsertLabelRewriter, inputMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), labels.MustNewMatcher(labels.MatchRegexp, aggregationLabelName, "1h"), @@ -79,6 +90,7 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { { name: "should skip non-aggregated metric", desiredLabelValue: "5m", + strategy: UpsertLabelRewriter, inputMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "__name__", "test_metric"), }, @@ -90,6 +102,7 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { { name: "should skip non-equal name matcher", desiredLabelValue: "5m", + strategy: UpsertLabelRewriter, inputMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchRegexp, "__name__", "test:sum"), }, @@ -101,6 +114,7 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { { name: "should skip when no name matcher", desiredLabelValue: "5m", + strategy: UpsertLabelRewriter, inputMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "job", "prometheus"), }, @@ -112,7 +126,7 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { { name: "if insert only, should NOT rewrite existing aggregation label for aggregated metric", desiredLabelValue: "5m", - insertOnly: true, + strategy: InsertOnlyLabelRewriter, inputMatchers: []*labels.Matcher{ labels.MustNewMatcher(labels.MatchEqual, "__name__", "test:sum"), labels.MustNewMatcher(labels.MatchEqual, aggregationLabelName, "1h"), @@ -129,9 +143,8 @@ func TestAggregationLabelRewriter_Rewrite(t *testing.T) { rewriter := NewAggregationLabelRewriter( nil, reg, - aggregationLabelName, + tc.strategy, tc.desiredLabelValue, - tc.insertOnly, ) result := rewriter.Rewrite(tc.inputMatchers) diff --git a/pkg/query/querier.go b/pkg/query/querier.go index 428ced4cb2f..1d8d9cb6d3f 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -61,9 +61,8 @@ type QueryableCreator func( type Options struct { GroupReplicaPartialResponseStrategy bool DeduplicationFunc string - RewriteAggregationLabelName string + RewriteAggregationLabelStrategy string RewriteAggregationLabelTo string - 
RewriteAggregationInsertOnly bool } // NewQueryableCreator creates QueryableCreator. @@ -96,9 +95,8 @@ func NewQueryableCreatorWithOptions( aggregationLabelRewriter := NewAggregationLabelRewriter( logger, extprom.WrapRegistererWithPrefix("aggregation_label_rewriter_", reg), - opts.RewriteAggregationLabelName, + RewriterStrategy(opts.RewriteAggregationLabelStrategy), opts.RewriteAggregationLabelTo, - opts.RewriteAggregationInsertOnly, ) return func( deduplicate bool, From e89c45d8fbf7d84fe3d889814e18e7f064dd1839 Mon Sep 17 00:00:00 2001 From: Filip Petkovski Date: Thu, 28 Nov 2024 18:15:20 +0100 Subject: [PATCH 10/40] cherry pick upstream PR 7945 --- go.mod | 4 ++-- go.sum | 18 ++++++++---------- pkg/receive/writecapnp/client.go | 15 +-------------- pkg/receive/writecapnp/marshal.go | 21 +++++++++++++-------- 4 files changed, 24 insertions(+), 34 deletions(-) diff --git a/go.mod b/go.mod index f08a49b2abc..51c0b60f1d2 100644 --- a/go.mod +++ b/go.mod @@ -112,7 +112,7 @@ require ( ) require ( - capnproto.org/go/capnp/v3 v3.0.1-alpha.2.0.20240830165715-46ccd63a72af + capnproto.org/go/capnp/v3 v3.0.0-alpha.30 github.com/Azure/azure-sdk-for-go/sdk/storage/azdatalake v1.4.0 github.com/cortexproject/promqlsmith v0.0.0-20240506042652-6cfdd9739a5e github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 @@ -135,7 +135,6 @@ require ( github.com/HdrHistogram/hdrhistogram-go v1.1.2 // indirect github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3 // indirect github.com/cilium/ebpf v0.11.0 // indirect - github.com/colega/zeropool v0.0.0-20230505084239-6fb4a4f75381 // indirect github.com/containerd/cgroups/v3 v3.0.3 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/elastic/go-licenser v0.3.1 // indirect @@ -169,6 +168,7 @@ require ( k8s.io/client-go v0.31.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect + zenhack.net/go/util v0.0.0-20230414204917-531d38494cf5 // indirect ) require ( diff --git a/go.sum b/go.sum index e397c08d1dc..ef5d32d076f 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -capnproto.org/go/capnp/v3 v3.0.1-alpha.2.0.20240830165715-46ccd63a72af h1:A5wxH0ZidOtYYUGjhtBaRuB87M73bGfc06uWB8sHpg0= -capnproto.org/go/capnp/v3 v3.0.1-alpha.2.0.20240830165715-46ccd63a72af/go.mod h1:2vT5D2dtG8sJGEoEKU17e+j7shdaYp1Myl8X03B3hmc= +capnproto.org/go/capnp/v3 v3.0.0-alpha.30 h1:iABQan/YiHFCgSXym5aNj27osapnEgAk4WaWYqb4sQM= +capnproto.org/go/capnp/v3 v3.0.0-alpha.30/go.mod h1:+ysMHvOh1EWNOyorxJWs1omhRFiDoKxKkWQACp54jKM= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= @@ -1506,8 +1506,6 @@ github.com/cncf/xds/go v0.0.0-20231128003011-0fa0005c9caa/go.mod h1:x/1Gn8zydmfq github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b h1:ga8SEFjZ60pxLcmhnThWgvH2wg8376yUJmPhEH4H3kw= github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI= -github.com/colega/zeropool v0.0.0-20230505084239-6fb4a4f75381 h1:d5EKgQfRQvO97jnISfR89AiCCCJMwMFoSxUiU0OGCRU= -github.com/colega/zeropool v0.0.0-20230505084239-6fb4a4f75381/go.mod h1:OU76gHeRo8xrzGJU3F3I1CqX1ekM8dfJw0+wPeMwnp0= github.com/containerd/cgroups/v3 v3.0.3 
h1:S5ByHZ/h9PMe5IOQoN7E+nMc2UcLEM/V48DGDJ9kip0= github.com/containerd/cgroups/v3 v3.0.3/go.mod h1:8HBe7V3aWGLFPd/k03swSIsGjZhHI2WzJmticMgVuz0= github.com/coreos/go-systemd/v22 v22.4.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= @@ -2116,8 +2114,8 @@ github.com/ovh/go-ovh v1.6.0 h1:ixLOwxQdzYDx296sXcgS35TOPEahJkpjMGtzPadCjQI= github.com/ovh/go-ovh v1.6.0/go.mod h1:cTVDnl94z4tl8pP1uZ/8jlVxntjSIf09bNcQ5TJSC7c= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y= -github.com/philhofer/fwd v1.1.2 h1:bnDivRJ1EWPjUIRXV5KfORO897HTbpFAQddBdE8t7Gw= -github.com/philhofer/fwd v1.1.2/go.mod h1:qkPdfjR2SIEbspLqpe1tO4n5yICnr2DY7mqEx2tUTP0= +github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ= +github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU= github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY= github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= github.com/phpdave11/gofpdi v1.0.13/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= @@ -2276,10 +2274,8 @@ github.com/thanos-io/promql-engine v0.0.0-20241106100125-097e6e9f425a h1:BhWU58V github.com/thanos-io/promql-engine v0.0.0-20241106100125-097e6e9f425a/go.mod h1:wx0JlRZtsB2S10JYUgeg5GqLfMxw31SzArP+28yyE00= github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab h1:7ZR3hmisBWw77ZpO1/o86g+JV3VKlk3d48jopJxzTjU= github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab/go.mod h1:eheTFp954zcWZXCU8d0AT76ftsQOTo4DTqkN/h3k1MY= -github.com/tinylib/msgp v1.1.9 h1:SHf3yoO2sGA0veCJeCBYLHuttAVFHGm2RHgNodW7wQU= -github.com/tinylib/msgp v1.1.9/go.mod h1:BCXGB54lDD8qUEPmiG0cQQUANC4IUQyB2ItS2UDlO/k= -github.com/tj/assert v0.0.3 h1:Df/BlaZ20mq6kuai7f5z2TvPFiwC3xaWJSDQNiIS3Rk= -github.com/tj/assert v0.0.3/go.mod h1:Ne6X72Q+TB1AteidzQncjw9PabbMp4PBMZ1k+vd1Pvk= +github.com/tinylib/msgp v1.1.5 h1:2gXmtWueD2HefZHQe1QOy9HVzmFrLOVvsXwXBQ0ayy0= +github.com/tinylib/msgp v1.1.5/go.mod h1:eQsjooMTnV42mHu917E26IogZ2930nFyBQdofk10Udg= github.com/tklauser/go-sysconf v0.3.4/go.mod h1:Cl2c8ZRWfHD5IrfHo9VN+FX9kCFjIOyVklgXycLB6ek= github.com/tklauser/go-sysconf v0.3.10 h1:IJ1AZGZRWbY8T5Vfk04D9WOA5WSejdflXxP03OUqALw= github.com/tklauser/go-sysconf v0.3.10/go.mod h1:C8XykCvCb+Gn0oNCWPIlcb0RuglQTYaQ2hGm7jmxEFk= @@ -3350,3 +3346,5 @@ sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+s sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= +zenhack.net/go/util v0.0.0-20230414204917-531d38494cf5 h1:yksDCGMVzyn3vlyf0GZ3huiF5FFaMGQpQ3UJvR0EoGA= +zenhack.net/go/util v0.0.0-20230414204917-531d38494cf5/go.mod h1:1LtNdPAs8WH+BTcQiZAOo2MIKD/5jyK/u7sZ9ZPe5SE= diff --git a/pkg/receive/writecapnp/client.go b/pkg/receive/writecapnp/client.go index 0a20d90d44d..3cd9f2d0820 100644 --- a/pkg/receive/writecapnp/client.go +++ b/pkg/receive/writecapnp/client.go @@ -69,22 +69,9 @@ func (r *RemoteWriteClient) writeWithReconnect(ctx context.Context, numReconnect if err := r.connect(ctx); err != nil { return nil, err } - arena := capnp.SingleSegment(nil) - defer arena.Release() result, release := r.writer.Write(ctx, func(params Writer_write_Params) error { - _, 
seg, err := capnp.NewMessage(arena) - if err != nil { - return err - } - wr, err := NewRootWriteRequest(seg) - if err != nil { - return err - } - if err := params.SetWr(wr); err != nil { - return err - } - wr, err = params.Wr() + wr, err := params.NewWr() if err != nil { return err } diff --git a/pkg/receive/writecapnp/marshal.go b/pkg/receive/writecapnp/marshal.go index 2d42d60b849..efc1a8ef038 100644 --- a/pkg/receive/writecapnp/marshal.go +++ b/pkg/receive/writecapnp/marshal.go @@ -6,6 +6,8 @@ package writecapnp import ( "capnproto.org/go/capnp/v3" + "github.com/pkg/errors" + "github.com/thanos-io/thanos/pkg/store/labelpb" "github.com/thanos-io/thanos/pkg/store/storepb/prompb" ) @@ -46,7 +48,7 @@ func Build(tenant string, tsreq []prompb.TimeSeries) (WriteRequest, error) { func BuildInto(wr WriteRequest, tenant string, tsreq []prompb.TimeSeries) error { if err := wr.SetTenant(tenant); err != nil { - return err + return errors.Wrap(err, "set tenant") } series, err := wr.NewTimeSeries(int32(len(tsreq))) @@ -59,27 +61,30 @@ func BuildInto(wr WriteRequest, tenant string, tsreq []prompb.TimeSeries) error lblsc, err := tsc.NewLabels(int32(len(ts.Labels))) if err != nil { - return err + return errors.Wrap(err, "new labels") } if err := marshalLabels(lblsc, ts.Labels, builder); err != nil { - return err + return errors.Wrap(err, "marshal labels") } if err := marshalSamples(tsc, ts.Samples); err != nil { - return err + return errors.Wrap(err, "marshal samples") } if err := marshalHistograms(tsc, ts.Histograms); err != nil { - return err + return errors.Wrap(err, "marshal histograms") } if err := marshalExemplars(tsc, ts.Exemplars, builder); err != nil { - return err + return errors.Wrap(err, "marshal exemplars") } } symbols, err := wr.NewSymbols() if err != nil { - return err + return errors.Wrap(err, "new symbols") } - return marshalSymbols(builder, symbols) + if err := marshalSymbols(builder, symbols); err != nil { + return errors.Wrap(err, "marshal symbols") + } + return nil } func marshalSymbols(builder *symbolsBuilder, symbols Symbols) error { From 2fb8f8ddb9005a5fab425d38b07177b8ce7b3301 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 24 Apr 2025 14:45:41 -0700 Subject: [PATCH 11/40] use prometheus fork --- go.mod | 3 +++ go.sum | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 51c0b60f1d2..cd1f8d888fe 100644 --- a/go.mod +++ b/go.mod @@ -289,6 +289,9 @@ replace ( // Required by Cortex https://github.com/cortexproject/cortex/pull/3051. github.com/bradfitz/gomemcache => github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab + // Use Prometheus fork to include some fixes + github.com/prometheus/prometheus => github.com/yuchen-db/prometheus v0.0.0-20250424212119-34c3b01ad3f9 + // Pin kuberesolver/v5 to support new grpc version. Need to upgrade kuberesolver version on weaveworks/common. 
github.com/sercand/kuberesolver/v4 => github.com/sercand/kuberesolver/v5 v5.1.1 diff --git a/go.sum b/go.sum index ef5d32d076f..688bdcd6574 100644 --- a/go.sum +++ b/go.sum @@ -2189,8 +2189,6 @@ github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0ua github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= -github.com/prometheus/prometheus v0.55.1-0.20241102120812-a6fd22b9d2c8 h1:hCxAh6+hxwy7dqUPE5ndnilMeCWrqQkJVjPDXtiYRVo= -github.com/prometheus/prometheus v0.55.1-0.20241102120812-a6fd22b9d2c8/go.mod h1:GGS7QlWKCqCbcEzWsVahYIfQwiGhcExkarHyLJTsv6I= github.com/redis/go-redis/v9 v9.7.0 h1:HhLSs+B6O021gwzl+locl0zEDnyNkxMtf/Z3NNBMa9E= github.com/redis/go-redis/v9 v9.7.0/go.mod h1:f6zhXITC7JUJIlPEiBOTXxJgPLdZcA93GewI7inzyWw= github.com/redis/rueidis v1.0.45-alpha.1 h1:69Bu1l7gVC/qDYuGGwMwGg2rjOjSyxESol/Zila62gY= @@ -2297,6 +2295,8 @@ github.com/weaveworks/promrus v1.2.0/go.mod h1:SaE82+OJ91yqjrE1rsvBWVzNZKcHYFtMU github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/xhit/go-str2duration v1.2.0/go.mod h1:3cPSlfZlUHVlneIVfePFWcJZsuwf+P1v2SRTV4cUmp4= +github.com/yuchen-db/prometheus v0.0.0-20250424212119-34c3b01ad3f9 h1:hNgEopLMPL0Ol+MOif0rqebCYAtqiBeR54lmt+GyhsA= +github.com/yuchen-db/prometheus v0.0.0-20250424212119-34c3b01ad3f9/go.mod h1:GGS7QlWKCqCbcEzWsVahYIfQwiGhcExkarHyLJTsv6I= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= From 8ff28cdb117716bc772d22697a9ef2200d69edf7 Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Fri, 25 Apr 2025 16:49:37 -0700 Subject: [PATCH 12/40] Limit lazyRespSet memory buffer size --- cmd/thanos/query.go | 6 +++ pkg/store/bucket.go | 1 + pkg/store/proxy.go | 32 +++++++++++----- pkg/store/proxy_merge.go | 79 +++++++++++++++++++++++++++++++--------- 4 files changed, 91 insertions(+), 27 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index c67d25ccb8e..4563cf9738f 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -247,6 +247,9 @@ func registerQuery(app *extkingpin.App) { rewriteAggregationLabelTo := cmd.Flag("query.aggregation-label-value-override", "The value override for __rollup__ label for aggregated metrics. If set to x, all queries on aggregated metrics will have a __rollup__=x matcher. Leave empty to disable this behavior. Default is empty.").Default("").String() + lazyRetrievalBufferSize := cmd.Flag("query.lazy-retrieval-buffer-size", "The fixed buffer size (in terms of how many store responses) for lazy retrieval strategy. This is to limit the memory usage."). 
+ Default("20").Int() + var storeRateLimits store.SeriesSelectLimits storeRateLimits.RegisterFlags(cmd) @@ -387,6 +390,7 @@ func registerQuery(app *extkingpin.App) { *tenantLabel, *enableGroupReplicaPartialStrategy, *rewriteAggregationLabelTo, + *lazyRetrievalBufferSize, ) }) } @@ -472,6 +476,7 @@ func runQuery( tenantLabel string, groupReplicaPartialResponseStrategy bool, rewriteAggregationLabelTo string, + lazyRetrievalBufferSize int, ) error { comp := component.Query if alertQueryURL == "" { @@ -559,6 +564,7 @@ func runQuery( store.WithTSDBSelector(tsdbSelector), store.WithProxyStoreDebugLogging(debugLogging), store.WithQuorumChunkDedup(queryDeduplicationFunc == dedup.AlgorithmQuorum), + store.WithLazyRetrievalBufferSize(lazyRetrievalBufferSize), } // Parse and sanitize the provided replica labels flags. diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index d9940221ffd..45f74e528f0 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -1623,6 +1623,7 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store shardMatcher, false, s.metrics.emptyPostingCount.WithLabelValues(tenant), + 100, ) } diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index 47e73a3013a..25c8c8a87fa 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -93,14 +93,15 @@ type ProxyStore struct { selectorLabels labels.Labels buffers sync.Pool - responseTimeout time.Duration - metrics *proxyStoreMetrics - retrievalStrategy RetrievalStrategy - debugLogging bool - tsdbSelector *TSDBSelector - quorumChunkDedup bool - enableDedup bool - matcherConverter *storepb.MatcherConverter + responseTimeout time.Duration + metrics *proxyStoreMetrics + retrievalStrategy RetrievalStrategy + debugLogging bool + tsdbSelector *TSDBSelector + quorumChunkDedup bool + enableDedup bool + matcherConverter *storepb.MatcherConverter + lazyRetrievalBufferSize int } type proxyStoreMetrics struct { @@ -137,6 +138,12 @@ func RegisterStoreServer(storeSrv storepb.StoreServer, logger log.Logger) func(* // BucketStoreOption are functions that configure BucketStore. type ProxyStoreOption func(s *ProxyStore) +func WithLazyRetrievalBufferSize(buferSize int) ProxyStoreOption { + return func(s *ProxyStore) { + s.lazyRetrievalBufferSize = buferSize + } +} + // WithProxyStoreDebugLogging toggles debug logging. func WithProxyStoreDebugLogging(enable bool) ProxyStoreOption { return func(s *ProxyStore) { @@ -385,11 +392,16 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. } } defer logGroupReplicaErrors() - + lazyRetrievalBufferSize := s.lazyRetrievalBufferSize + if lazyRetrievalBufferSize <= 0 { + // Use 1 as default value for lazy retrieval buffer size. + // Unit tests hit this path so that corner cases can be tested with buffer size 1. 
+ lazyRetrievalBufferSize = 1 + } for _, st := range stores { st := st - respSet, err := newAsyncRespSet(ctx, st, r, s.responseTimeout, s.retrievalStrategy, &s.buffers, r.ShardInfo, reqLogger, s.metrics.emptyStreamResponses) + respSet, err := newAsyncRespSet(ctx, st, r, s.responseTimeout, s.retrievalStrategy, &s.buffers, r.ShardInfo, reqLogger, s.metrics.emptyStreamResponses, lazyRetrievalBufferSize) if err != nil { level.Warn(s.logger).Log("msg", "Store failure", "group", st.GroupKey(), "replica", st.ReplicaKey(), "err", err) s.metrics.storeFailureCount.WithLabelValues(st.GroupKey(), st.ReplicaKey()).Inc() diff --git a/pkg/store/proxy_merge.go b/pkg/store/proxy_merge.go index 378ac789881..c0dd95c91ab 100644 --- a/pkg/store/proxy_merge.go +++ b/pkg/store/proxy_merge.go @@ -255,8 +255,20 @@ type lazyRespSet struct { frameTimeout time.Duration // Internal bookkeeping. - dataOrFinishEvent *sync.Cond - bufferedResponses []*storepb.SeriesResponse + dataOrFinishEvent *sync.Cond + // This event firing means the buffer has a slot for more data. + bufferSlotEvent *sync.Cond + fixedBufferSize int + // This a ring buffer of size fixedBufferSize. + // A ring buffer of size N can hold N - 1 elements at most in order to distinguish being empty from being full. + bufferedResponses []*storepb.SeriesResponse + // ringHead points to the first element in the ring buffer. + // ringTail points to the slot after the last element in the ring buffer. + // if ringHead == ringTail then the buffer is empty. + // if ringHead == (ringTail + 1) % fixedBufferSize then the buffer is full. + ringHead int + ringTail int + closed bool bufferedResponsesMtx *sync.Mutex lastResp *storepb.SeriesResponse @@ -266,24 +278,32 @@ type lazyRespSet struct { shardMatcher *storepb.ShardMatcher } +func (l *lazyRespSet) isEmpty() bool { + return l.ringHead == l.ringTail +} + +func (l *lazyRespSet) isFull() bool { + return (l.ringTail+1)%l.fixedBufferSize == l.ringHead +} + func (l *lazyRespSet) Empty() bool { l.bufferedResponsesMtx.Lock() defer l.bufferedResponsesMtx.Unlock() // NOTE(GiedriusS): need to wait here for at least one // response so that we could build the heap properly. 
- if l.noMoreData && len(l.bufferedResponses) == 0 { + if l.noMoreData && l.isEmpty() { return true } - for len(l.bufferedResponses) == 0 { + for l.isEmpty() { l.dataOrFinishEvent.Wait() - if l.noMoreData && len(l.bufferedResponses) == 0 { + if l.noMoreData && l.isEmpty() { break } } - return len(l.bufferedResponses) == 0 && l.noMoreData + return l.isEmpty() && l.noMoreData } // Next either blocks until more data is available or reads @@ -295,23 +315,24 @@ func (l *lazyRespSet) Next() bool { l.initialized = true - if l.noMoreData && len(l.bufferedResponses) == 0 { + if l.noMoreData && l.isEmpty() { l.lastResp = nil return false } - for len(l.bufferedResponses) == 0 { + for l.isEmpty() { l.dataOrFinishEvent.Wait() - if l.noMoreData && len(l.bufferedResponses) == 0 { + if l.noMoreData && l.isEmpty() { break } } - if len(l.bufferedResponses) > 0 { - l.lastResp = l.bufferedResponses[0] + if !l.isEmpty() { + l.lastResp = l.bufferedResponses[l.ringHead] if l.initialized { - l.bufferedResponses = l.bufferedResponses[1:] + l.ringHead = (l.ringHead + 1) % l.fixedBufferSize + l.bufferSlotEvent.Signal() } return true } @@ -338,8 +359,12 @@ func newLazyRespSet( shardMatcher *storepb.ShardMatcher, applySharding bool, emptyStreamResponses prometheus.Counter, + fixedBufferSize int, ) respSet { - bufferedResponses := []*storepb.SeriesResponse{} + // A ring buffer of size N can hold N - 1 elements at most in order to distinguish being empty from being full. + // That's why the size is increased by 1 internally. + fixedBufferSize++ + bufferedResponses := make([]*storepb.SeriesResponse, fixedBufferSize) bufferedResponsesMtx := &sync.Mutex{} dataAvailable := sync.NewCond(bufferedResponsesMtx) @@ -351,9 +376,14 @@ func newLazyRespSet( closeSeries: closeSeries, span: span, dataOrFinishEvent: dataAvailable, + bufferSlotEvent: sync.NewCond(bufferedResponsesMtx), bufferedResponsesMtx: bufferedResponsesMtx, bufferedResponses: bufferedResponses, shardMatcher: shardMatcher, + fixedBufferSize: fixedBufferSize, + ringHead: 0, + ringTail: 0, + closed: false, } respSet.storeLabels = make(map[string]struct{}) for _, ls := range storeLabelSets { @@ -406,11 +436,16 @@ func newLazyRespSet( } else { rerr = errors.Wrapf(err, "receive series from %s", st) } - l.span.SetTag("err", rerr.Error()) l.bufferedResponsesMtx.Lock() - l.bufferedResponses = append(l.bufferedResponses, storepb.NewWarnSeriesResponse(rerr)) + for l.isFull() && !l.closed { + l.bufferSlotEvent.Wait() + } + if !l.closed { + l.bufferedResponses[l.ringTail] = storepb.NewWarnSeriesResponse(rerr) + l.ringTail = (l.ringTail + 1) % l.fixedBufferSize + } l.noMoreData = true l.dataOrFinishEvent.Signal() l.bufferedResponsesMtx.Unlock() @@ -429,8 +464,14 @@ func newLazyRespSet( } l.bufferedResponsesMtx.Lock() - l.bufferedResponses = append(l.bufferedResponses, resp) - l.dataOrFinishEvent.Signal() + for l.isFull() && !l.closed { + l.bufferSlotEvent.Wait() + } + if !l.closed { + l.bufferedResponses[l.ringTail] = resp + l.ringTail = (l.ringTail + 1) % l.fixedBufferSize + l.dataOrFinishEvent.Signal() + } l.bufferedResponsesMtx.Unlock() return true } @@ -474,6 +515,7 @@ func newAsyncRespSet( shardInfo *storepb.ShardInfo, logger log.Logger, emptyStreamResponses prometheus.Counter, + fixedBufferSizeForLazyRetrieval int, ) (respSet, error) { var ( @@ -535,6 +577,7 @@ func newAsyncRespSet( shardMatcher, applySharding, emptyStreamResponses, + fixedBufferSizeForLazyRetrieval, ), nil case EagerRetrieval: span.SetTag("retrival_strategy", EagerRetrieval) @@ -560,6 +603,8 @@ func (l 
*lazyRespSet) Close() { defer l.bufferedResponsesMtx.Unlock() l.closeSeries() + l.closed = true + l.bufferSlotEvent.Signal() l.noMoreData = true l.dataOrFinishEvent.Signal() From 13c6311e2b9f3f807c553de4d8c27633ceb51b6b Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Mon, 28 Apr 2025 08:20:47 -0700 Subject: [PATCH 13/40] Addressed comments --- cmd/thanos/query.go | 8 ++++---- pkg/store/bucket.go | 13 ++++++++++++- pkg/store/proxy.go | 14 +++++++------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 4563cf9738f..43322bbb70a 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -247,7 +247,7 @@ func registerQuery(app *extkingpin.App) { rewriteAggregationLabelTo := cmd.Flag("query.aggregation-label-value-override", "The value override for __rollup__ label for aggregated metrics. If set to x, all queries on aggregated metrics will have a __rollup__=x matcher. Leave empty to disable this behavior. Default is empty.").Default("").String() - lazyRetrievalBufferSize := cmd.Flag("query.lazy-retrieval-buffer-size", "The fixed buffer size (in terms of how many store responses) for lazy retrieval strategy. This is to limit the memory usage."). + lazyRetrievalMaxBufferedResponses := cmd.Flag("query.lazy-retrieval-max-buffered-responses", "The lazy retrieval strategy can buffer up to this number of responses. This is to limit the memory usage. This flag takes effect only when the lazy retrieval strategy is enabled."). Default("20").Int() var storeRateLimits store.SeriesSelectLimits @@ -390,7 +390,7 @@ func registerQuery(app *extkingpin.App) { *tenantLabel, *enableGroupReplicaPartialStrategy, *rewriteAggregationLabelTo, - *lazyRetrievalBufferSize, + *lazyRetrievalMaxBufferedResponses, ) }) } @@ -476,7 +476,7 @@ func runQuery( tenantLabel string, groupReplicaPartialResponseStrategy bool, rewriteAggregationLabelTo string, - lazyRetrievalBufferSize int, + lazyRetrievalMaxBufferedResponses int, ) error { comp := component.Query if alertQueryURL == "" { @@ -564,7 +564,7 @@ func runQuery( store.WithTSDBSelector(tsdbSelector), store.WithProxyStoreDebugLogging(debugLogging), store.WithQuorumChunkDedup(queryDeduplicationFunc == dedup.AlgorithmQuorum), - store.WithLazyRetrievalBufferSize(lazyRetrievalBufferSize), + store.WithLazyRetrievalMaxBufferedResponsesForProxy(lazyRetrievalMaxBufferedResponses), } // Parse and sanitize the provided replica labels flags. diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index 45f74e528f0..ec05325e57c 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -422,6 +422,8 @@ type BucketStore struct { enabledLazyExpandedPostings bool sortingStrategy sortingStrategy + // This flag limits memory usage when lazy retrieval strategy, newLazyRespSet(), is used. + lazyRetrievalMaxBufferedResponses int blockEstimatedMaxSeriesFunc BlockEstimator blockEstimatedMaxChunkFunc BlockEstimator @@ -561,6 +563,14 @@ func WithDontResort(true bool) BucketStoreOption { } } +func WithLazyRetrievalMaxBufferedResponsesForBucket(n int) BucketStoreOption { + return func(s *BucketStore) { + if true { + s.lazyRetrievalMaxBufferedResponses = n + } + } +} + // WithIndexHeaderLazyDownloadStrategy specifies what block to lazy download its index header. // Only used when lazy mmap is enabled at the same time. 
func WithIndexHeaderLazyDownloadStrategy(strategy indexheader.LazyDownloadIndexHeaderFunc) BucketStoreOption { @@ -612,6 +622,7 @@ func NewBucketStore( enableChunkHashCalculation: enableChunkHashCalculation, seriesBatchSize: SeriesBatchSize, sortingStrategy: sortingStrategyStore, + lazyRetrievalMaxBufferedResponses: 10, indexHeaderLazyDownloadStrategy: indexheader.AlwaysEagerDownloadIndexHeader, requestLoggerFunc: NoopRequestLoggerFunc, } @@ -1623,7 +1634,7 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store shardMatcher, false, s.metrics.emptyPostingCount.WithLabelValues(tenant), - 100, + s.lazyRetrievalMaxBufferedResponses, ) } diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index 25c8c8a87fa..39d814c9794 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -101,7 +101,7 @@ type ProxyStore struct { quorumChunkDedup bool enableDedup bool matcherConverter *storepb.MatcherConverter - lazyRetrievalBufferSize int + lazyRetrievalMaxBufferedResponses int } type proxyStoreMetrics struct { @@ -138,9 +138,9 @@ func RegisterStoreServer(storeSrv storepb.StoreServer, logger log.Logger) func(* // BucketStoreOption are functions that configure BucketStore. type ProxyStoreOption func(s *ProxyStore) -func WithLazyRetrievalBufferSize(buferSize int) ProxyStoreOption { +func WithLazyRetrievalMaxBufferedResponsesForProxy(buferSize int) ProxyStoreOption { return func(s *ProxyStore) { - s.lazyRetrievalBufferSize = buferSize + s.lazyRetrievalMaxBufferedResponses = buferSize } } @@ -392,16 +392,16 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. } } defer logGroupReplicaErrors() - lazyRetrievalBufferSize := s.lazyRetrievalBufferSize - if lazyRetrievalBufferSize <= 0 { + lazyRetrievalMaxBufferedResponses := s.lazyRetrievalMaxBufferedResponses + if lazyRetrievalMaxBufferedResponses <= 0 { // Use 1 as default value for lazy retrieval buffer size. // Unit tests hit this path so that corner cases can be tested with buffer size 1. 
- lazyRetrievalBufferSize = 1 + lazyRetrievalMaxBufferedResponses = 1 } for _, st := range stores { st := st - respSet, err := newAsyncRespSet(ctx, st, r, s.responseTimeout, s.retrievalStrategy, &s.buffers, r.ShardInfo, reqLogger, s.metrics.emptyStreamResponses, lazyRetrievalBufferSize) + respSet, err := newAsyncRespSet(ctx, st, r, s.responseTimeout, s.retrievalStrategy, &s.buffers, r.ShardInfo, reqLogger, s.metrics.emptyStreamResponses, lazyRetrievalMaxBufferedResponses) if err != nil { level.Warn(s.logger).Log("msg", "Store failure", "group", st.GroupKey(), "replica", st.ReplicaKey(), "err", err) s.metrics.storeFailureCount.WithLabelValues(st.GroupKey(), st.ReplicaKey()).Inc() From 6d65034960790a8753cdc824537f653004f912ee Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Mon, 28 Apr 2025 10:33:33 -0700 Subject: [PATCH 14/40] Fix linter --- pkg/store/bucket.go | 6 +++++- pkg/store/proxy.go | 16 ++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index ec05325e57c..a660876a46d 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -622,7 +622,7 @@ func NewBucketStore( enableChunkHashCalculation: enableChunkHashCalculation, seriesBatchSize: SeriesBatchSize, sortingStrategy: sortingStrategyStore, - lazyRetrievalMaxBufferedResponses: 10, + lazyRetrievalMaxBufferedResponses: 1, indexHeaderLazyDownloadStrategy: indexheader.AlwaysEagerDownloadIndexHeader, requestLoggerFunc: NoopRequestLoggerFunc, } @@ -1624,6 +1624,10 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store nil, ) } else { + if s.lazyRetrievalMaxBufferedResponses < 1 { + // Unit tests hit this path. Using 1 to test corner cases. + s.lazyRetrievalMaxBufferedResponses = 1 + } resp = newLazyRespSet( span, 10*time.Minute, diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index 39d814c9794..0d7d0b94f0f 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -93,14 +93,14 @@ type ProxyStore struct { selectorLabels labels.Labels buffers sync.Pool - responseTimeout time.Duration - metrics *proxyStoreMetrics - retrievalStrategy RetrievalStrategy - debugLogging bool - tsdbSelector *TSDBSelector - quorumChunkDedup bool - enableDedup bool - matcherConverter *storepb.MatcherConverter + responseTimeout time.Duration + metrics *proxyStoreMetrics + retrievalStrategy RetrievalStrategy + debugLogging bool + tsdbSelector *TSDBSelector + quorumChunkDedup bool + enableDedup bool + matcherConverter *storepb.MatcherConverter lazyRetrievalMaxBufferedResponses int } From 2b563616a4a1f58afd09aed9282770d2f8231e53 Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Mon, 28 Apr 2025 12:41:48 -0700 Subject: [PATCH 15/40] goformat --- pkg/store/bucket.go | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index a660876a46d..1ec8555e946 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -607,24 +607,24 @@ func NewBucketStore( b := make([]byte, 0, initialBufSize) return &b }}, - chunkPool: pool.NoopPool[byte]{}, - blocks: map[ulid.ULID]*bucketBlock{}, - blockSets: map[uint64]*bucketBlockSet{}, - blockSyncConcurrency: blockSyncConcurrency, - queryGate: gate.NewNoop(), - chunksLimiterFactory: chunksLimiterFactory, - seriesLimiterFactory: seriesLimiterFactory, - bytesLimiterFactory: bytesLimiterFactory, - partitioner: partitioner, - enableCompatibilityLabel: enableCompatibilityLabel, - postingOffsetsInMemSampling: postingOffsetsInMemSampling, - 
enableSeriesResponseHints: enableSeriesResponseHints, - enableChunkHashCalculation: enableChunkHashCalculation, - seriesBatchSize: SeriesBatchSize, - sortingStrategy: sortingStrategyStore, + chunkPool: pool.NoopPool[byte]{}, + blocks: map[ulid.ULID]*bucketBlock{}, + blockSets: map[uint64]*bucketBlockSet{}, + blockSyncConcurrency: blockSyncConcurrency, + queryGate: gate.NewNoop(), + chunksLimiterFactory: chunksLimiterFactory, + seriesLimiterFactory: seriesLimiterFactory, + bytesLimiterFactory: bytesLimiterFactory, + partitioner: partitioner, + enableCompatibilityLabel: enableCompatibilityLabel, + postingOffsetsInMemSampling: postingOffsetsInMemSampling, + enableSeriesResponseHints: enableSeriesResponseHints, + enableChunkHashCalculation: enableChunkHashCalculation, + seriesBatchSize: SeriesBatchSize, + sortingStrategy: sortingStrategyStore, lazyRetrievalMaxBufferedResponses: 1, - indexHeaderLazyDownloadStrategy: indexheader.AlwaysEagerDownloadIndexHeader, - requestLoggerFunc: NoopRequestLoggerFunc, + indexHeaderLazyDownloadStrategy: indexheader.AlwaysEagerDownloadIndexHeader, + requestLoggerFunc: NoopRequestLoggerFunc, } for _, option := range options { From 7fa82649756460642361b48cf799abb5762c6453 Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Mon, 28 Apr 2025 12:56:40 -0700 Subject: [PATCH 16/40] Fix unit tests --- pkg/store/bucket.go | 9 +++++---- pkg/store/bucket_test.go | 13 +++++++------ pkg/store/proxy.go | 8 +------- pkg/store/proxy_merge.go | 9 +++++++-- pkg/store/proxy_test.go | 5 +++++ 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index 1ec8555e946..7417b496420 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -1624,9 +1624,10 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store nil, ) } else { - if s.lazyRetrievalMaxBufferedResponses < 1 { - // Unit tests hit this path. Using 1 to test corner cases. - s.lazyRetrievalMaxBufferedResponses = 1 + lazyRetrievalMaxBufferedResponses := s.lazyRetrievalMaxBufferedResponses + if lazyRetrievalMaxBufferedResponses < 1 { + // Some unit and e2e tests hit this path. + lazyRetrievalMaxBufferedResponses = 1 } resp = newLazyRespSet( span, @@ -1638,7 +1639,7 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store shardMatcher, false, s.metrics.emptyPostingCount.WithLabelValues(tenant), - s.lazyRetrievalMaxBufferedResponses, + lazyRetrievalMaxBufferedResponses, ) } diff --git a/pkg/store/bucket_test.go b/pkg/store/bucket_test.go index e8dffd093b1..8027c093de7 100644 --- a/pkg/store/bucket_test.go +++ b/pkg/store/bucket_test.go @@ -1780,12 +1780,13 @@ func TestBucketSeries_OneBlock_InMemIndexCacheSegfault(t *testing.T) { b1.meta.ULID: b1, b2.meta.ULID: b2, }, - queryGate: gate.NewNoop(), - chunksLimiterFactory: NewChunksLimiterFactory(0), - seriesLimiterFactory: NewSeriesLimiterFactory(0), - bytesLimiterFactory: NewBytesLimiterFactory(0), - seriesBatchSize: SeriesBatchSize, - requestLoggerFunc: NoopRequestLoggerFunc, + queryGate: gate.NewNoop(), + chunksLimiterFactory: NewChunksLimiterFactory(0), + seriesLimiterFactory: NewSeriesLimiterFactory(0), + bytesLimiterFactory: NewBytesLimiterFactory(0), + seriesBatchSize: SeriesBatchSize, + requestLoggerFunc: NoopRequestLoggerFunc, + lazyRetrievalMaxBufferedResponses: 1, } t.Run("invoke series for one block. 
Fill the cache on the way.", func(t *testing.T) { diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index 0d7d0b94f0f..56c77e134d6 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -392,16 +392,10 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb. } } defer logGroupReplicaErrors() - lazyRetrievalMaxBufferedResponses := s.lazyRetrievalMaxBufferedResponses - if lazyRetrievalMaxBufferedResponses <= 0 { - // Use 1 as default value for lazy retrieval buffer size. - // Unit tests hit this path so that corner cases can be tested with buffer size 1. - lazyRetrievalMaxBufferedResponses = 1 - } for _, st := range stores { st := st - respSet, err := newAsyncRespSet(ctx, st, r, s.responseTimeout, s.retrievalStrategy, &s.buffers, r.ShardInfo, reqLogger, s.metrics.emptyStreamResponses, lazyRetrievalMaxBufferedResponses) + respSet, err := newAsyncRespSet(ctx, st, r, s.responseTimeout, s.retrievalStrategy, &s.buffers, r.ShardInfo, reqLogger, s.metrics.emptyStreamResponses, s.lazyRetrievalMaxBufferedResponses) if err != nil { level.Warn(s.logger).Log("msg", "Store failure", "group", st.GroupKey(), "replica", st.ReplicaKey(), "err", err) s.metrics.storeFailureCount.WithLabelValues(st.GroupKey(), st.ReplicaKey()).Inc() diff --git a/pkg/store/proxy_merge.go b/pkg/store/proxy_merge.go index c0dd95c91ab..914a8b03a2b 100644 --- a/pkg/store/proxy_merge.go +++ b/pkg/store/proxy_merge.go @@ -515,7 +515,7 @@ func newAsyncRespSet( shardInfo *storepb.ShardInfo, logger log.Logger, emptyStreamResponses prometheus.Counter, - fixedBufferSizeForLazyRetrieval int, + lazyRetrievalMaxBufferedResponses int, ) (respSet, error) { var ( @@ -567,6 +567,11 @@ func newAsyncRespSet( switch retrievalStrategy { case LazyRetrieval: span.SetTag("retrival_strategy", LazyRetrieval) + if lazyRetrievalMaxBufferedResponses < 1 { + // Some unit and e2e tests hit this path. 
+ lazyRetrievalMaxBufferedResponses = 1 + } + return newLazyRespSet( span, frameTimeout, @@ -577,7 +582,7 @@ func newAsyncRespSet( shardMatcher, applySharding, emptyStreamResponses, - fixedBufferSizeForLazyRetrieval, + lazyRetrievalMaxBufferedResponses, ), nil case EagerRetrieval: span.SetTag("retrival_strategy", EagerRetrieval) diff --git a/pkg/store/proxy_test.go b/pkg/store/proxy_test.go index 7767a3e6427..94bfb0d0bb0 100644 --- a/pkg/store/proxy_test.go +++ b/pkg/store/proxy_test.go @@ -1587,6 +1587,10 @@ func TestProxyStore_SeriesSlowStores(t *testing.T) { expectedWarningsLen: 2, }, } { + + options := []ProxyStoreOption{ + WithLazyRetrievalMaxBufferedResponsesForProxy(1), + } if ok := t.Run(tc.title, func(t *testing.T) { for _, strategy := range []RetrievalStrategy{EagerRetrieval, LazyRetrieval} { if ok := t.Run(string(strategy), func(t *testing.T) { @@ -1596,6 +1600,7 @@ func TestProxyStore_SeriesSlowStores(t *testing.T) { component.Query, tc.selectorLabels, 4*time.Second, strategy, + options..., ) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) From eafcb9db54c53a1f85e2b1ea841e6a43468d5ebd Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Wed, 30 Apr 2025 14:11:31 -0700 Subject: [PATCH 17/40] update prometheus fork to include more upstream fixes --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index cd1f8d888fe..6652c21447c 100644 --- a/go.mod +++ b/go.mod @@ -290,7 +290,7 @@ replace ( github.com/bradfitz/gomemcache => github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab // Use Prometheus fork to include some fixes - github.com/prometheus/prometheus => github.com/yuchen-db/prometheus v0.0.0-20250424212119-34c3b01ad3f9 + github.com/prometheus/prometheus => github.com/yuchen-db/prometheus v0.0.0-20250430210933-b16c6463ce79 // Pin kuberesolver/v5 to support new grpc version. Need to upgrade kuberesolver version on weaveworks/common. 
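// Editor's note (general Go modules behaviour; the module paths in the example are
// hypothetical): a directive like the Prometheus-fork replace above, of the form
//
//	replace example.com/upstream => example.com/fork v0.0.0-20250101120000-0123456789ab
//
// redirects every use of the left-hand module to the fork at that pseudo-version,
// which names a single commit (UTC timestamp plus commit-hash prefix). go.sum must
// carry the fork's checksums as well, which is why go.mod and go.sum change together
// in this kind of bump.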
github.com/sercand/kuberesolver/v4 => github.com/sercand/kuberesolver/v5 v5.1.1 diff --git a/go.sum b/go.sum index 688bdcd6574..0d61422bdb3 100644 --- a/go.sum +++ b/go.sum @@ -2295,8 +2295,8 @@ github.com/weaveworks/promrus v1.2.0/go.mod h1:SaE82+OJ91yqjrE1rsvBWVzNZKcHYFtMU github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/xhit/go-str2duration v1.2.0/go.mod h1:3cPSlfZlUHVlneIVfePFWcJZsuwf+P1v2SRTV4cUmp4= -github.com/yuchen-db/prometheus v0.0.0-20250424212119-34c3b01ad3f9 h1:hNgEopLMPL0Ol+MOif0rqebCYAtqiBeR54lmt+GyhsA= -github.com/yuchen-db/prometheus v0.0.0-20250424212119-34c3b01ad3f9/go.mod h1:GGS7QlWKCqCbcEzWsVahYIfQwiGhcExkarHyLJTsv6I= +github.com/yuchen-db/prometheus v0.0.0-20250430210933-b16c6463ce79 h1:t7f383QN0aQKHijBMKbpOQBmrFwRqNA0mAzKa5zr74w= +github.com/yuchen-db/prometheus v0.0.0-20250430210933-b16c6463ce79/go.mod h1:GGS7QlWKCqCbcEzWsVahYIfQwiGhcExkarHyLJTsv6I= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= From 18f5c8cd52e7ea92e7814d1386a0f95e04a19333 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Wed, 30 Apr 2025 17:13:03 -0700 Subject: [PATCH 18/40] wip --- pkg/receive/hashring.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/receive/hashring.go b/pkg/receive/hashring.go index e3ff6ededbe..2b9d979cda6 100644 --- a/pkg/receive/hashring.go +++ b/pkg/receive/hashring.go @@ -142,6 +142,7 @@ type ketamaHashring struct { } func newKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) { + fmt.Println("newKetamaHashring endpoints:", endpoints) numSections := len(endpoints) * sectionsPerNode if len(endpoints) < int(replicationFactor) { From 146ae2f2a0a29900d5037b6feb8e73fcb587c0d4 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 1 May 2025 15:07:08 -0700 Subject: [PATCH 19/40] add aligned ketama hashring algorithm --- cmd/thanos/receive.go | 2 +- pkg/receive/hashring.go | 100 +++++++++- pkg/receive/hashring_test.go | 372 +++++++++++++++++++++++++++++++++++ pkg/strutil/k8s.go | 31 +++ pkg/strutil/k8s_test.go | 92 +++++++++ 5 files changed, 592 insertions(+), 5 deletions(-) create mode 100644 pkg/strutil/k8s.go create mode 100644 pkg/strutil/k8s_test.go diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index a9173b34882..0f15c86b2e1 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -1049,7 +1049,7 @@ func (rc *receiveConfig) registerFlag(cmd extkingpin.FlagClause) { hashringAlgorithmsHelptext := strings.Join([]string{string(receive.AlgorithmHashmod), string(receive.AlgorithmKetama)}, ", ") cmd.Flag("receive.hashrings-algorithm", "The algorithm used when distributing series in the hashrings. Must be one of "+hashringAlgorithmsHelptext+". Will be overwritten by the tenant-specific algorithm in the hashring config."). Default(string(receive.AlgorithmHashmod)). - EnumVar(&rc.hashringsAlgorithm, string(receive.AlgorithmHashmod), string(receive.AlgorithmKetama)) + EnumVar(&rc.hashringsAlgorithm, string(receive.AlgorithmHashmod), string(receive.AlgorithmKetama), string(receive.AlgorithmAlignedKetama)) rc.refreshInterval = extkingpin.ModelDuration(cmd.Flag("receive.hashrings-file-refresh-interval", "Refresh interval to re-read the hashring configuration file. 
(used as a fallback)"). Default("5m")) diff --git a/pkg/receive/hashring.go b/pkg/receive/hashring.go index 2b9d979cda6..b3195c43df8 100644 --- a/pkg/receive/hashring.go +++ b/pkg/receive/hashring.go @@ -13,6 +13,7 @@ import ( "sync" "github.com/cespare/xxhash" + "github.com/thanos-io/thanos/pkg/strutil" "golang.org/x/exp/slices" "github.com/go-kit/log" @@ -21,7 +22,6 @@ import ( "github.com/pkg/errors" "github.com/thanos-io/thanos/pkg/store/labelpb" - "github.com/thanos-io/thanos/pkg/store/storepb/prompb" ) @@ -29,8 +29,9 @@ import ( type HashringAlgorithm string const ( - AlgorithmHashmod HashringAlgorithm = "hashmod" - AlgorithmKetama HashringAlgorithm = "ketama" + AlgorithmHashmod HashringAlgorithm = "hashmod" + AlgorithmKetama HashringAlgorithm = "ketama" + AlgorithmAlignedKetama HashringAlgorithm = "aligned_ketama" // SectionsPerNode is the number of sections in the ring assigned to each node // in the ketama hashring. A higher number yields a better series distribution, @@ -142,7 +143,6 @@ type ketamaHashring struct { } func newKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) { - fmt.Println("newKetamaHashring endpoints:", endpoints) numSections := len(endpoints) * sectionsPerNode if len(endpoints) < int(replicationFactor) { @@ -178,6 +178,96 @@ func newKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFac }, nil } +// groupByAZ groups the endpoints by their availability zone, and sorts them by their ordinal number in statefulset. +// It finds the least common subset of enpoints in each AZ and returns them. +func groupByAZ(endpoints []Endpoint) ([][]Endpoint, error) { + if len(endpoints) == 0 { + return nil, errors.New("no endpoints found") + } + azGroups := make(map[string]map[int]Endpoint) + for _, endpoint := range endpoints { + i, err := strutil.ExtractPodOrdinal(endpoint.Address) + if err != nil { + return nil, errors.Wrapf(err, "failed to extract ordinal from address %s", endpoint.Address) + } + if _, ok := azGroups[endpoint.AZ]; !ok { + azGroups[endpoint.AZ] = make(map[int]Endpoint) + } + if _, ok := azGroups[endpoint.AZ][i]; ok { + return nil, errors.Errorf("duplicate endpoint %s in AZ %s", endpoint.Address, endpoint.AZ) + } + azGroups[endpoint.AZ][i] = endpoint + } + azSlice := make([]string, 0) + for az := range azGroups { + azSlice = append(azSlice, az) + } + sort.Strings(azSlice) + groupedEndpoints := make([][]Endpoint, len(azSlice)) + i := 0 + for { + for _, az := range azSlice { + if _, ok := azGroups[az][i]; !ok { + if i == 0 { + return nil, errors.Errorf("no endpoints found in AZ %s", az) + } + return groupedEndpoints, nil + } + } + for j, az := range azSlice { + groupedEndpoints[j] = append(groupedEndpoints[j], azGroups[az][i]) + } + i++ + } +} + +func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) { + fmt.Println("newAlignedKetamaHashring, endpoints:", endpoints) + groupedEndpoints, err := groupByAZ(endpoints) + if err != nil { + return nil, err + } + if len(groupedEndpoints) != int(replicationFactor) { + return nil, errors.Errorf("number of AZs (%d) must be equal to replication factor (%d)", len(groupedEndpoints), replicationFactor) + } + + numAZs, numEndpointsPerAZ := len(groupedEndpoints), len(groupedEndpoints[0]) + flattenEndpoints := make([]Endpoint, 0, numAZs*numEndpointsPerAZ) + for i := range numAZs { + for j := range numEndpointsPerAZ { + flattenEndpoints = append(flattenEndpoints, groupedEndpoints[i][j]) + 
} + } + fmt.Println("flattenEndpoints:", flattenEndpoints) + + hash := xxhash.New() + ringSections := make(sections, 0, numEndpointsPerAZ) + + for endpointIndex, endpoint := range groupedEndpoints[0] { + for i := 1; i <= sectionsPerNode; i++ { + _, _ = hash.Write([]byte(endpoint.Address + ":" + strconv.Itoa(i))) + n := §ion{ + az: endpoint.AZ, + endpointIndex: uint64(endpointIndex), + hash: hash.Sum64(), + replicas: make([]uint64, 0, replicationFactor), + } + for j := range numAZs { + n.replicas = append(n.replicas, uint64(numEndpointsPerAZ*j+endpointIndex)) + } + ringSections = append(ringSections, n) + hash.Reset() + } + } + sort.Sort(ringSections) + + return &ketamaHashring{ + endpoints: flattenEndpoints, + sections: ringSections, + numEndpoints: uint64(len(flattenEndpoints)), + }, nil +} + func (k *ketamaHashring) Nodes() []Endpoint { return k.endpoints } @@ -376,6 +466,8 @@ func newHashring(algorithm HashringAlgorithm, endpoints []Endpoint, replicationF return newSimpleHashring(endpoints) case AlgorithmKetama: return newKetamaHashring(endpoints, SectionsPerNode, replicationFactor) + case AlgorithmAlignedKetama: + return newAlignedKetamaHashring(endpoints, SectionsPerNode, replicationFactor) default: l := log.NewNopLogger() level.Warn(l).Log("msg", "Unrecognizable hashring algorithm. Fall back to hashmod algorithm.", diff --git a/pkg/receive/hashring_test.go b/pkg/receive/hashring_test.go index ef3b94e1941..0666155c5e0 100644 --- a/pkg/receive/hashring_test.go +++ b/pkg/receive/hashring_test.go @@ -6,6 +6,8 @@ package receive import ( "fmt" "math" + "sort" + "strconv" "strings" "testing" @@ -302,6 +304,226 @@ func TestKetamaHashringGet(t *testing.T) { } } +func TestAlignedKetamaHashringGet(t *testing.T) { + t.Parallel() + + // --- Reusable Endpoints for Aligned Tests --- + // Zone A + ep0a := Endpoint{Address: podDNS("pod", 0, "svc", "nsa"), AZ: "zone-a"} + ep1a := Endpoint{Address: podDNS("pod", 1, "svc", "nsa"), AZ: "zone-a"} + // ep2a removed - unused + + // Zone B + ep0b := Endpoint{Address: podDNS("pod", 0, "svc", "nsb"), AZ: "zone-b"} + ep1b := Endpoint{Address: podDNS("pod", 1, "svc", "nsb"), AZ: "zone-b"} + // ep2b removed - unused + + // Zone C + ep0c := Endpoint{Address: podDNS("pod", 0, "svc", "nsc"), AZ: "zone-c"} + ep1c := Endpoint{Address: podDNS("pod", 1, "svc", "nsc"), AZ: "zone-c"} + // ep2c removed - unused + + // Invalid address endpoint + invalidEp := Endpoint{Address: "invalid-address", AZ: "zone-a"} + // Duplicate ordinal endpoint + duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0, "svc", "nsa"), AZ: "zone-a"} + + // --- Time Series for Hashing --- + // Assume tsForReplicaTest hashes to a section generated by ep1a.Address. 
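// Editor's sketch (assumption: GetN itself is not part of this diff; this is the
// conventional ketama lookup implied by the hash-sorted `sections`, written as a
// hypothetical helper using the standard library "sort" and "fmt" packages; `key`
// would be the hash of tenant plus series labels):
func lookupN(h *ketamaHashring, key uint64, n uint64) (Endpoint, error) {
	if n >= h.numEndpoints {
		return Endpoint{}, fmt.Errorf("insufficient nodes; have %d, want %d", h.numEndpoints, n+1)
	}
	// First section clockwise from the key's position on the ring.
	i := sort.Search(len(h.sections), func(i int) bool { return h.sections[i].hash >= key })
	if i == len(h.sections) {
		i = 0 // wrap around the ring
	}
	// Replica indices were precomputed when the section was built; in the aligned
	// variant they point at the same pod ordinal in each AZ.
	return h.endpoints[h.sections[i].replicas[n]], nil
}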
+ tsForReplicaTest := &prompb.TimeSeries{ + Labels: []labelpb.ZLabel{{Name: "test", Value: "replica-routing"}}, + } + + // --- Test Cases --- + testCases := map[string]struct { + inputEndpoints []Endpoint + replicationFactor uint64 + sectionsPerNode int // Usually constant, but specify for clarity + + // For GetN testing + tenant string + ts *prompb.TimeSeries + n uint64 // Replica index to get + expectedEndpoint Endpoint // Expected result from GetN + + // Error expectations + expectConstructorError bool + constructorErrorContains string + expectGetNError bool + getNErrorContains string + }{ + // --- Success Cases --- + "valid 2 AZs, RF=2, get replica 0": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, // Unordered input + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, // Assumed to hash to ep1a's section + n: 0, // Get primary replica + expectedEndpoint: ep1a, // Expect the endpoint that generated the hash section + expectConstructorError: false, + expectGetNError: false, + }, + "valid 2 AZs, RF=2, get replica 1": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, // Unordered input + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, // Assumed to hash to ep1a's section + n: 1, // Get second replica + expectedEndpoint: ep1b, // Expect corresponding endpoint in the *other* AZ (zone-b) + expectConstructorError: false, + expectGetNError: false, + }, + "valid 3 AZs, RF=3, get replica 0": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, + replicationFactor: 3, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, // Assumed to hash to ep1a's section + n: 0, + expectedEndpoint: ep1a, + expectConstructorError: false, + expectGetNError: false, + }, + "valid 3 AZs, RF=3, get replica 1": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, + replicationFactor: 3, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, // Assumed to hash to ep1a's section + n: 1, + expectedEndpoint: ep1b, // Corresponding endpoint in zone-b + expectConstructorError: false, + expectGetNError: false, + }, + "valid 3 AZs, RF=3, get replica 2": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, + replicationFactor: 3, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, // Assumed to hash to ep1a's section + n: 2, + expectedEndpoint: ep1c, // Corresponding endpoint in zone-c + expectConstructorError: false, + expectGetNError: false, + }, + + // --- Constructor Error Cases --- + "error: empty input": { + inputEndpoints: []Endpoint{}, + replicationFactor: 1, + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "no endpoints found", + }, + "error: invalid address": { + inputEndpoints: []Endpoint{ep0a, invalidEp, ep0b}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "failed to extract ordinal from address invalid-address", + }, + "error: duplicate ordinal": { + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, // duplicate ep0a in zone-a + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "duplicate endpoint", + }, + "error: missing ordinal 0": { + inputEndpoints: []Endpoint{ep1a, ep1b}, // Missing ep0a, ep0b + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + 
expectConstructorError: true, + constructorErrorContains: "no endpoints found in AZ", // Error from groupByAZ + }, + "error: AZ count != RF (too few AZs)": { + inputEndpoints: []Endpoint{ep0a, ep1a}, // Only 1 AZ + replicationFactor: 2, // Expecting 2 + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "number of AZs (1) must be equal to replication factor (2)", + }, + "error: AZ count != RF (too many AZs)": { + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, ep1b, ep0c, ep1c}, // 3 AZs + replicationFactor: 2, // Expecting 2 + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "number of AZs (3) must be equal to replication factor (2)", + }, + "constructor success with unbalanced AZs (uses common subset)": { // Renamed and corrected expectation + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b}, // zone-a has 0,1; zone-b has 0. Common subset is [[ep0a], [ep0b]] + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + expectConstructorError: false, // Should succeed based on common subset [[ep0a], [ep0b]] + // No GetN test here as the setup is just for constructor validation + }, + + // --- GetN Error Cases --- + "error: GetN index out of bounds (n >= numEndpoints)": { // Corrected description based on code's check + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, // 4 endpoints total -> numEndpoints = 4 + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, + n: 4, // Requesting replica index 4. Check is n >= numEndpoints (4 >= 4) + expectConstructorError: false, + expectGetNError: true, + getNErrorContains: "insufficient nodes; have 4, want 5", // numEndpoints is 4 in this balanced case. + }, + // NOTE: A test case for n >= replicationFactor but n < numEndpoints would currently panic. + // Example: RF=2, numEndpoints=4, n=2. Check `2 >= 4` is false. Access `replicas[2]` panics. + // We are not adding a panic test case here, assuming the insufficientNodesError check is the intended boundary. + } + + for tcName, tc := range testCases { + t.Run(tcName, func(t *testing.T) { + // --- Constructor Call --- + hashRing, err := newAlignedKetamaHashring(tc.inputEndpoints, tc.sectionsPerNode, tc.replicationFactor) + + // --- Constructor Assertions --- + if tc.expectConstructorError { + require.Error(t, err, "Expected constructor error") + require.Nil(t, hashRing, "Hashring should be nil on constructor error") + if tc.constructorErrorContains != "" { + require.Contains(t, err.Error(), tc.constructorErrorContains, "Constructor error message mismatch") + } + return // Stop test if constructor failed as expected + } + + // If constructor error was not expected, assert it succeeded + require.NoError(t, err, "Expected constructor to succeed") + require.NotNil(t, hashRing, "Hashring should not be nil on successful construction") + + // --- GetN Call & Assertions (only if constructor succeeded and GetN test is specified) --- + if tc.ts == nil && !tc.expectGetNError { // Skip GetN if no timeseries provided unless we expect a GetN error + return + } + + // Ensure ts is not nil if we expect GetN to run without error or with a specific error + if tc.ts == nil && tc.expectGetNError { + // If we expect a GetN error but haven't provided a TS, use a dummy one. + // This is mainly for the "insufficient nodes" case which doesn't depend on the TS hash. 
+ tc.ts = &prompb.TimeSeries{Labels: []labelpb.ZLabel{{Name: "dummy", Value: "dummy"}}} + } + + result, getN_err := hashRing.GetN(tc.tenant, tc.ts, tc.n) + + if tc.expectGetNError { + require.Error(t, getN_err, "Expected GetN error") + if tc.getNErrorContains != "" { + require.Contains(t, getN_err.Error(), tc.getNErrorContains, "GetN error message mismatch") + } + } else { + require.NoError(t, getN_err, "Expected GetN to succeed") + // Use Equals for struct comparison + testutil.Equals(t, tc.expectedEndpoint, result, "GetN returned unexpected endpoint") + } + }) + } +} + func TestKetamaHashringBadConfigIsRejected(t *testing.T) { t.Parallel() @@ -671,6 +893,156 @@ func TestInvalidAZHashringCfg(t *testing.T) { } } +// Helper to create a valid DNS name for testing +func podDNS(name string, ordinal int, svc, ns string) string { + return name + "-" + strconv.Itoa(ordinal) + "." + svc + "." + ns + ".svc.cluster.local" +} + +func TestGroupByAZ(t *testing.T) { + // Define some reusable endpoints + ep0a := Endpoint{Address: podDNS("pod", 0, "svc", "nsa"), AZ: "zone-a"} + ep1a := Endpoint{Address: podDNS("pod", 1, "svc", "nsa"), AZ: "zone-a"} + ep2a := Endpoint{Address: podDNS("pod", 2, "svc", "nsa"), AZ: "zone-a"} + + ep0b := Endpoint{Address: podDNS("pod", 0, "svc", "nsb"), AZ: "zone-b"} + ep1b := Endpoint{Address: podDNS("pod", 1, "svc", "nsb"), AZ: "zone-b"} + + ep0c := Endpoint{Address: podDNS("pod", 0, "svc", "nsc"), AZ: "zone-c"} + ep1c := Endpoint{Address: podDNS("pod", 1, "svc", "nsc"), AZ: "zone-c"} // Added for a new test case + + // Endpoint with an invalid address format + invalidEp := Endpoint{Address: "invalid-address-format", AZ: "zone-a"} + // Endpoint that would cause a duplicate ordinal in its AZ + duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0, "svc", "nsa"), AZ: "zone-a"} + + testCases := map[string]struct { + inputEndpoints []Endpoint + expectedResult [][]Endpoint + expectError bool + errorContains string // Substring to check in the error message + }{ + "error on empty input": { // UPDATED: Now expects an error + inputEndpoints: []Endpoint{}, + expectedResult: nil, + expectError: true, + errorContains: "no endpoints found", // Specific error message + }, + "single AZ, multiple endpoints": { + inputEndpoints: []Endpoint{ep1a, ep0a, ep2a}, // Input order shouldn't matter + expectedResult: [][]Endpoint{ + {ep0a, ep1a, ep2a}, // Expect sorted by ordinal + }, + expectError: false, + }, + "multiple AZs, balanced and ordered": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, // Input order shouldn't matter + expectedResult: [][]Endpoint{ + {ep0a, ep1a}, // zone-a endpoints, sorted by ordinal + {ep0b, ep1b}, // zone-b endpoints, sorted by ordinal + }, // Outer slice sorted by AZ name ("zone-a", "zone-b") + expectError: false, + }, + "multiple AZs, different counts, stops at missing ordinal > 0": { // Name slightly adjusted for clarity + // zone-a has 0, 1, 2. zone-b has 0, 1. zone-c has 0. + // Grouping stops when ordinal 1 is missing in zone-c. Since i=1 != 0, no error. + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep2a, ep0c}, + expectedResult: [][]Endpoint{ + {ep0a}, // zone-a, ordinal 0 + {ep0b}, // zone-b, ordinal 0 + {ep0c}, // zone-c, ordinal 0 + }, // Outer slice sorted by AZ name ("zone-a", "zone-b", "zone-c") + expectError: false, + }, + "error if ordinal 0 missing in any AZ": { // UPDATED: Now expects an error + // zone-a has 1, 2. zone-b has 1. + // Grouping fails immediately because ordinal 0 is missing. 
+ inputEndpoints: []Endpoint{ep1a, ep2a, ep1b}, + expectedResult: nil, // Expect nil result on error + expectError: true, + // The exact AZ reported depends on sort order, so check common part + errorContains: "no endpoints found in AZ", + }, + "error if ordinal 0 missing in only one AZ": { // NEW Test Case + // zone-a has 0, 1. zone-b has 1. + // Grouping fails immediately because ordinal 0 is missing in zone-b. + inputEndpoints: []Endpoint{ep0a, ep1a, ep1b}, + expectedResult: nil, + expectError: true, + errorContains: "no endpoints found in AZ zone-b", // We know zone-b is missing 0 + }, + "error on invalid address": { + inputEndpoints: []Endpoint{ep0a, invalidEp, ep0b}, + expectedResult: nil, + expectError: true, + errorContains: "failed to extract ordinal from address invalid-address-format", + }, + "error on duplicate ordinal within an AZ": { + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, // ep0a and duplicateEp0a in zone-a have ordinal 0 + expectedResult: nil, + expectError: true, + errorContains: "duplicate endpoint " + duplicateEp0a.Address + " in AZ zone-a", + }, + "AZ sorting check": { + inputEndpoints: []Endpoint{ep0b, ep0c, ep0a}, // Input AZs: b, c, a + expectedResult: [][]Endpoint{ + {ep0a}, // zone-a first + {ep0b}, // zone-b second + {ep0c}, // zone-c third + }, + expectError: false, + }, + "multiple AZs, stops correctly when next ordinal missing everywhere": { // NEW Test Case + // zone-a has 0, 1. zone-b has 0, 1. zone-c has 0, 1. + // Should return groups for 0 and 1, then stop cleanly when checking for 2. + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, + expectedResult: [][]Endpoint{ + {ep0a, ep1a}, // zone-a + {ep0b, ep1b}, // zone-b + {ep0c, ep1c}, // zone-c + }, + expectError: false, + }, + } + + for tcName, tc := range testCases { + t.Run(tcName, func(t *testing.T) { + result, err := groupByAZ(tc.inputEndpoints) + + if tc.expectError { + testutil.NotOk(t, err) + if tc.errorContains != "" { + testutil.Assert(t, strings.Contains(err.Error(), tc.errorContains), "Expected error message to contain '%s', but got: %v", tc.errorContains, err) + } + // On error, the function should return nil result + testutil.Assert(t, result == nil, "Expected nil result on error, got: %v", result) + } else { + testutil.Ok(t, err) + // Compare results using Equals + testutil.Equals(t, tc.expectedResult, result) + + // Optional: Keep the AZ sorting check if desired + if err == nil && len(result) > 1 { + azOrderCorrect := sort.SliceIsSorted(result, func(i, j int) bool { + if len(result[i]) == 0 && len(result[j]) == 0 { + // This case might not happen anymore if empty inner slices aren't returned + // but keep for robustness or adjust if needed. + return true + } + if len(result[i]) == 0 { + return true + } + if len(result[j]) == 0 { + return false + } + return result[i][0].AZ < result[j][0].AZ + }) + testutil.Assert(t, azOrderCorrect, "Outer slice is not sorted by AZ") + } + } + }) + } +} + func makeSeries() []prompb.TimeSeries { numSeries := 10000 series := make([]prompb.TimeSeries, numSeries) diff --git a/pkg/strutil/k8s.go b/pkg/strutil/k8s.go new file mode 100644 index 00000000000..28144bda7f3 --- /dev/null +++ b/pkg/strutil/k8s.go @@ -0,0 +1,31 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package strutil + +import ( + "fmt" + "regexp" + "strconv" + "strings" +) + +// Pre-compile the regex for efficiency if the function is called often. +// Kept the original regex for strict matching. 
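// Editor's usage sketch (illustrative only; mirrors the test cases below, the concrete
// pod name is hypothetical):
//
//	ordinal, err := ExtractPodOrdinal("thanos-receive-3.thanos-receive-headless.monitoring.svc.cluster.local:10901")
//	if err != nil {
//		// not a StatefulSet pod DNS name
//	}
//	fmt.Println(ordinal) // prints 3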
+var podOrdinalRegex = regexp.MustCompile(`^.*\-(\d+)\..+\.svc\.cluster\.local$`)
+
+// ExtractPodOrdinal extracts the ordinal number from a Kubernetes StatefulSet pod DNS name.
+// The DNS name should be in the format <pod-name>.<service>.<namespace>.svc.cluster.local[:port],
+// where <pod-name> is typically <statefulset-name>-<ordinal>, i.e. a hyphenated name ending with -<ordinal>.
+func ExtractPodOrdinal(dnsName string) (int, error) {
+	dnsWithoutPort := strings.Split(dnsName, ":")[0]
+	matches := podOrdinalRegex.FindStringSubmatch(dnsWithoutPort)
+	if len(matches) != 2 {
+		return -1, fmt.Errorf("invalid DNS name format: %s", dnsName)
+	}
+	ordinal, err := strconv.Atoi(matches[1])
+	if err != nil {
+		return -1, fmt.Errorf("failed to parse ordinal number: %v", err)
+	}
+	return ordinal, nil
+}
diff --git a/pkg/strutil/k8s_test.go b/pkg/strutil/k8s_test.go
new file mode 100644
index 00000000000..583f205006d
--- /dev/null
+++ b/pkg/strutil/k8s_test.go
@@ -0,0 +1,92 @@
+// Copyright (c) The Thanos Authors.
+// Licensed under the Apache License 2.0.
+
+package strutil
+
+import (
+	"testing"
+
+	"github.com/efficientgo/core/testutil"
+)
+
+func TestExtractPodOrdinal(t *testing.T) {
+	testCases := map[string]struct {
+		inputDNS        string
+		expectedOrdinal int
+		expectError     bool
+	}{
+		"valid DNS without port": {
+			inputDNS:        "thanos-receive-0.thanos-receive-headless.monitoring.svc.cluster.local",
+			expectedOrdinal: 0,
+			expectError:     false,
+		},
+		"valid DNS with port": {
+			inputDNS:        "thanos-receive-10.thanos-receive-headless.monitoring.svc.cluster.local:10901",
+			expectedOrdinal: 10,
+			expectError:     false,
+		},
+		"valid DNS with multiple hyphens in name": {
+			inputDNS:        "my-complex-statefulset-name-5.headless.prod.svc.cluster.local",
+			expectedOrdinal: 5,
+			expectError:     false,
+		},
+		"invalid DNS format - wrong domain": {
+			inputDNS:        "thanos-receive-0.thanos-receive-headless.monitoring.svc.cluster.example.com",
+			expectedOrdinal: -1,
+			expectError:     true,
+		},
+		"invalid DNS format - missing service/namespace": {
+			inputDNS:        "thanos-receive-0.svc.cluster.local",
+			expectedOrdinal: -1,
+			expectError:     true,
+		},
+		"invalid DNS format - no ordinal": {
+			inputDNS:        "thanos-receive.thanos-receive-headless.monitoring.svc.cluster.local",
+			expectedOrdinal: -1,
+			expectError:     true,
+		},
+		"invalid DNS format - non-numeric ordinal": {
+			inputDNS:        "thanos-receive-abc.thanos-receive-headless.monitoring.svc.cluster.local",
+			expectedOrdinal: -1,
+			expectError:     true,
+		},
+		"invalid DNS format - ordinal not at the end of pod name part": {
+			inputDNS:        "thanos-receive-0-backup.thanos-receive-headless.monitoring.svc.cluster.local",
+			expectedOrdinal: -1, // The regex expects \d+ followed by a dot.
+ expectError: true, + }, + "empty input string": { + inputDNS: "", + expectedOrdinal: -1, + expectError: true, + }, + "just hostname": { + inputDNS: "my-hostname-1", + expectedOrdinal: -1, + expectError: true, + }, + "just hostname with port": { + inputDNS: "my-hostname-1:8080", + expectedOrdinal: -1, + expectError: true, + }, + "DNS like string but not matching pattern": { + inputDNS: "pod-1.service.namespace", + expectedOrdinal: -1, + expectError: true, + }, + } + + for tcName, tc := range testCases { + t.Run(tcName, func(t *testing.T) { + ordinal, err := ExtractPodOrdinal(tc.inputDNS) + + if tc.expectError { + testutil.NotOk(t, err) + } else { + testutil.Ok(t, err) + testutil.Equals(t, tc.expectedOrdinal, ordinal) + } + }) + } +} From c6d7c8ef992d366ec60c66ecff98cd83fe703ed9 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 1 May 2025 16:02:22 -0700 Subject: [PATCH 20/40] add unit test --- pkg/receive/hashring_test.go | 79 ++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/pkg/receive/hashring_test.go b/pkg/receive/hashring_test.go index 0666155c5e0..5b3479d2bcd 100644 --- a/pkg/receive/hashring_test.go +++ b/pkg/receive/hashring_test.go @@ -13,6 +13,7 @@ import ( "github.com/efficientgo/core/testutil" "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/strutil" "github.com/prometheus/prometheus/model/labels" @@ -508,15 +509,15 @@ func TestAlignedKetamaHashringGet(t *testing.T) { tc.ts = &prompb.TimeSeries{Labels: []labelpb.ZLabel{{Name: "dummy", Value: "dummy"}}} } - result, getN_err := hashRing.GetN(tc.tenant, tc.ts, tc.n) + result, getNErr := hashRing.GetN(tc.tenant, tc.ts, tc.n) if tc.expectGetNError { - require.Error(t, getN_err, "Expected GetN error") + require.Error(t, getNErr, "Expected GetN error") if tc.getNErrorContains != "" { - require.Contains(t, getN_err.Error(), tc.getNErrorContains, "GetN error message mismatch") + require.Contains(t, getNErr.Error(), tc.getNErrorContains, "GetN error message mismatch") } } else { - require.NoError(t, getN_err, "Expected GetN to succeed") + require.NoError(t, getNErr, "Expected GetN to succeed") // Use Equals for struct comparison testutil.Equals(t, tc.expectedEndpoint, result, "GetN returned unexpected endpoint") } @@ -524,6 +525,76 @@ func TestAlignedKetamaHashringGet(t *testing.T) { } } +func TestAlignedKetamaHashringReplicaOrdinals(t *testing.T) { + t.Parallel() + + // --- Test Setup --- + // Create a valid setup for aligned ketama + endpoints := []Endpoint{ + // AZ A + {Address: podDNS("pod", 0, "svc", "nsa"), AZ: "zone-a"}, + {Address: podDNS("pod", 1, "svc", "nsa"), AZ: "zone-a"}, + {Address: podDNS("pod", 2, "svc", "nsa"), AZ: "zone-a"}, + // AZ B + {Address: podDNS("pod", 0, "svc", "nsb"), AZ: "zone-b"}, + {Address: podDNS("pod", 1, "svc", "nsb"), AZ: "zone-b"}, + {Address: podDNS("pod", 2, "svc", "nsb"), AZ: "zone-b"}, + // AZ C + {Address: podDNS("pod", 0, "svc", "nsc"), AZ: "zone-c"}, + {Address: podDNS("pod", 1, "svc", "nsc"), AZ: "zone-c"}, + {Address: podDNS("pod", 2, "svc", "nsc"), AZ: "zone-c"}, + } + replicationFactor := uint64(3) // Must match number of AZs + sectionsPerNode := 10 // Use a smaller number for faster testing + + // --- Create the Hashring --- + hashRing, err := newAlignedKetamaHashring(endpoints, sectionsPerNode, replicationFactor) + require.NoError(t, err, "Aligned hashring constructor failed") + require.NotNil(t, hashRing, "Hashring should not be nil") + + // --- Verification Logic --- + require.NotEmpty(t, hashRing.sections, 
"Hashring should contain sections") + + for i, s := range hashRing.sections { + // Skip if a section somehow has no replicas (shouldn't happen with RF > 0) + if len(s.replicas) == 0 { + continue + } + + expectedOrdinal := -1 // Initialize with sentinel value + + for replicaNum, replicaIndex := range s.replicas { + // Defensive check: Ensure replicaIndex is within bounds of the stored endpoints list + require.Less(t, int(replicaIndex), len(hashRing.endpoints), + "Section %d (hash %d), Replica %d: index %d out of bounds for endpoints list (len %d)", + i, s.hash, replicaNum, replicaIndex, len(hashRing.endpoints)) + + // Get the endpoint corresponding to the replica index + endpoint := hashRing.endpoints[replicaIndex] + + // Extract the ordinal from the endpoint address + ordinal, err := strutil.ExtractPodOrdinal(endpoint.Address) + require.NoError(t, err, + "Section %d (hash %d), Replica %d: failed to extract ordinal from address %s", + i, s.hash, replicaNum, endpoint.Address) + + if expectedOrdinal == -1 { + // This is the first replica for this section, store its ordinal + expectedOrdinal = ordinal + } else { + // Compare subsequent replica ordinals to the first one + require.Equal(t, expectedOrdinal, ordinal, + "Section %d (hash %d), Replica %d (%s): Mismatched ordinal. Expected %d, got %d. Replicas in section: %v", + i, s.hash, replicaNum, endpoint.Address, expectedOrdinal, ordinal, s.replicas) + } + } + // Optional: Check that we actually found an ordinal if replicas were present + if len(s.replicas) > 0 { + require.NotEqual(t, -1, expectedOrdinal, "Section %d (hash %d): Failed to determine expected ordinal for replicas %v", i, s.hash, s.replicas) + } + } +} + func TestKetamaHashringBadConfigIsRejected(t *testing.T) { t.Parallel() From ec54bdeb2c7c807d3baac7ab9a0f57976f5c3547 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 1 May 2025 16:13:30 -0700 Subject: [PATCH 21/40] add sanity check, remove print --- pkg/receive/hashring.go | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pkg/receive/hashring.go b/pkg/receive/hashring.go index b3195c43df8..e790c47c0d2 100644 --- a/pkg/receive/hashring.go +++ b/pkg/receive/hashring.go @@ -222,7 +222,6 @@ func groupByAZ(endpoints []Endpoint) ([][]Endpoint, error) { } func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) { - fmt.Println("newAlignedKetamaHashring, endpoints:", endpoints) groupedEndpoints, err := groupByAZ(endpoints) if err != nil { return nil, err @@ -230,7 +229,6 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica if len(groupedEndpoints) != int(replicationFactor) { return nil, errors.Errorf("number of AZs (%d) must be equal to replication factor (%d)", len(groupedEndpoints), replicationFactor) } - numAZs, numEndpointsPerAZ := len(groupedEndpoints), len(groupedEndpoints[0]) flattenEndpoints := make([]Endpoint, 0, numAZs*numEndpointsPerAZ) for i := range numAZs { @@ -238,11 +236,9 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica flattenEndpoints = append(flattenEndpoints, groupedEndpoints[i][j]) } } - fmt.Println("flattenEndpoints:", flattenEndpoints) hash := xxhash.New() ringSections := make(sections, 0, numEndpointsPerAZ) - for endpointIndex, endpoint := range groupedEndpoints[0] { for i := 1; i <= sectionsPerNode; i++ { _, _ = hash.Write([]byte(endpoint.Address + ":" + strconv.Itoa(i))) @@ -252,15 +248,25 @@ func newAlignedKetamaHashring(endpoints 
[]Endpoint, sectionsPerNode int, replica hash: hash.Sum64(), replicas: make([]uint64, 0, replicationFactor), } + o := -1 for j := range numAZs { - n.replicas = append(n.replicas, uint64(numEndpointsPerAZ*j+endpointIndex)) + replicaIdx := numEndpointsPerAZ*j + endpointIndex + n.replicas = append(n.replicas, uint64(replicaIdx)) + order, err := strutil.ExtractPodOrdinal(flattenEndpoints[replicaIdx].Address) + if err != nil { + return nil, err + } + if j == o { + o = order + } else if order != o { + return nil, errors.Errorf("replicas %d and %d have different ordinal numbers in AZ %s", order, o, flattenEndpoints[replicaIdx].AZ) + } } ringSections = append(ringSections, n) hash.Reset() } } sort.Sort(ringSections) - return &ketamaHashring{ endpoints: flattenEndpoints, sections: ringSections, From 7e36958e35fa871f7cf6bb99b9570ff31d73524f Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 1 May 2025 17:07:24 -0700 Subject: [PATCH 22/40] add panic check --- pkg/receive/handler.go | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go index 8a140fbfd69..4bcd7aa653f 100644 --- a/pkg/receive/handler.go +++ b/pkg/receive/handler.go @@ -34,6 +34,7 @@ import ( "github.com/prometheus/prometheus/model/relabel" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb" + "github.com/thanos-io/thanos/pkg/strutil" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" "golang.org/x/exp/slices" @@ -863,6 +864,19 @@ func (h *Handler) fanoutForward(ctx context.Context, params remoteWriteParams) ( } } +func allSame(slice []int) bool { + if len(slice) <= 1 { + return true // Empty or single-element slice is considered all same + } + first := slice[0] + for _, v := range slice { + if v != first { + return false + } + } + return true +} + // distributeTimeseriesToReplicas distributes the given timeseries from the tenant to different endpoints in a manner // that achieves the replication factor indicated by replicas. // The first return value are the series that should be written to the local node. 
The second return value are the @@ -895,11 +909,17 @@ func (h *Handler) distributeTimeseriesToReplicas( } } + endpointOrders := make([]int, 0, len(replicas)) for _, rn := range replicas { endpoint, err := h.hashring.GetN(tenant, &ts, rn) if err != nil { return nil, nil, err } + o, err := strutil.ExtractPodOrdinal(endpoint.Address) + if err != nil { + panic(err) + } + endpointOrders = append(endpointOrders, o) endpointReplica := endpointReplica{endpoint: endpoint, replica: rn} var writeDestination = remoteWrites if endpoint.HasAddress(h.options.Endpoint) { @@ -921,6 +941,9 @@ func (h *Handler) distributeTimeseriesToReplicas( writeDestination[endpointReplica][tenant] = tenantSeries } + if !allSame(endpointOrders) { + panic(fmt.Sprintf("endpoint orders are not the same, endpoint orders: %v", endpointOrders)) + } } if h.receiverMode == RouterOnly && len(localWrites) > 0 { panic("router only mode should not have any local writes") From 78789557a36a727359df3435d3f41f7a22481d60 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 1 May 2025 17:20:05 -0700 Subject: [PATCH 23/40] fix check --- pkg/receive/hashring.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/receive/hashring.go b/pkg/receive/hashring.go index e790c47c0d2..ad42f03a081 100644 --- a/pkg/receive/hashring.go +++ b/pkg/receive/hashring.go @@ -256,7 +256,7 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica if err != nil { return nil, err } - if j == o { + if j == 0 { o = order } else if order != o { return nil, errors.Errorf("replicas %d and %d have different ordinal numbers in AZ %s", order, o, flattenEndpoints[replicaIdx].AZ) From bbcb3260212c8f9a9916559e9951f5fb88f9b00e Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 1 May 2025 18:26:04 -0700 Subject: [PATCH 24/40] Revert "add panic check" This reverts commit 7e36958e35fa871f7cf6bb99b9570ff31d73524f. --- pkg/receive/handler.go | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go index 4bcd7aa653f..8a140fbfd69 100644 --- a/pkg/receive/handler.go +++ b/pkg/receive/handler.go @@ -34,7 +34,6 @@ import ( "github.com/prometheus/prometheus/model/relabel" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb" - "github.com/thanos-io/thanos/pkg/strutil" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" "golang.org/x/exp/slices" @@ -864,19 +863,6 @@ func (h *Handler) fanoutForward(ctx context.Context, params remoteWriteParams) ( } } -func allSame(slice []int) bool { - if len(slice) <= 1 { - return true // Empty or single-element slice is considered all same - } - first := slice[0] - for _, v := range slice { - if v != first { - return false - } - } - return true -} - // distributeTimeseriesToReplicas distributes the given timeseries from the tenant to different endpoints in a manner // that achieves the replication factor indicated by replicas. // The first return value are the series that should be written to the local node. 
The second return value are the @@ -909,17 +895,11 @@ func (h *Handler) distributeTimeseriesToReplicas( } } - endpointOrders := make([]int, 0, len(replicas)) for _, rn := range replicas { endpoint, err := h.hashring.GetN(tenant, &ts, rn) if err != nil { return nil, nil, err } - o, err := strutil.ExtractPodOrdinal(endpoint.Address) - if err != nil { - panic(err) - } - endpointOrders = append(endpointOrders, o) endpointReplica := endpointReplica{endpoint: endpoint, replica: rn} var writeDestination = remoteWrites if endpoint.HasAddress(h.options.Endpoint) { @@ -941,9 +921,6 @@ func (h *Handler) distributeTimeseriesToReplicas( writeDestination[endpointReplica][tenant] = tenantSeries } - if !allSame(endpointOrders) { - panic(fmt.Sprintf("endpoint orders are not the same, endpoint orders: %v", endpointOrders)) - } } if h.receiverMode == RouterOnly && len(localWrites) > 0 { panic("router only mode should not have any local writes") From b32136b6f8c4527f347bb5fae40b0e5d5c132ac5 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 1 May 2025 18:40:11 -0700 Subject: [PATCH 25/40] polish code --- pkg/receive/hashring.go | 213 +++++++++++++++++++++++++---------- pkg/receive/hashring_test.go | 8 +- 2 files changed, 159 insertions(+), 62 deletions(-) diff --git a/pkg/receive/hashring.go b/pkg/receive/hashring.go index ad42f03a081..7adbc497260 100644 --- a/pkg/receive/hashring.go +++ b/pkg/receive/hashring.go @@ -178,99 +178,196 @@ func newKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFac }, nil } -// groupByAZ groups the endpoints by their availability zone, and sorts them by their ordinal number in statefulset. -// It finds the least common subset of enpoints in each AZ and returns them. +// groupByAZ groups endpoints by Availability Zone and sorts them by their inferred ordinal. +// It returns a 2D slice where each inner slice represents an AZ (sorted alphabetically) +// and contains endpoints sorted by ordinal. All inner slices are truncated to the +// length of the largest common sequence of ordinals starting from 0 across all AZs. func groupByAZ(endpoints []Endpoint) ([][]Endpoint, error) { if len(endpoints) == 0 { - return nil, errors.New("no endpoints found") + return nil, errors.New("no endpoints provided") } - azGroups := make(map[string]map[int]Endpoint) - for _, endpoint := range endpoints { - i, err := strutil.ExtractPodOrdinal(endpoint.Address) + + // Group endpoints by AZ and then by ordinal. + azEndpoints := make(map[string]map[int]Endpoint) + for _, ep := range endpoints { + ordinal, err := strutil.ExtractPodOrdinal(ep.Address) if err != nil { - return nil, errors.Wrapf(err, "failed to extract ordinal from address %s", endpoint.Address) + return nil, errors.Wrapf(err, "failed to extract ordinal from address %s", ep.Address) } - if _, ok := azGroups[endpoint.AZ]; !ok { - azGroups[endpoint.AZ] = make(map[int]Endpoint) + + if _, ok := azEndpoints[ep.AZ]; !ok { + azEndpoints[ep.AZ] = make(map[int]Endpoint) } - if _, ok := azGroups[endpoint.AZ][i]; ok { - return nil, errors.Errorf("duplicate endpoint %s in AZ %s", endpoint.Address, endpoint.AZ) + + if _, exists := azEndpoints[ep.AZ][ordinal]; exists { + return nil, fmt.Errorf("duplicate endpoint ordinal %d for address %s in AZ %s", ordinal, ep.Address, ep.AZ) } - azGroups[endpoint.AZ][i] = endpoint + azEndpoints[ep.AZ][ordinal] = ep } - azSlice := make([]string, 0) - for az := range azGroups { - azSlice = append(azSlice, az) + + // Get sorted list of AZ names. 
+ sortedAZs := make([]string, 0, len(azEndpoints)) + for az := range azEndpoints { + sortedAZs = append(sortedAZs, az) } - sort.Strings(azSlice) - groupedEndpoints := make([][]Endpoint, len(azSlice)) - i := 0 - for { - for _, az := range azSlice { - if _, ok := azGroups[az][i]; !ok { + sort.Strings(sortedAZs) + + // Determine the maximum common ordinal across all AZs. + maxCommonOrdinal := -1 + for i := 0; ; i++ { + presentInAllAZs := true + for _, az := range sortedAZs { + if _, ok := azEndpoints[az][i]; !ok { + presentInAllAZs = false + // If even ordinal 0 is missing in any AZ, it's an invalid configuration for balancing. if i == 0 { - return nil, errors.Errorf("no endpoints found in AZ %s", az) + return nil, fmt.Errorf("AZ %q is missing endpoint with ordinal 0", az) } - return groupedEndpoints, nil + break // Stop checking this ordinal once one AZ is missing it. } } - for j, az := range azSlice { - groupedEndpoints[j] = append(groupedEndpoints[j], azGroups[az][i]) + + if !presentInAllAZs { + maxCommonOrdinal = i - 1 + break + } + } + + // If maxCommonOrdinal is still -1, it means not even ordinal 0 was common. + if maxCommonOrdinal < 0 { + // This case should be caught by the i==0 check inside the loop, + // but added for robustness in case the logic changes. + return nil, errors.New("no common endpoints with ordinal 0 found across all AZs") + } + + // Build the final result, truncated to the maxCommonOrdinal. + numAZs := len(sortedAZs) + result := make([][]Endpoint, numAZs) + for i, az := range sortedAZs { + // Pre-allocate slice capacity for efficiency. + result[i] = make([]Endpoint, 0, maxCommonOrdinal+1) + for j := 0; j <= maxCommonOrdinal; j++ { + // We know the endpoint exists due to the previous check. + result[i] = append(result[i], azEndpoints[az][j]) } - i++ } + + return result, nil } +// newAlignedKetamaHashring creates a Ketama hash ring where replicas are strictly aligned across Availability Zones. +// +// It expects endpoints to be named following a pattern allowing ordinal extraction (e.g., pod-0, pod-1). +// It first groups endpoints by AZ using groupByAZ, which ensures: +// 1. AZs are sorted alphabetically. +// 2. Endpoints within each AZ group are sorted by ordinal. +// 3. All AZ groups are truncated to the same length (largest common sequence of ordinals starting from 0). +// +// The function requires the number of distinct AZs found to be exactly equal to the replicationFactor. +// Each section on the hash ring corresponds to a primary endpoint (taken from the first AZ) and its +// aligned replicas in other AZs (endpoints with the same ordinal). The hash for a section is calculated +// based *only* on the primary endpoint's address. +// +// Parameters: +// - endpoints: A slice of all available endpoints. +// - sectionsPerNode: The number of sections (points) to add to the ring for each primary endpoint and its replicas. +// - replicationFactor: The desired number of replicas for each piece of data; must match the number of AZs. +// +// Returns: +// - A pointer to the initialized ketamaHashring. +// - An error if constraints are not met (e.g., AZ count != replicationFactor, missing ordinals, non-aligned ordinals). 
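// Editor's worked example (illustrative, not part of the patch):
//
//	input:      zone-a: pod-0, pod-1, pod-2    zone-b: pod-0, pod-1    zone-c: pod-0, pod-1
//	groupByAZ:  [[a/pod-0, a/pod-1], [b/pod-0, b/pod-1], [c/pod-0, c/pod-1]]
//	            (truncated to the common ordinals 0..1; zone-a's pod-2 is dropped)
//
// With replicationFactor = 3, each ring section is keyed by a zone-a endpoint and its
// replicas are the endpoints with the same ordinal in zone-b and zone-c, so a series
// hashed to pod-N is written to pod-N in all three AZs.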
func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) { + if replicationFactor == 0 { + return nil, errors.New("replication factor cannot be zero") + } + if sectionsPerNode <= 0 { + return nil, errors.New("sections per node must be positive") + } + + // Group by AZ, sort AZs, sort endpoints by ordinal within AZs, and ensure common length. groupedEndpoints, err := groupByAZ(endpoints) if err != nil { - return nil, err + return nil, errors.Wrap(err, "failed to group endpoints by AZ") } - if len(groupedEndpoints) != int(replicationFactor) { - return nil, errors.Errorf("number of AZs (%d) must be equal to replication factor (%d)", len(groupedEndpoints), replicationFactor) + + numAZs := len(groupedEndpoints) + if numAZs == 0 { + // Should be caught by groupByAZ, but double-check. + return nil, errors.New("no endpoint groups found after grouping by AZ") } - numAZs, numEndpointsPerAZ := len(groupedEndpoints), len(groupedEndpoints[0]) - flattenEndpoints := make([]Endpoint, 0, numAZs*numEndpointsPerAZ) - for i := range numAZs { - for j := range numEndpointsPerAZ { - flattenEndpoints = append(flattenEndpoints, groupedEndpoints[i][j]) - } + if uint64(numAZs) != replicationFactor { + return nil, fmt.Errorf("number of AZs (%d) must equal replication factor (%d)", numAZs, replicationFactor) } - hash := xxhash.New() - ringSections := make(sections, 0, numEndpointsPerAZ) - for endpointIndex, endpoint := range groupedEndpoints[0] { - for i := 1; i <= sectionsPerNode; i++ { - _, _ = hash.Write([]byte(endpoint.Address + ":" + strconv.Itoa(i))) - n := §ion{ - az: endpoint.AZ, - endpointIndex: uint64(endpointIndex), - hash: hash.Sum64(), + // groupedEndpoints[0] is safe because groupByAZ guarantees at least ordinal 0 exists if no error. + numEndpointsPerAZ := len(groupedEndpoints[0]) + if numEndpointsPerAZ == 0 { + // Should be caught by groupByAZ, but double-check. + return nil, errors.New("AZ groups are empty after grouping") + } + + // Create a flat list of endpoints, ordered AZ by AZ. This order is important for replica index calculation. + totalEndpoints := numAZs * numEndpointsPerAZ + flatEndpoints := make([]Endpoint, 0, totalEndpoints) + for azIndex := 0; azIndex < numAZs; azIndex++ { + flatEndpoints = append(flatEndpoints, groupedEndpoints[azIndex]...) + } + + hasher := xxhash.New() + ringSections := make(sections, 0, numEndpointsPerAZ*sectionsPerNode) // Correct capacity. + + // Iterate through primary endpoints (those in the first AZ) to define sections. + for primaryOrdinalIndex := 0; primaryOrdinalIndex < numEndpointsPerAZ; primaryOrdinalIndex++ { + primaryEndpoint := groupedEndpoints[0][primaryOrdinalIndex] + primaryOrdinal, err := strutil.ExtractPodOrdinal(primaryEndpoint.Address) // Get ordinal once for comparison. + if err != nil { + // Should not happen if groupByAZ worked, but check defensively. + return nil, errors.Wrapf(err, "failed to extract ordinal from primary endpoint %s", primaryEndpoint.Address) + } + + // Create multiple sections per primary node for better distribution. + for sectionIndex := 1; sectionIndex <= sectionsPerNode; sectionIndex++ { + hasher.Reset() + // Hash is based *only* on the primary endpoint address and section index. + _, _ = hasher.Write([]byte(primaryEndpoint.Address + ":" + strconv.Itoa(sectionIndex))) + sectionHash := hasher.Sum64() + + sec := §ion{ + hash: sectionHash, + az: primaryEndpoint.AZ, // AZ of the primary. + endpointIndex: uint64(primaryOrdinalIndex), // Index within the AZ. 
replicas: make([]uint64, 0, replicationFactor), } - o := -1 - for j := range numAZs { - replicaIdx := numEndpointsPerAZ*j + endpointIndex - n.replicas = append(n.replicas, uint64(replicaIdx)) - order, err := strutil.ExtractPodOrdinal(flattenEndpoints[replicaIdx].Address) + + // Find indices of all replicas (including primary) in the flat list and verify alignment. + for azIndex := 0; azIndex < numAZs; azIndex++ { + // Calculate index in the flatEndpoints slice. + replicaFlatIndex := azIndex*numEndpointsPerAZ + primaryOrdinalIndex + replicaEndpoint := flatEndpoints[replicaFlatIndex] + + // Verify that the replica in this AZ has the same ordinal as the primary. + replicaOrdinal, err := strutil.ExtractPodOrdinal(replicaEndpoint.Address) if err != nil { - return nil, err + return nil, errors.Wrapf(err, "failed to extract ordinal from replica endpoint %s in AZ %s", replicaEndpoint.Address, replicaEndpoint.AZ) } - if j == 0 { - o = order - } else if order != o { - return nil, errors.Errorf("replicas %d and %d have different ordinal numbers in AZ %s", order, o, flattenEndpoints[replicaIdx].AZ) + if replicaOrdinal != primaryOrdinal { + return nil, fmt.Errorf("ordinal mismatch for primary endpoint %s (ordinal %d): replica %s in AZ %s has ordinal %d", + primaryEndpoint.Address, primaryOrdinal, replicaEndpoint.Address, replicaEndpoint.AZ, replicaOrdinal) } + + sec.replicas = append(sec.replicas, uint64(replicaFlatIndex)) } - ringSections = append(ringSections, n) - hash.Reset() + ringSections = append(ringSections, sec) } } + + // Sort sections by hash value for ring lookup. sort.Sort(ringSections) + return &ketamaHashring{ - endpoints: flattenEndpoints, + endpoints: flatEndpoints, sections: ringSections, - numEndpoints: uint64(len(flattenEndpoints)), + numEndpoints: uint64(totalEndpoints), }, nil } diff --git a/pkg/receive/hashring_test.go b/pkg/receive/hashring_test.go index 5b3479d2bcd..daffc004282 100644 --- a/pkg/receive/hashring_test.go +++ b/pkg/receive/hashring_test.go @@ -416,7 +416,7 @@ func TestAlignedKetamaHashringGet(t *testing.T) { replicationFactor: 1, sectionsPerNode: SectionsPerNode, expectConstructorError: true, - constructorErrorContains: "no endpoints found", + constructorErrorContains: "no endpoints provided", }, "error: invalid address": { inputEndpoints: []Endpoint{ep0a, invalidEp, ep0b}, @@ -437,21 +437,21 @@ func TestAlignedKetamaHashringGet(t *testing.T) { replicationFactor: 2, sectionsPerNode: SectionsPerNode, expectConstructorError: true, - constructorErrorContains: "no endpoints found in AZ", // Error from groupByAZ + constructorErrorContains: "failed to group endpoints by AZ: AZ \"zone-a\" is missing endpoint with ordinal 0", // Error from groupByAZ }, "error: AZ count != RF (too few AZs)": { inputEndpoints: []Endpoint{ep0a, ep1a}, // Only 1 AZ replicationFactor: 2, // Expecting 2 sectionsPerNode: SectionsPerNode, expectConstructorError: true, - constructorErrorContains: "number of AZs (1) must be equal to replication factor (2)", + constructorErrorContains: "number of AZs (1) must equal replication factor (2)", }, "error: AZ count != RF (too many AZs)": { inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, ep1b, ep0c, ep1c}, // 3 AZs replicationFactor: 2, // Expecting 2 sectionsPerNode: SectionsPerNode, expectConstructorError: true, - constructorErrorContains: "number of AZs (3) must be equal to replication factor (2)", + constructorErrorContains: "number of AZs (3) must equal replication factor (2)", }, "constructor success with unbalanced AZs (uses common subset)": { // 
Renamed and corrected expectation inputEndpoints: []Endpoint{ep0a, ep1a, ep0b}, // zone-a has 0,1; zone-b has 0. Common subset is [[ep0a], [ep0b]] From f2da76bee1e07d92af108ccaf464c710a0064098 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 1 May 2025 19:03:07 -0700 Subject: [PATCH 26/40] fix lint --- pkg/receive/hashring_test.go | 270 +++++++++++++---------------------- 1 file changed, 96 insertions(+), 174 deletions(-) diff --git a/pkg/receive/hashring_test.go b/pkg/receive/hashring_test.go index daffc004282..1a4f9ed2eb4 100644 --- a/pkg/receive/hashring_test.go +++ b/pkg/receive/hashring_test.go @@ -308,71 +308,53 @@ func TestKetamaHashringGet(t *testing.T) { func TestAlignedKetamaHashringGet(t *testing.T) { t.Parallel() - // --- Reusable Endpoints for Aligned Tests --- - // Zone A - ep0a := Endpoint{Address: podDNS("pod", 0, "svc", "nsa"), AZ: "zone-a"} - ep1a := Endpoint{Address: podDNS("pod", 1, "svc", "nsa"), AZ: "zone-a"} - // ep2a removed - unused - - // Zone B - ep0b := Endpoint{Address: podDNS("pod", 0, "svc", "nsb"), AZ: "zone-b"} - ep1b := Endpoint{Address: podDNS("pod", 1, "svc", "nsb"), AZ: "zone-b"} - // ep2b removed - unused - - // Zone C - ep0c := Endpoint{Address: podDNS("pod", 0, "svc", "nsc"), AZ: "zone-c"} - ep1c := Endpoint{Address: podDNS("pod", 1, "svc", "nsc"), AZ: "zone-c"} - // ep2c removed - unused - - // Invalid address endpoint + ep0a := Endpoint{Address: podDNS("pod", 0), AZ: "zone-a"} + ep1a := Endpoint{Address: podDNS("pod", 1), AZ: "zone-a"} + ep0b := Endpoint{Address: podDNS("pod", 0), AZ: "zone-b"} + ep1b := Endpoint{Address: podDNS("pod", 1), AZ: "zone-b"} + ep0c := Endpoint{Address: podDNS("pod", 0), AZ: "zone-c"} + ep1c := Endpoint{Address: podDNS("pod", 1), AZ: "zone-c"} invalidEp := Endpoint{Address: "invalid-address", AZ: "zone-a"} - // Duplicate ordinal endpoint - duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0, "svc", "nsa"), AZ: "zone-a"} + duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0), AZ: "zone-a"} - // --- Time Series for Hashing --- - // Assume tsForReplicaTest hashes to a section generated by ep1a.Address. 
tsForReplicaTest := &prompb.TimeSeries{ Labels: []labelpb.ZLabel{{Name: "test", Value: "replica-routing"}}, } - // --- Test Cases --- testCases := map[string]struct { inputEndpoints []Endpoint replicationFactor uint64 - sectionsPerNode int // Usually constant, but specify for clarity + sectionsPerNode int - // For GetN testing tenant string ts *prompb.TimeSeries - n uint64 // Replica index to get - expectedEndpoint Endpoint // Expected result from GetN + n uint64 + expectedEndpoint Endpoint - // Error expectations expectConstructorError bool constructorErrorContains string expectGetNError bool getNErrorContains string }{ - // --- Success Cases --- "valid 2 AZs, RF=2, get replica 0": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, // Unordered input + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, replicationFactor: 2, sectionsPerNode: SectionsPerNode, tenant: "tenant1", - ts: tsForReplicaTest, // Assumed to hash to ep1a's section - n: 0, // Get primary replica - expectedEndpoint: ep1a, // Expect the endpoint that generated the hash section + ts: tsForReplicaTest, + n: 0, + expectedEndpoint: ep0a, expectConstructorError: false, expectGetNError: false, }, "valid 2 AZs, RF=2, get replica 1": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, // Unordered input + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, replicationFactor: 2, sectionsPerNode: SectionsPerNode, tenant: "tenant1", - ts: tsForReplicaTest, // Assumed to hash to ep1a's section - n: 1, // Get second replica - expectedEndpoint: ep1b, // Expect corresponding endpoint in the *other* AZ (zone-b) + ts: tsForReplicaTest, + n: 1, + expectedEndpoint: ep0b, expectConstructorError: false, expectGetNError: false, }, @@ -381,9 +363,9 @@ func TestAlignedKetamaHashringGet(t *testing.T) { replicationFactor: 3, sectionsPerNode: SectionsPerNode, tenant: "tenant1", - ts: tsForReplicaTest, // Assumed to hash to ep1a's section + ts: tsForReplicaTest, n: 0, - expectedEndpoint: ep1a, + expectedEndpoint: ep0a, expectConstructorError: false, expectGetNError: false, }, @@ -392,9 +374,9 @@ func TestAlignedKetamaHashringGet(t *testing.T) { replicationFactor: 3, sectionsPerNode: SectionsPerNode, tenant: "tenant1", - ts: tsForReplicaTest, // Assumed to hash to ep1a's section + ts: tsForReplicaTest, n: 1, - expectedEndpoint: ep1b, // Corresponding endpoint in zone-b + expectedEndpoint: ep0b, expectConstructorError: false, expectGetNError: false, }, @@ -403,14 +385,12 @@ func TestAlignedKetamaHashringGet(t *testing.T) { replicationFactor: 3, sectionsPerNode: SectionsPerNode, tenant: "tenant1", - ts: tsForReplicaTest, // Assumed to hash to ep1a's section + ts: tsForReplicaTest, n: 2, - expectedEndpoint: ep1c, // Corresponding endpoint in zone-c + expectedEndpoint: ep0c, expectConstructorError: false, expectGetNError: false, }, - - // --- Constructor Error Cases --- "error: empty input": { inputEndpoints: []Endpoint{}, replicationFactor: 1, @@ -426,91 +406,76 @@ func TestAlignedKetamaHashringGet(t *testing.T) { constructorErrorContains: "failed to extract ordinal from address invalid-address", }, "error: duplicate ordinal": { - inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, // duplicate ep0a in zone-a + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, replicationFactor: 2, sectionsPerNode: SectionsPerNode, expectConstructorError: true, constructorErrorContains: "duplicate endpoint", }, "error: missing ordinal 0": { - inputEndpoints: []Endpoint{ep1a, ep1b}, // Missing ep0a, ep0b + inputEndpoints: []Endpoint{ep1a, 
ep1b}, replicationFactor: 2, sectionsPerNode: SectionsPerNode, expectConstructorError: true, - constructorErrorContains: "failed to group endpoints by AZ: AZ \"zone-a\" is missing endpoint with ordinal 0", // Error from groupByAZ + constructorErrorContains: "failed to group endpoints by AZ: AZ \"zone-a\" is missing endpoint with ordinal 0", }, "error: AZ count != RF (too few AZs)": { - inputEndpoints: []Endpoint{ep0a, ep1a}, // Only 1 AZ - replicationFactor: 2, // Expecting 2 + inputEndpoints: []Endpoint{ep0a, ep1a}, + replicationFactor: 2, sectionsPerNode: SectionsPerNode, expectConstructorError: true, constructorErrorContains: "number of AZs (1) must equal replication factor (2)", }, "error: AZ count != RF (too many AZs)": { - inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, ep1b, ep0c, ep1c}, // 3 AZs - replicationFactor: 2, // Expecting 2 + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, ep1b, ep0c, ep1c}, + replicationFactor: 2, sectionsPerNode: SectionsPerNode, expectConstructorError: true, constructorErrorContains: "number of AZs (3) must equal replication factor (2)", }, - "constructor success with unbalanced AZs (uses common subset)": { // Renamed and corrected expectation - inputEndpoints: []Endpoint{ep0a, ep1a, ep0b}, // zone-a has 0,1; zone-b has 0. Common subset is [[ep0a], [ep0b]] + "constructor success with unbalanced AZs (uses common subset)": { + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b}, replicationFactor: 2, sectionsPerNode: SectionsPerNode, - expectConstructorError: false, // Should succeed based on common subset [[ep0a], [ep0b]] - // No GetN test here as the setup is just for constructor validation + expectConstructorError: false, }, - - // --- GetN Error Cases --- - "error: GetN index out of bounds (n >= numEndpoints)": { // Corrected description based on code's check - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, // 4 endpoints total -> numEndpoints = 4 + "error: GetN index out of bounds (n >= numEndpoints)": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, replicationFactor: 2, sectionsPerNode: SectionsPerNode, tenant: "tenant1", ts: tsForReplicaTest, - n: 4, // Requesting replica index 4. Check is n >= numEndpoints (4 >= 4) + n: 4, expectConstructorError: false, expectGetNError: true, - getNErrorContains: "insufficient nodes; have 4, want 5", // numEndpoints is 4 in this balanced case. + getNErrorContains: "insufficient nodes; have 4, want 5", }, - // NOTE: A test case for n >= replicationFactor but n < numEndpoints would currently panic. - // Example: RF=2, numEndpoints=4, n=2. Check `2 >= 4` is false. Access `replicas[2]` panics. - // We are not adding a panic test case here, assuming the insufficientNodesError check is the intended boundary. 
} for tcName, tc := range testCases { t.Run(tcName, func(t *testing.T) { - // --- Constructor Call --- hashRing, err := newAlignedKetamaHashring(tc.inputEndpoints, tc.sectionsPerNode, tc.replicationFactor) - // --- Constructor Assertions --- if tc.expectConstructorError { require.Error(t, err, "Expected constructor error") require.Nil(t, hashRing, "Hashring should be nil on constructor error") if tc.constructorErrorContains != "" { require.Contains(t, err.Error(), tc.constructorErrorContains, "Constructor error message mismatch") } - return // Stop test if constructor failed as expected + return } - // If constructor error was not expected, assert it succeeded require.NoError(t, err, "Expected constructor to succeed") require.NotNil(t, hashRing, "Hashring should not be nil on successful construction") - // --- GetN Call & Assertions (only if constructor succeeded and GetN test is specified) --- - if tc.ts == nil && !tc.expectGetNError { // Skip GetN if no timeseries provided unless we expect a GetN error + if tc.ts == nil && !tc.expectGetNError { return } - - // Ensure ts is not nil if we expect GetN to run without error or with a specific error if tc.ts == nil && tc.expectGetNError { - // If we expect a GetN error but haven't provided a TS, use a dummy one. - // This is mainly for the "insufficient nodes" case which doesn't depend on the TS hash. tc.ts = &prompb.TimeSeries{Labels: []labelpb.ZLabel{{Name: "dummy", Value: "dummy"}}} } result, getNErr := hashRing.GetN(tc.tenant, tc.ts, tc.n) - if tc.expectGetNError { require.Error(t, getNErr, "Expected GetN error") if tc.getNErrorContains != "" { @@ -518,7 +483,6 @@ func TestAlignedKetamaHashringGet(t *testing.T) { } } else { require.NoError(t, getNErr, "Expected GetN to succeed") - // Use Equals for struct comparison testutil.Equals(t, tc.expectedEndpoint, result, "GetN returned unexpected endpoint") } }) @@ -528,67 +492,52 @@ func TestAlignedKetamaHashringGet(t *testing.T) { func TestAlignedKetamaHashringReplicaOrdinals(t *testing.T) { t.Parallel() - // --- Test Setup --- - // Create a valid setup for aligned ketama endpoints := []Endpoint{ - // AZ A - {Address: podDNS("pod", 0, "svc", "nsa"), AZ: "zone-a"}, - {Address: podDNS("pod", 1, "svc", "nsa"), AZ: "zone-a"}, - {Address: podDNS("pod", 2, "svc", "nsa"), AZ: "zone-a"}, - // AZ B - {Address: podDNS("pod", 0, "svc", "nsb"), AZ: "zone-b"}, - {Address: podDNS("pod", 1, "svc", "nsb"), AZ: "zone-b"}, - {Address: podDNS("pod", 2, "svc", "nsb"), AZ: "zone-b"}, - // AZ C - {Address: podDNS("pod", 0, "svc", "nsc"), AZ: "zone-c"}, - {Address: podDNS("pod", 1, "svc", "nsc"), AZ: "zone-c"}, - {Address: podDNS("pod", 2, "svc", "nsc"), AZ: "zone-c"}, + {Address: podDNS("pod", 0), AZ: "zone-a"}, + {Address: podDNS("pod", 1), AZ: "zone-a"}, + {Address: podDNS("pod", 2), AZ: "zone-a"}, + {Address: podDNS("pod", 0), AZ: "zone-b"}, + {Address: podDNS("pod", 1), AZ: "zone-b"}, + {Address: podDNS("pod", 2), AZ: "zone-b"}, + {Address: podDNS("pod", 0), AZ: "zone-c"}, + {Address: podDNS("pod", 1), AZ: "zone-c"}, + {Address: podDNS("pod", 2), AZ: "zone-c"}, } - replicationFactor := uint64(3) // Must match number of AZs - sectionsPerNode := 10 // Use a smaller number for faster testing + replicationFactor := uint64(3) + sectionsPerNode := 10 - // --- Create the Hashring --- hashRing, err := newAlignedKetamaHashring(endpoints, sectionsPerNode, replicationFactor) require.NoError(t, err, "Aligned hashring constructor failed") require.NotNil(t, hashRing, "Hashring should not be nil") - - // --- Verification Logic 
--- require.NotEmpty(t, hashRing.sections, "Hashring should contain sections") + // Verify that all replicas within a section have the same ordinal. for i, s := range hashRing.sections { - // Skip if a section somehow has no replicas (shouldn't happen with RF > 0) if len(s.replicas) == 0 { continue } - expectedOrdinal := -1 // Initialize with sentinel value + expectedOrdinal := -1 for replicaNum, replicaIndex := range s.replicas { - // Defensive check: Ensure replicaIndex is within bounds of the stored endpoints list require.Less(t, int(replicaIndex), len(hashRing.endpoints), "Section %d (hash %d), Replica %d: index %d out of bounds for endpoints list (len %d)", i, s.hash, replicaNum, replicaIndex, len(hashRing.endpoints)) - // Get the endpoint corresponding to the replica index endpoint := hashRing.endpoints[replicaIndex] - - // Extract the ordinal from the endpoint address ordinal, err := strutil.ExtractPodOrdinal(endpoint.Address) require.NoError(t, err, "Section %d (hash %d), Replica %d: failed to extract ordinal from address %s", i, s.hash, replicaNum, endpoint.Address) if expectedOrdinal == -1 { - // This is the first replica for this section, store its ordinal expectedOrdinal = ordinal } else { - // Compare subsequent replica ordinals to the first one require.Equal(t, expectedOrdinal, ordinal, "Section %d (hash %d), Replica %d (%s): Mismatched ordinal. Expected %d, got %d. Replicas in section: %v", i, s.hash, replicaNum, endpoint.Address, expectedOrdinal, ordinal, s.replicas) } } - // Optional: Check that we actually found an ordinal if replicas were present if len(s.replicas) > 0 { require.NotEqual(t, -1, expectedOrdinal, "Section %d (hash %d): Failed to determine expected ordinal for replicas %v", i, s.hash, s.replicas) } @@ -964,112 +913,98 @@ func TestInvalidAZHashringCfg(t *testing.T) { } } -// Helper to create a valid DNS name for testing -func podDNS(name string, ordinal int, svc, ns string) string { - return name + "-" + strconv.Itoa(ordinal) + "." + svc + "." + ns + ".svc.cluster.local" +// podDNS creates a DNS-like string for testing endpoint addresses. +func podDNS(name string, ordinal int) string { + return name + "-" + strconv.Itoa(ordinal) + ".test-svc.test-namespace.svc.cluster.local" } func TestGroupByAZ(t *testing.T) { - // Define some reusable endpoints - ep0a := Endpoint{Address: podDNS("pod", 0, "svc", "nsa"), AZ: "zone-a"} - ep1a := Endpoint{Address: podDNS("pod", 1, "svc", "nsa"), AZ: "zone-a"} - ep2a := Endpoint{Address: podDNS("pod", 2, "svc", "nsa"), AZ: "zone-a"} - - ep0b := Endpoint{Address: podDNS("pod", 0, "svc", "nsb"), AZ: "zone-b"} - ep1b := Endpoint{Address: podDNS("pod", 1, "svc", "nsb"), AZ: "zone-b"} - - ep0c := Endpoint{Address: podDNS("pod", 0, "svc", "nsc"), AZ: "zone-c"} - ep1c := Endpoint{Address: podDNS("pod", 1, "svc", "nsc"), AZ: "zone-c"} // Added for a new test case - - // Endpoint with an invalid address format + // Test setup endpoints. 
+ ep0a := Endpoint{Address: podDNS("pod", 0), AZ: "zone-a"} + ep1a := Endpoint{Address: podDNS("pod", 1), AZ: "zone-a"} + ep2a := Endpoint{Address: podDNS("pod", 2), AZ: "zone-a"} + ep0b := Endpoint{Address: podDNS("pod", 0), AZ: "zone-b"} + ep1b := Endpoint{Address: podDNS("pod", 1), AZ: "zone-b"} + ep0c := Endpoint{Address: podDNS("pod", 0), AZ: "zone-c"} + ep1c := Endpoint{Address: podDNS("pod", 1), AZ: "zone-c"} invalidEp := Endpoint{Address: "invalid-address-format", AZ: "zone-a"} - // Endpoint that would cause a duplicate ordinal in its AZ - duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0, "svc", "nsa"), AZ: "zone-a"} + duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0), AZ: "zone-a"} // Same ordinal (0) as ep0a in zone-a. testCases := map[string]struct { inputEndpoints []Endpoint expectedResult [][]Endpoint expectError bool - errorContains string // Substring to check in the error message + errorContains string }{ - "error on empty input": { // UPDATED: Now expects an error + "error on empty input": { inputEndpoints: []Endpoint{}, expectedResult: nil, expectError: true, - errorContains: "no endpoints found", // Specific error message + errorContains: "no endpoints provided", }, "single AZ, multiple endpoints": { - inputEndpoints: []Endpoint{ep1a, ep0a, ep2a}, // Input order shouldn't matter + inputEndpoints: []Endpoint{ep1a, ep0a, ep2a}, expectedResult: [][]Endpoint{ - {ep0a, ep1a, ep2a}, // Expect sorted by ordinal + {ep0a, ep1a, ep2a}, }, expectError: false, }, "multiple AZs, balanced and ordered": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, // Input order shouldn't matter + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, expectedResult: [][]Endpoint{ - {ep0a, ep1a}, // zone-a endpoints, sorted by ordinal - {ep0b, ep1b}, // zone-b endpoints, sorted by ordinal - }, // Outer slice sorted by AZ name ("zone-a", "zone-b") + {ep0a, ep1a}, + {ep0b, ep1b}, + }, expectError: false, }, - "multiple AZs, different counts, stops at missing ordinal > 0": { // Name slightly adjusted for clarity - // zone-a has 0, 1, 2. zone-b has 0, 1. zone-c has 0. - // Grouping stops when ordinal 1 is missing in zone-c. Since i=1 != 0, no error. + "multiple AZs, different counts, stops at first missing ordinal > 0": { inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep2a, ep0c}, expectedResult: [][]Endpoint{ - {ep0a}, // zone-a, ordinal 0 - {ep0b}, // zone-b, ordinal 0 - {ep0c}, // zone-c, ordinal 0 - }, // Outer slice sorted by AZ name ("zone-a", "zone-b", "zone-c") + {ep0a}, + {ep0b}, + {ep0c}, + }, expectError: false, }, - "error if ordinal 0 missing in any AZ": { // UPDATED: Now expects an error - // zone-a has 1, 2. zone-b has 1. - // Grouping fails immediately because ordinal 0 is missing. + "error if ordinal 0 missing in any AZ": { inputEndpoints: []Endpoint{ep1a, ep2a, ep1b}, - expectedResult: nil, // Expect nil result on error + expectedResult: nil, expectError: true, - // The exact AZ reported depends on sort order, so check common part - errorContains: "no endpoints found in AZ", + errorContains: "missing endpoint with ordinal 0", }, - "error if ordinal 0 missing in only one AZ": { // NEW Test Case - // zone-a has 0, 1. zone-b has 1. - // Grouping fails immediately because ordinal 0 is missing in zone-b. 
+ "error if ordinal 0 missing in only one AZ": { inputEndpoints: []Endpoint{ep0a, ep1a, ep1b}, expectedResult: nil, expectError: true, - errorContains: "no endpoints found in AZ zone-b", // We know zone-b is missing 0 + errorContains: `AZ "zone-b" is missing endpoint with ordinal 0`, }, - "error on invalid address": { + "error on invalid address format": { inputEndpoints: []Endpoint{ep0a, invalidEp, ep0b}, expectedResult: nil, expectError: true, errorContains: "failed to extract ordinal from address invalid-address-format", }, "error on duplicate ordinal within an AZ": { - inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, // ep0a and duplicateEp0a in zone-a have ordinal 0 + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, expectedResult: nil, expectError: true, - errorContains: "duplicate endpoint " + duplicateEp0a.Address + " in AZ zone-a", + errorContains: "duplicate endpoint ordinal 0 for address " + duplicateEp0a.Address + " in AZ zone-a", }, "AZ sorting check": { - inputEndpoints: []Endpoint{ep0b, ep0c, ep0a}, // Input AZs: b, c, a + inputEndpoints: []Endpoint{ep0b, ep0c, ep0a}, expectedResult: [][]Endpoint{ - {ep0a}, // zone-a first - {ep0b}, // zone-b second - {ep0c}, // zone-c third + {ep0a}, + {ep0b}, + {ep0c}, }, expectError: false, }, - "multiple AZs, stops correctly when next ordinal missing everywhere": { // NEW Test Case - // zone-a has 0, 1. zone-b has 0, 1. zone-c has 0, 1. - // Should return groups for 0 and 1, then stop cleanly when checking for 2. + "multiple AZs, stops correctly when next ordinal missing everywhere": { inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, expectedResult: [][]Endpoint{ - {ep0a, ep1a}, // zone-a - {ep0b, ep1b}, // zone-b - {ep0c, ep1c}, // zone-c + {ep0a, ep1a}, + {ep0b, ep1b}, + {ep0c, ep1c}, }, expectError: false, }, @@ -1084,27 +1019,14 @@ func TestGroupByAZ(t *testing.T) { if tc.errorContains != "" { testutil.Assert(t, strings.Contains(err.Error(), tc.errorContains), "Expected error message to contain '%s', but got: %v", tc.errorContains, err) } - // On error, the function should return nil result testutil.Assert(t, result == nil, "Expected nil result on error, got: %v", result) } else { testutil.Ok(t, err) - // Compare results using Equals testutil.Equals(t, tc.expectedResult, result) - // Optional: Keep the AZ sorting check if desired + // Verify outer slice (AZs) is sorted alphabetically. if err == nil && len(result) > 1 { azOrderCorrect := sort.SliceIsSorted(result, func(i, j int) bool { - if len(result[i]) == 0 && len(result[j]) == 0 { - // This case might not happen anymore if empty inner slices aren't returned - // but keep for robustness or adjust if needed. 
- return true - } - if len(result[i]) == 0 { - return true - } - if len(result[j]) == 0 { - return false - } return result[i][0].AZ < result[j][0].AZ }) testutil.Assert(t, azOrderCorrect, "Outer slice is not sorted by AZ") From 40f8359e21358b62176152198d6fff61582f2474 Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Thu, 1 May 2025 19:06:43 -0700 Subject: [PATCH 27/40] clean up code --- pkg/receive/hashring.go | 46 +++-------------------------------------- 1 file changed, 3 insertions(+), 43 deletions(-) diff --git a/pkg/receive/hashring.go b/pkg/receive/hashring.go index 7adbc497260..95659526162 100644 --- a/pkg/receive/hashring.go +++ b/pkg/receive/hashring.go @@ -178,7 +178,7 @@ func newKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFac }, nil } -// groupByAZ groups endpoints by Availability Zone and sorts them by their inferred ordinal. +// groupByAZ groups endpoints by Availability Zone and sorts them by their inferred ordinal in a k8s statefulset. // It returns a 2D slice where each inner slice represents an AZ (sorted alphabetically) // and contains endpoints sorted by ordinal. All inner slices are truncated to the // length of the largest common sequence of ordinals starting from 0 across all AZs. @@ -219,11 +219,10 @@ func groupByAZ(endpoints []Endpoint) ([][]Endpoint, error) { for _, az := range sortedAZs { if _, ok := azEndpoints[az][i]; !ok { presentInAllAZs = false - // If even ordinal 0 is missing in any AZ, it's an invalid configuration for balancing. if i == 0 { return nil, fmt.Errorf("AZ %q is missing endpoint with ordinal 0", az) } - break // Stop checking this ordinal once one AZ is missing it. + break } } @@ -233,49 +232,25 @@ func groupByAZ(endpoints []Endpoint) ([][]Endpoint, error) { } } - // If maxCommonOrdinal is still -1, it means not even ordinal 0 was common. if maxCommonOrdinal < 0 { - // This case should be caught by the i==0 check inside the loop, - // but added for robustness in case the logic changes. return nil, errors.New("no common endpoints with ordinal 0 found across all AZs") } - // Build the final result, truncated to the maxCommonOrdinal. numAZs := len(sortedAZs) result := make([][]Endpoint, numAZs) for i, az := range sortedAZs { - // Pre-allocate slice capacity for efficiency. result[i] = make([]Endpoint, 0, maxCommonOrdinal+1) for j := 0; j <= maxCommonOrdinal; j++ { - // We know the endpoint exists due to the previous check. result[i] = append(result[i], azEndpoints[az][j]) } } - return result, nil } // newAlignedKetamaHashring creates a Ketama hash ring where replicas are strictly aligned across Availability Zones. -// -// It expects endpoints to be named following a pattern allowing ordinal extraction (e.g., pod-0, pod-1). -// It first groups endpoints by AZ using groupByAZ, which ensures: -// 1. AZs are sorted alphabetically. -// 2. Endpoints within each AZ group are sorted by ordinal. -// 3. All AZ groups are truncated to the same length (largest common sequence of ordinals starting from 0). -// -// The function requires the number of distinct AZs found to be exactly equal to the replicationFactor. // Each section on the hash ring corresponds to a primary endpoint (taken from the first AZ) and its // aligned replicas in other AZs (endpoints with the same ordinal). The hash for a section is calculated // based *only* on the primary endpoint's address. -// -// Parameters: -// - endpoints: A slice of all available endpoints. 
-// - sectionsPerNode: The number of sections (points) to add to the ring for each primary endpoint and its replicas. -// - replicationFactor: The desired number of replicas for each piece of data; must match the number of AZs. -// -// Returns: -// - A pointer to the initialized ketamaHashring. -// - An error if constraints are not met (e.g., AZ count != replicationFactor, missing ordinals, non-aligned ordinals). func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) { if replicationFactor == 0 { return nil, errors.New("replication factor cannot be zero") @@ -284,7 +259,6 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica return nil, errors.New("sections per node must be positive") } - // Group by AZ, sort AZs, sort endpoints by ordinal within AZs, and ensure common length. groupedEndpoints, err := groupByAZ(endpoints) if err != nil { return nil, errors.Wrap(err, "failed to group endpoints by AZ") @@ -292,21 +266,17 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica numAZs := len(groupedEndpoints) if numAZs == 0 { - // Should be caught by groupByAZ, but double-check. return nil, errors.New("no endpoint groups found after grouping by AZ") } if uint64(numAZs) != replicationFactor { return nil, fmt.Errorf("number of AZs (%d) must equal replication factor (%d)", numAZs, replicationFactor) } - // groupedEndpoints[0] is safe because groupByAZ guarantees at least ordinal 0 exists if no error. numEndpointsPerAZ := len(groupedEndpoints[0]) if numEndpointsPerAZ == 0 { - // Should be caught by groupByAZ, but double-check. return nil, errors.New("AZ groups are empty after grouping") } - // Create a flat list of endpoints, ordered AZ by AZ. This order is important for replica index calculation. totalEndpoints := numAZs * numEndpointsPerAZ flatEndpoints := make([]Endpoint, 0, totalEndpoints) for azIndex := 0; azIndex < numAZs; azIndex++ { @@ -319,16 +289,13 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica // Iterate through primary endpoints (those in the first AZ) to define sections. for primaryOrdinalIndex := 0; primaryOrdinalIndex < numEndpointsPerAZ; primaryOrdinalIndex++ { primaryEndpoint := groupedEndpoints[0][primaryOrdinalIndex] - primaryOrdinal, err := strutil.ExtractPodOrdinal(primaryEndpoint.Address) // Get ordinal once for comparison. + primaryOrdinal, err := strutil.ExtractPodOrdinal(primaryEndpoint.Address) if err != nil { - // Should not happen if groupByAZ worked, but check defensively. return nil, errors.Wrapf(err, "failed to extract ordinal from primary endpoint %s", primaryEndpoint.Address) } - // Create multiple sections per primary node for better distribution. for sectionIndex := 1; sectionIndex <= sectionsPerNode; sectionIndex++ { hasher.Reset() - // Hash is based *only* on the primary endpoint address and section index. _, _ = hasher.Write([]byte(primaryEndpoint.Address + ":" + strconv.Itoa(sectionIndex))) sectionHash := hasher.Sum64() @@ -341,11 +308,8 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica // Find indices of all replicas (including primary) in the flat list and verify alignment. for azIndex := 0; azIndex < numAZs; azIndex++ { - // Calculate index in the flatEndpoints slice. 
replicaFlatIndex := azIndex*numEndpointsPerAZ + primaryOrdinalIndex replicaEndpoint := flatEndpoints[replicaFlatIndex] - - // Verify that the replica in this AZ has the same ordinal as the primary. replicaOrdinal, err := strutil.ExtractPodOrdinal(replicaEndpoint.Address) if err != nil { return nil, errors.Wrapf(err, "failed to extract ordinal from replica endpoint %s in AZ %s", replicaEndpoint.Address, replicaEndpoint.AZ) @@ -354,16 +318,12 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica return nil, fmt.Errorf("ordinal mismatch for primary endpoint %s (ordinal %d): replica %s in AZ %s has ordinal %d", primaryEndpoint.Address, primaryOrdinal, replicaEndpoint.Address, replicaEndpoint.AZ, replicaOrdinal) } - sec.replicas = append(sec.replicas, uint64(replicaFlatIndex)) } ringSections = append(ringSections, sec) } } - - // Sort sections by hash value for ring lookup. sort.Sort(ringSections) - return &ketamaHashring{ endpoints: flatEndpoints, sections: ringSections, From e7404dcac771696a4d007f6f7a54e7df53717cba Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Wed, 30 Apr 2025 22:12:22 -0700 Subject: [PATCH 28/40] Thanos Streamer support ignore warnings from Querier --- cmd/thanos/query.go | 11 +++++++++++ cmd/thanos/streamer.go | 6 +++++- cmd/thanos/tools_metric.go | 22 ++++++++++++++++++++-- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 884c2b256a7..7e8c4defde5 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -62,6 +62,7 @@ import ( "github.com/thanos-io/thanos/pkg/tenancy" "github.com/thanos-io/thanos/pkg/tls" "github.com/thanos-io/thanos/pkg/ui" + "google.golang.org/grpc/keepalive" ) const ( @@ -251,6 +252,9 @@ func registerQuery(app *extkingpin.App) { lazyRetrievalMaxBufferedResponses := cmd.Flag("query.lazy-retrieval-max-buffered-responses", "The lazy retrieval strategy can buffer up to this number of responses. This is to limit the memory usage. This flag takes effect only when the lazy retrieval strategy is enabled."). Default("20").Int() + grpcStoreClientKeepAlivePingInterval := extkingpin.ModelDuration(cmd.Flag("query.grcp-store-client-keep-alive-ping-interval", "This value defines how often a store client sends a keepalive ping on an established gRPC stream. 0 means not to set. NB: a client is keeping a long‐running gRPC stream open. It still has active RPCs on the wire—even if Recv() is not called in a while. Setting PermitWithoutStream=false only stops pings when no streams exist; it does not suppress pings during an open stream"). 
+ Default("0s")) + var storeRateLimits store.SeriesSelectLimits storeRateLimits.RegisterFlags(cmd) @@ -393,6 +397,7 @@ func registerQuery(app *extkingpin.App) { *rewriteAggregationLabelStrategy, *rewriteAggregationLabelTo, *lazyRetrievalMaxBufferedResponses, + time.Duration(*grpcStoreClientKeepAlivePingInterval), ) }) } @@ -480,6 +485,7 @@ func runQuery( rewriteAggregationLabelStrategy string, rewriteAggregationLabelTo string, lazyRetrievalMaxBufferedResponses int, + grpcStoreClientKeepAlivePingInterval time.Duration, ) error { comp := component.Query if alertQueryURL == "" { @@ -499,6 +505,11 @@ func runQuery( if err != nil { return errors.Wrap(err, "building gRPC client") } + if grpcStoreClientKeepAlivePingInterval > 0 { + dialOpts = append(dialOpts, grpc.WithKeepaliveParams(keepalive.ClientParameters{ + Time: grpcStoreClientKeepAlivePingInterval, + })) + } if grpcCompression != compressionNone { dialOpts = append(dialOpts, grpc.WithDefaultCallOptions(grpc.UseCompressor(grpcCompression))) } diff --git a/cmd/thanos/streamer.go b/cmd/thanos/streamer.go index 0ba89e130b6..aac9fb2bee4 100644 --- a/cmd/thanos/streamer.go +++ b/cmd/thanos/streamer.go @@ -39,6 +39,7 @@ type StreamerConfig struct { storeAddrPort string streamTimeoutSeconds int replicaLabel string + ignoreWarnings bool } func registerStreamer(app *extkingpin.App) { @@ -48,6 +49,7 @@ func registerStreamer(app *extkingpin.App) { cmd.Flag("store", "Thanos Store API gRPC endpoint").Default("localhost:10901").StringVar(&config.storeAddrPort) cmd.Flag("stream.timeout_seconds", "One stream's overall timeout in seconds ").Default("36000").IntVar(&config.streamTimeoutSeconds) cmd.Flag("stream.replica_label", "Drop this replica label from all returns time series and dedup them.").Default("").StringVar(&config.replicaLabel) + cmd.Flag("stream.ignore_warnings", "Don't return an error when a warning response is received. 
--no-stream.ignore_warnings to disable").Default("true").BoolVar(&config.ignoreWarnings) hc := &httpConfig{} hc = hc.registerFlag(cmd) @@ -266,7 +268,9 @@ func (s *Streamer) streamOneRequest(request *streamer.StreamerRequest, writer io "warning", warning, "msg", "warning response from Store gRPC stream", "request_id", request.RequestId) - return writeResponse(&streamer.StreamerResponse{Err: fmt.Sprintf("warning response from Store gRPC stream: %s", warning)}) + if !s.config.ignoreWarnings { + return writeResponse(&streamer.StreamerResponse{Err: fmt.Sprintf("warning response from Store gRPC stream: %s", warning)}) + } } seriesResp := response.GetSeries() if seriesResp == nil { diff --git a/cmd/thanos/tools_metric.go b/cmd/thanos/tools_metric.go index eba14c389f8..111d942f170 100644 --- a/cmd/thanos/tools_metric.go +++ b/cmd/thanos/tools_metric.go @@ -29,6 +29,8 @@ type rawMetricConfig struct { metric string hoursAgo int skipChunks bool + equalLabelMatcher string + notEqualLabelMatcher string } func registerFlags(cmd extkingpin.FlagClause) *rawMetricConfig { @@ -37,6 +39,8 @@ func registerFlags(cmd extkingpin.FlagClause) *rawMetricConfig { cmd.Flag("metric", "The metric name to stream time series for this metric").Default("node_cpu_seconds_total").StringVar(&conf.metric) cmd.Flag("hours_ago", "Stream the metric from this number of hours ago").Default("16").IntVar(&conf.hoursAgo) cmd.Flag("skip_chunks", "Skip chunks in the response").Default("false").BoolVar(&conf.skipChunks) + cmd.Flag("eq_label_matcher", "One label matcher using equal").Default("").StringVar(&conf.equalLabelMatcher) + cmd.Flag("neq_label_matcher", "One label matcher using not equal").Default("").StringVar(&conf.notEqualLabelMatcher) return conf } @@ -72,6 +76,20 @@ func streamMetric(conf *rawMetricConfig, logger log.Logger) error { labelMatchers := []storepb.LabelMatcher{ {Type: storepb.LabelMatcher_EQ, Name: "__name__", Value: conf.metric}, } + addMatcher := func(mtype storepb.LabelMatcher_Type, matcher string) { + parts := strings.Split(matcher, "=") + if len(parts) != 2 { + level.Error(logger).Log("msg", "ignoring an invalid label matcher", "matcher", matcher) + return + } + labelMatchers = append(labelMatchers, storepb.LabelMatcher{ + Type: mtype, + Name: parts[0], + Value: parts[1], + }) + } + addMatcher(storepb.LabelMatcher_EQ, conf.equalLabelMatcher) + addMatcher(storepb.LabelMatcher_NEQ, conf.notEqualLabelMatcher) storeReq := &storepb.SeriesRequest{ Aggregates: []storepb.Aggr{storepb.Aggr_RAW}, Matchers: labelMatchers, @@ -103,9 +121,9 @@ func streamMetric(conf *rawMetricConfig, logger log.Logger) error { } series := resPtr.GetSeries() if series == nil { - return fmt.Errorf("Got a nil series") + return fmt.Errorf("got a nil series") } - if 0 == (seq % 1000) { + if (seq % 1000) == 0 { level.Info(logger).Log("msg", "streaming time series", "seq", seq) } metric := "" From edb1a75fbf7bbee63c0b2551d08e586f84fee57b Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Thu, 1 May 2025 19:23:53 -0700 Subject: [PATCH 29/40] Fix a lazy retrieval timer issue --- pkg/store/proxy_merge.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/store/proxy_merge.go b/pkg/store/proxy_merge.go index 914a8b03a2b..abbc2e6c58a 100644 --- a/pkg/store/proxy_merge.go +++ b/pkg/store/proxy_merge.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "io" + "math" "sort" "sync" "time" @@ -451,6 +452,11 @@ func newLazyRespSet( l.bufferedResponsesMtx.Unlock() return false } + if t != nil { + // frameTimeout only applies to cl.Recv() gRPC call because 
the goroutine may be blocked on waiting for an empty buffer slot. + // Set the timeout to the largest possible value to void triggering it. + t.Reset(time.Duration(math.MaxInt64)) + } numResponses++ bytesProcessed += resp.Size() From 022719741f55953de1c24bbe744f70659b4a2015 Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Thu, 1 May 2025 19:29:55 -0700 Subject: [PATCH 30/40] Add a flag to control client keepalive ping interval --- cmd/thanos/query.go | 7 +++---- pkg/extgrpc/client.go | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 7e8c4defde5..a1e7c37161b 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -62,7 +62,6 @@ import ( "github.com/thanos-io/thanos/pkg/tenancy" "github.com/thanos-io/thanos/pkg/tls" "github.com/thanos-io/thanos/pkg/ui" - "google.golang.org/grpc/keepalive" ) const ( @@ -506,9 +505,9 @@ func runQuery( return errors.Wrap(err, "building gRPC client") } if grpcStoreClientKeepAlivePingInterval > 0 { - dialOpts = append(dialOpts, grpc.WithKeepaliveParams(keepalive.ClientParameters{ - Time: grpcStoreClientKeepAlivePingInterval, - })) + clientParameters := extgrpc.GetDefaultKeepaliveClientParameters() + clientParameters.Time = grpcStoreClientKeepAlivePingInterval + dialOpts = append(dialOpts, grpc.WithKeepaliveParams(clientParameters)) } if grpcCompression != compressionNone { dialOpts = append(dialOpts, grpc.WithDefaultCallOptions(grpc.UseCompressor(grpcCompression))) diff --git a/pkg/extgrpc/client.go b/pkg/extgrpc/client.go index a6dcc7b633d..8bf1521c503 100644 --- a/pkg/extgrpc/client.go +++ b/pkg/extgrpc/client.go @@ -44,6 +44,10 @@ func EndpointGroupGRPCOpts() []grpc.DialOption { } } +func GetDefaultKeepaliveClientParameters() keepalive.ClientParameters { + return keepalive.ClientParameters{Time: 10 * time.Second, Timeout: 5 * time.Second} +} + // StoreClientGRPCOpts creates gRPC dial options for connecting to a store client. 
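+// It takes its client keepalive settings from GetDefaultKeepaliveClientParameters. As a hedged sketch
+// (the 30s interval below is an arbitrary example value, not taken from this change), a caller that only
+// needs a different ping interval can override the Time field, which is what runQuery does when the
+// query-side keepalive flag is set:
+//
+//	params := GetDefaultKeepaliveClientParameters() // Time: 10s, Timeout: 5s
+//	params.Time = 30 * time.Second
+//	dialOpts = append(dialOpts, grpc.WithKeepaliveParams(params))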
func StoreClientGRPCOpts(logger log.Logger, reg prometheus.Registerer, tracer opentracing.Tracer, secure, skipVerify bool, cert, key, caCert, serverName string) ([]grpc.DialOption, error) { grpcMets := grpc_prometheus.NewClientMetrics( @@ -71,7 +75,7 @@ func StoreClientGRPCOpts(logger log.Logger, reg prometheus.Registerer, tracer op grpcMets.StreamClientInterceptor(), tracing.StreamClientInterceptor(tracer), ), - grpc.WithKeepaliveParams(keepalive.ClientParameters{Time: 10 * time.Second, Timeout: 5 * time.Second}), + grpc.WithKeepaliveParams(GetDefaultKeepaliveClientParameters()), } if reg != nil { reg.MustRegister(grpcMets) From 2b393c970fa1ae7f893022e7eee76e701b1aa6bc Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Thu, 1 May 2025 19:33:01 -0700 Subject: [PATCH 31/40] Fix linter --- cmd/thanos/streamer.go | 2 +- cmd/thanos/tools_metric.go | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/thanos/streamer.go b/cmd/thanos/streamer.go index aac9fb2bee4..bfda39f3c2c 100644 --- a/cmd/thanos/streamer.go +++ b/cmd/thanos/streamer.go @@ -39,7 +39,7 @@ type StreamerConfig struct { storeAddrPort string streamTimeoutSeconds int replicaLabel string - ignoreWarnings bool + ignoreWarnings bool } func registerStreamer(app *extkingpin.App) { diff --git a/cmd/thanos/tools_metric.go b/cmd/thanos/tools_metric.go index 111d942f170..dc70593ad28 100644 --- a/cmd/thanos/tools_metric.go +++ b/cmd/thanos/tools_metric.go @@ -25,11 +25,11 @@ import ( ) type rawMetricConfig struct { - storeAddr string - metric string - hoursAgo int - skipChunks bool - equalLabelMatcher string + storeAddr string + metric string + hoursAgo int + skipChunks bool + equalLabelMatcher string notEqualLabelMatcher string } @@ -83,8 +83,8 @@ func streamMetric(conf *rawMetricConfig, logger log.Logger) error { return } labelMatchers = append(labelMatchers, storepb.LabelMatcher{ - Type: mtype, - Name: parts[0], + Type: mtype, + Name: parts[0], Value: parts[1], }) } From bdf6675f942a3ebf1223f7410d971a2ce5c868ad Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Thu, 1 May 2025 21:14:14 -0700 Subject: [PATCH 32/40] Update streamer tool --- cmd/thanos/tools_streamer.go | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/cmd/thanos/tools_streamer.go b/cmd/thanos/tools_streamer.go index bd486177ae4..258f98e6835 100644 --- a/cmd/thanos/tools_streamer.go +++ b/cmd/thanos/tools_streamer.go @@ -23,10 +23,13 @@ import ( ) type streamerToolConfig struct { - socketPath string - metric string - hoursAgo int - skipChunks bool + socketPath string + metric string + hoursAgo int + skipChunks bool + equalLabelMatcher string + notEqualLabelMatcher string + sleepSeconds int } func registerStreamerTool(app extkingpin.AppClause) { @@ -37,6 +40,9 @@ func registerStreamerTool(app extkingpin.AppClause) { cmd.Flag("metric", "The metric name to stream time series for this metric").Default("node_cpu_seconds_total").StringVar(&conf.metric) cmd.Flag("hours_ago", "Stream the metric from this number of hours ago").Default("16").IntVar(&conf.hoursAgo) cmd.Flag("skip_chunks", "Skip chunks in the response").Default("false").BoolVar(&conf.skipChunks) + cmd.Flag("eq_label_matcher", "One label matcher using equal").Default("").StringVar(&conf.equalLabelMatcher) + cmd.Flag("neq_label_matcher", "One label matcher using not equal").Default("").StringVar(&conf.notEqualLabelMatcher) + cmd.Flag("sleep_seconds_between_calls", "Sleep this amount of seconds between calling two 
gRPC").Default("0").IntVar(&conf.sleepSeconds) cmd.Setup(func( g *run.Group, @@ -72,6 +78,20 @@ func runStreamerTool(conf *streamerToolConfig, logger log.Logger) error { SkipChunks: conf.skipChunks, Metric: conf.metric, } + addMatcher := func(mtype streamer_pkg.LabelMatcher_Type, matcher string) { + parts := strings.Split(matcher, "=") + if len(parts) != 2 { + level.Error(logger).Log("msg", "ignoring an invalid label matcher", "matcher", matcher) + return + } + request.LabelMatchers = append(request.LabelMatchers, streamer_pkg.LabelMatcher{ + Type: mtype, + Name: parts[0], + Value: parts[1], + }) + } + addMatcher(streamer_pkg.LabelMatcher_EQ, conf.equalLabelMatcher) + addMatcher(streamer_pkg.LabelMatcher_NEQ, conf.notEqualLabelMatcher) level.Info(logger).Log( "msg", "sending a socket request to Thanos streamer", @@ -120,7 +140,7 @@ func runStreamerTool(conf *streamerToolConfig, logger log.Logger) error { level.Error(logger).Log("msg", "error in response", "err", resp.Err) return nil } - if 0 == (seq % 1000) { + if (seq % 1000) == 0 { level.Info(logger).Log("msg", "streaming time series", "seq", seq) } series := resp.Data @@ -147,6 +167,9 @@ func runStreamerTool(conf *streamerToolConfig, logger log.Logger) error { order++ } fmt.Printf("\n") + if conf.sleepSeconds > 0 { + time.Sleep(time.Duration(conf.sleepSeconds) * time.Second) + } } return fmt.Errorf("unexpected interruption: %w", scanner.Err()) } From c1704dd56238f1e0b74eff5cdbc897a69e654c5c Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Fri, 2 May 2025 12:11:01 -0700 Subject: [PATCH 33/40] clean up code --- pkg/receive/hashring.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pkg/receive/hashring.go b/pkg/receive/hashring.go index 95659526162..4c7581a1661 100644 --- a/pkg/receive/hashring.go +++ b/pkg/receive/hashring.go @@ -258,12 +258,10 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica if sectionsPerNode <= 0 { return nil, errors.New("sections per node must be positive") } - groupedEndpoints, err := groupByAZ(endpoints) if err != nil { return nil, errors.Wrap(err, "failed to group endpoints by AZ") } - numAZs := len(groupedEndpoints) if numAZs == 0 { return nil, errors.New("no endpoint groups found after grouping by AZ") @@ -271,20 +269,17 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica if uint64(numAZs) != replicationFactor { return nil, fmt.Errorf("number of AZs (%d) must equal replication factor (%d)", numAZs, replicationFactor) } - numEndpointsPerAZ := len(groupedEndpoints[0]) if numEndpointsPerAZ == 0 { return nil, errors.New("AZ groups are empty after grouping") } - totalEndpoints := numAZs * numEndpointsPerAZ flatEndpoints := make([]Endpoint, 0, totalEndpoints) for azIndex := 0; azIndex < numAZs; azIndex++ { flatEndpoints = append(flatEndpoints, groupedEndpoints[azIndex]...) } - hasher := xxhash.New() - ringSections := make(sections, 0, numEndpointsPerAZ*sectionsPerNode) // Correct capacity. + ringSections := make(sections, 0, numEndpointsPerAZ*sectionsPerNode) // Iterate through primary endpoints (those in the first AZ) to define sections. for primaryOrdinalIndex := 0; primaryOrdinalIndex < numEndpointsPerAZ; primaryOrdinalIndex++ { @@ -301,8 +296,8 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica sec := §ion{ hash: sectionHash, - az: primaryEndpoint.AZ, // AZ of the primary. - endpointIndex: uint64(primaryOrdinalIndex), // Index within the AZ. 
+ az: primaryEndpoint.AZ, + endpointIndex: uint64(primaryOrdinalIndex), replicas: make([]uint64, 0, replicationFactor), } From 30c03ca2c739c6e7fbba153026c3db02e0dfd92a Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Fri, 2 May 2025 12:34:10 -0700 Subject: [PATCH 34/40] clean up code --- pkg/receive/hashring.go | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/pkg/receive/hashring.go b/pkg/receive/hashring.go index 4c7581a1661..d812aa23c61 100644 --- a/pkg/receive/hashring.go +++ b/pkg/receive/hashring.go @@ -194,11 +194,9 @@ func groupByAZ(endpoints []Endpoint) ([][]Endpoint, error) { if err != nil { return nil, errors.Wrapf(err, "failed to extract ordinal from address %s", ep.Address) } - if _, ok := azEndpoints[ep.AZ]; !ok { azEndpoints[ep.AZ] = make(map[int]Endpoint) } - if _, exists := azEndpoints[ep.AZ][ordinal]; exists { return nil, fmt.Errorf("duplicate endpoint ordinal %d for address %s in AZ %s", ordinal, ep.Address, ep.AZ) } @@ -225,17 +223,14 @@ func groupByAZ(endpoints []Endpoint) ([][]Endpoint, error) { break } } - if !presentInAllAZs { maxCommonOrdinal = i - 1 break } } - if maxCommonOrdinal < 0 { return nil, errors.New("no common endpoints with ordinal 0 found across all AZs") } - numAZs := len(sortedAZs) result := make([][]Endpoint, numAZs) for i, az := range sortedAZs { @@ -284,16 +279,10 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica // Iterate through primary endpoints (those in the first AZ) to define sections. for primaryOrdinalIndex := 0; primaryOrdinalIndex < numEndpointsPerAZ; primaryOrdinalIndex++ { primaryEndpoint := groupedEndpoints[0][primaryOrdinalIndex] - primaryOrdinal, err := strutil.ExtractPodOrdinal(primaryEndpoint.Address) - if err != nil { - return nil, errors.Wrapf(err, "failed to extract ordinal from primary endpoint %s", primaryEndpoint.Address) - } - for sectionIndex := 1; sectionIndex <= sectionsPerNode; sectionIndex++ { hasher.Reset() _, _ = hasher.Write([]byte(primaryEndpoint.Address + ":" + strconv.Itoa(sectionIndex))) sectionHash := hasher.Sum64() - sec := §ion{ hash: sectionHash, az: primaryEndpoint.AZ, @@ -309,9 +298,9 @@ func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replica if err != nil { return nil, errors.Wrapf(err, "failed to extract ordinal from replica endpoint %s in AZ %s", replicaEndpoint.Address, replicaEndpoint.AZ) } - if replicaOrdinal != primaryOrdinal { + if replicaOrdinal != primaryOrdinalIndex { return nil, fmt.Errorf("ordinal mismatch for primary endpoint %s (ordinal %d): replica %s in AZ %s has ordinal %d", - primaryEndpoint.Address, primaryOrdinal, replicaEndpoint.Address, replicaEndpoint.AZ, replicaOrdinal) + primaryEndpoint.Address, primaryOrdinalIndex, replicaEndpoint.Address, replicaEndpoint.AZ, replicaOrdinal) } sec.replicas = append(sec.replicas, uint64(replicaFlatIndex)) } From e0a4a3a3a7ac4300aee042fcf1453a241bba3fca Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Fri, 2 May 2025 14:26:07 -0700 Subject: [PATCH 35/40] Fix ignore warnings --- cmd/thanos/streamer.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cmd/thanos/streamer.go b/cmd/thanos/streamer.go index bfda39f3c2c..0537b55c061 100644 --- a/cmd/thanos/streamer.go +++ b/cmd/thanos/streamer.go @@ -264,13 +264,15 @@ func (s *Streamer) streamOneRequest(request *streamer.StreamerRequest, writer io return writeResponse(&streamer.StreamerResponse{Err: err.Error()}) } if warning := response.GetWarning(); warning != "" { - 
level.Error(s.logger).Log( - "warning", warning, + level.Warn(s.logger).Log( "msg", "warning response from Store gRPC stream", + "warning", warning, + "ignore_warnings", s.config.ignoreWarnings, "request_id", request.RequestId) - if !s.config.ignoreWarnings { - return writeResponse(&streamer.StreamerResponse{Err: fmt.Sprintf("warning response from Store gRPC stream: %s", warning)}) + if s.config.ignoreWarnings { + continue } + return writeResponse(&streamer.StreamerResponse{Err: fmt.Sprintf("warning response from Store gRPC stream: %s", warning)}) } seriesResp := response.GetSeries() if seriesResp == nil { From a62512546497f58405ccd89fc1b457d4b4456e8c Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Fri, 2 May 2025 14:24:10 -0700 Subject: [PATCH 36/40] resolve comments --- pkg/receive/aligned_hashring.go | 153 +++++++++++ pkg/receive/aligned_hashring_test.go | 378 +++++++++++++++++++++++++++ pkg/receive/hashring.go | 139 +--------- pkg/receive/hashring_test.go | 365 -------------------------- 4 files changed, 532 insertions(+), 503 deletions(-) create mode 100644 pkg/receive/aligned_hashring.go create mode 100644 pkg/receive/aligned_hashring_test.go diff --git a/pkg/receive/aligned_hashring.go b/pkg/receive/aligned_hashring.go new file mode 100644 index 00000000000..30203fbb90d --- /dev/null +++ b/pkg/receive/aligned_hashring.go @@ -0,0 +1,153 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package receive + +import ( + "fmt" + "sort" + "strconv" + + "github.com/cespare/xxhash" + "github.com/pkg/errors" + + "github.com/thanos-io/thanos/pkg/strutil" +) + +// groupByAZ groups endpoints by Availability Zone and sorts them by their inferred ordinal in a k8s statefulset. +// It returns a 2D slice where each inner slice represents an AZ (sorted alphabetically) +// and contains endpoints sorted by ordinal. All inner slices are truncated to the +// length of the largest common sequence of ordinals starting from 0 across all AZs. +// All endpoint addresses must be valid k8s DNS names with 0-index ordinals at the end of the pod name. +func groupByAZ(endpoints []Endpoint) ([][]Endpoint, error) { + if len(endpoints) == 0 { + return nil, errors.New("no endpoints provided") + } + + // Group endpoints by AZ and then by ordinal. + azEndpoints := make(map[string]map[int]Endpoint) + for _, ep := range endpoints { + ordinal, err := strutil.ExtractPodOrdinal(ep.Address) + if err != nil { + return nil, errors.Wrapf(err, "failed to extract ordinal from address %s", ep.Address) + } + if _, ok := azEndpoints[ep.AZ]; !ok { + azEndpoints[ep.AZ] = make(map[int]Endpoint) + } + if _, exists := azEndpoints[ep.AZ][ordinal]; exists { + return nil, fmt.Errorf("duplicate endpoint ordinal %d for address %s in AZ %s", ordinal, ep.Address, ep.AZ) + } + azEndpoints[ep.AZ][ordinal] = ep + } + + // Get sorted list of AZ names. + sortedAZs := make([]string, 0, len(azEndpoints)) + for az := range azEndpoints { + sortedAZs = append(sortedAZs, az) + } + sort.Strings(sortedAZs) + + // Determine the maximum common ordinal across all AZs. 
+	maxCommonOrdinal := -1
+	for i := 0; ; i++ {
+		presentInAllAZs := true
+		for _, az := range sortedAZs {
+			if _, ok := azEndpoints[az][i]; !ok {
+				presentInAllAZs = false
+				if i == 0 {
+					return nil, fmt.Errorf("AZ %q is missing endpoint with ordinal 0", az)
+				}
+				break
+			}
+		}
+		if !presentInAllAZs {
+			maxCommonOrdinal = i - 1
+			break
+		}
+	}
+	if maxCommonOrdinal < 0 {
+		return nil, errors.New("no common endpoints with ordinal 0 found across all AZs")
+	}
+	numAZs := len(sortedAZs)
+	result := make([][]Endpoint, numAZs)
+	for i, az := range sortedAZs {
+		result[i] = make([]Endpoint, 0, maxCommonOrdinal+1)
+		for j := 0; j <= maxCommonOrdinal; j++ {
+			result[i] = append(result[i], azEndpoints[az][j])
+		}
+	}
+	return result, nil
+}
+
+// newAlignedKetamaHashring creates a Ketama hash ring where replicas are strictly aligned across Availability Zones.
+// Each section on the hash ring corresponds to a primary endpoint (taken from the first AZ) and its
+// aligned replicas in other AZs (endpoints with the same ordinal). The hash for a section is calculated
+// based *only* on the primary endpoint's address.
+func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) {
+	if replicationFactor == 0 {
+		return nil, errors.New("replication factor cannot be zero")
+	}
+	if sectionsPerNode <= 0 {
+		return nil, errors.New("sections per node must be positive")
+	}
+	groupedEndpoints, err := groupByAZ(endpoints)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to group endpoints by AZ")
+	}
+	numAZs := len(groupedEndpoints)
+	if numAZs == 0 {
+		return nil, errors.New("no endpoint groups found after grouping by AZ")
+	}
+	if uint64(numAZs) != replicationFactor {
+		return nil, fmt.Errorf("number of AZs (%d) must equal replication factor (%d)", numAZs, replicationFactor)
+	}
+	numEndpointsPerAZ := len(groupedEndpoints[0])
+	if numEndpointsPerAZ == 0 {
+		return nil, errors.New("AZ groups are empty after grouping")
+	}
+	totalEndpoints := numAZs * numEndpointsPerAZ
+	flatEndpoints := make([]Endpoint, 0, totalEndpoints)
+	for azIndex := 0; azIndex < numAZs; azIndex++ {
+		flatEndpoints = append(flatEndpoints, groupedEndpoints[azIndex]...)
+	}
+	hasher := xxhash.New()
+	ringSections := make(sections, 0, numEndpointsPerAZ*sectionsPerNode)
+
+	// Iterate through primary endpoints (those in the first AZ) to define sections.
+	for primaryOrdinalIndex := 0; primaryOrdinalIndex < numEndpointsPerAZ; primaryOrdinalIndex++ {
+		primaryEndpoint := groupedEndpoints[0][primaryOrdinalIndex]
+		for sectionIndex := 1; sectionIndex <= sectionsPerNode; sectionIndex++ {
+			hasher.Reset()
+			_, _ = hasher.Write([]byte(primaryEndpoint.Address + ":" + strconv.Itoa(sectionIndex)))
+			sectionHash := hasher.Sum64()
+			sec := &section{
+				hash:          sectionHash,
+				az:            primaryEndpoint.AZ,
+				endpointIndex: uint64(primaryOrdinalIndex),
+				replicas:      make([]uint64, 0, replicationFactor),
+			}
+
+			// Find indices of all replicas (including primary) in the flat list and verify alignment.
+ for azIndex := 0; azIndex < numAZs; azIndex++ { + replicaFlatIndex := azIndex*numEndpointsPerAZ + primaryOrdinalIndex + replicaEndpoint := flatEndpoints[replicaFlatIndex] + replicaOrdinal, err := strutil.ExtractPodOrdinal(replicaEndpoint.Address) + if err != nil { + return nil, errors.Wrapf(err, "failed to extract ordinal from replica endpoint %s in AZ %s", replicaEndpoint.Address, replicaEndpoint.AZ) + } + if replicaOrdinal != primaryOrdinalIndex { + return nil, fmt.Errorf("ordinal mismatch for primary endpoint %s (ordinal %d): replica %s in AZ %s has ordinal %d", + primaryEndpoint.Address, primaryOrdinalIndex, replicaEndpoint.Address, replicaEndpoint.AZ, replicaOrdinal) + } + sec.replicas = append(sec.replicas, uint64(replicaFlatIndex)) + } + ringSections = append(ringSections, sec) + } + } + sort.Sort(ringSections) + return &ketamaHashring{ + endpoints: flatEndpoints, + sections: ringSections, + numEndpoints: uint64(totalEndpoints), + }, nil +} diff --git a/pkg/receive/aligned_hashring_test.go b/pkg/receive/aligned_hashring_test.go new file mode 100644 index 00000000000..c5e9feb611e --- /dev/null +++ b/pkg/receive/aligned_hashring_test.go @@ -0,0 +1,378 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package receive + +import ( + "sort" + "strconv" + "strings" + "testing" + + "github.com/efficientgo/core/testutil" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/store/labelpb" + "github.com/thanos-io/thanos/pkg/store/storepb/prompb" + "github.com/thanos-io/thanos/pkg/strutil" +) + +// podDNS creates a DNS-like string for testing endpoint addresses. +func podDNS(name string, ordinal int) string { + return name + "-" + strconv.Itoa(ordinal) + ".test-svc.test-namespace.svc.cluster.local" +} + +func TestGroupByAZ(t *testing.T) { + // Test setup endpoints. + ep0a := Endpoint{Address: podDNS("pod", 0), AZ: "zone-a"} + ep1a := Endpoint{Address: podDNS("pod", 1), AZ: "zone-a"} + ep2a := Endpoint{Address: podDNS("pod", 2), AZ: "zone-a"} + ep0b := Endpoint{Address: podDNS("pod", 0), AZ: "zone-b"} + ep1b := Endpoint{Address: podDNS("pod", 1), AZ: "zone-b"} + ep0c := Endpoint{Address: podDNS("pod", 0), AZ: "zone-c"} + ep1c := Endpoint{Address: podDNS("pod", 1), AZ: "zone-c"} + invalidEp := Endpoint{Address: "invalid-address-format", AZ: "zone-a"} + duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0), AZ: "zone-a"} // Same ordinal (0) as ep0a in zone-a. 
+ + testCases := map[string]struct { + inputEndpoints []Endpoint + expectedResult [][]Endpoint + expectError bool + errorContains string + }{ + "error on empty input": { + inputEndpoints: []Endpoint{}, + expectedResult: nil, + expectError: true, + errorContains: "no endpoints provided", + }, + "single AZ, multiple endpoints": { + inputEndpoints: []Endpoint{ep1a, ep0a, ep2a}, + expectedResult: [][]Endpoint{ + {ep0a, ep1a, ep2a}, + }, + expectError: false, + }, + "multiple AZs, balanced and ordered": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, + expectedResult: [][]Endpoint{ + {ep0a, ep1a}, + {ep0b, ep1b}, + }, + expectError: false, + }, + "multiple AZs, different counts, stops at first missing ordinal > 0": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep2a, ep0c}, + expectedResult: [][]Endpoint{ + {ep0a}, + {ep0b}, + {ep0c}, + }, + expectError: false, + }, + "error if ordinal 0 missing in any AZ": { + inputEndpoints: []Endpoint{ep1a, ep2a, ep1b}, + expectedResult: nil, + expectError: true, + errorContains: "missing endpoint with ordinal 0", + }, + "error if ordinal 0 missing in only one AZ": { + inputEndpoints: []Endpoint{ep0a, ep1a, ep1b}, + expectedResult: nil, + expectError: true, + errorContains: `AZ "zone-b" is missing endpoint with ordinal 0`, + }, + "error on invalid address format": { + inputEndpoints: []Endpoint{ep0a, invalidEp, ep0b}, + expectedResult: nil, + expectError: true, + errorContains: "failed to extract ordinal from address invalid-address-format", + }, + "error on duplicate ordinal within an AZ": { + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, + expectedResult: nil, + expectError: true, + errorContains: "duplicate endpoint ordinal 0 for address " + duplicateEp0a.Address + " in AZ zone-a", + }, + "AZ sorting check": { + inputEndpoints: []Endpoint{ep0b, ep0c, ep0a}, + expectedResult: [][]Endpoint{ + {ep0a}, + {ep0b}, + {ep0c}, + }, + expectError: false, + }, + "multiple AZs, stops correctly when next ordinal missing everywhere": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, + expectedResult: [][]Endpoint{ + {ep0a, ep1a}, + {ep0b, ep1b}, + {ep0c, ep1c}, + }, + expectError: false, + }, + } + + for tcName, tc := range testCases { + t.Run(tcName, func(t *testing.T) { + result, err := groupByAZ(tc.inputEndpoints) + + if tc.expectError { + testutil.NotOk(t, err) + if tc.errorContains != "" { + testutil.Assert(t, strings.Contains(err.Error(), tc.errorContains), "Expected error message to contain '%s', but got: %v", tc.errorContains, err) + } + testutil.Assert(t, result == nil, "Expected nil result on error, got: %v", result) + } else { + testutil.Ok(t, err) + testutil.Equals(t, tc.expectedResult, result) + + // Verify outer slice (AZs) is sorted alphabetically. 
+ if err == nil && len(result) > 1 { + azOrderCorrect := sort.SliceIsSorted(result, func(i, j int) bool { + return result[i][0].AZ < result[j][0].AZ + }) + testutil.Assert(t, azOrderCorrect, "Outer slice is not sorted by AZ") + } + } + }) + } +} + +func TestAlignedKetamaHashringGet(t *testing.T) { + t.Parallel() + + ep0a := Endpoint{Address: podDNS("pod", 0), AZ: "zone-a"} + ep1a := Endpoint{Address: podDNS("pod", 1), AZ: "zone-a"} + ep0b := Endpoint{Address: podDNS("pod", 0), AZ: "zone-b"} + ep1b := Endpoint{Address: podDNS("pod", 1), AZ: "zone-b"} + ep0c := Endpoint{Address: podDNS("pod", 0), AZ: "zone-c"} + ep1c := Endpoint{Address: podDNS("pod", 1), AZ: "zone-c"} + invalidEp := Endpoint{Address: "invalid-address", AZ: "zone-a"} + duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0), AZ: "zone-a"} + + tsForReplicaTest := &prompb.TimeSeries{ + Labels: []labelpb.ZLabel{{Name: "test", Value: "replica-routing"}}, + } + + testCases := map[string]struct { + inputEndpoints []Endpoint + replicationFactor uint64 + sectionsPerNode int + + tenant string + ts *prompb.TimeSeries + n uint64 + expectedEndpoint Endpoint + + expectConstructorError bool + constructorErrorContains string + expectGetNError bool + getNErrorContains string + }{ + "valid 2 AZs, RF=2, get replica 0": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, + n: 0, + expectedEndpoint: ep0a, + expectConstructorError: false, + expectGetNError: false, + }, + "valid 2 AZs, RF=2, get replica 1": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, + n: 1, + expectedEndpoint: ep0b, + expectConstructorError: false, + expectGetNError: false, + }, + "valid 3 AZs, RF=3, get replica 0": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, + replicationFactor: 3, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, + n: 0, + expectedEndpoint: ep0a, + expectConstructorError: false, + expectGetNError: false, + }, + "valid 3 AZs, RF=3, get replica 1": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, + replicationFactor: 3, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, + n: 1, + expectedEndpoint: ep0b, + expectConstructorError: false, + expectGetNError: false, + }, + "valid 3 AZs, RF=3, get replica 2": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, + replicationFactor: 3, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, + n: 2, + expectedEndpoint: ep0c, + expectConstructorError: false, + expectGetNError: false, + }, + "error: empty input": { + inputEndpoints: []Endpoint{}, + replicationFactor: 1, + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "no endpoints provided", + }, + "error: invalid address": { + inputEndpoints: []Endpoint{ep0a, invalidEp, ep0b}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "failed to extract ordinal from address invalid-address", + }, + "error: duplicate ordinal": { + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "duplicate endpoint", + }, + "error: missing ordinal 0": { + inputEndpoints: 
[]Endpoint{ep1a, ep1b}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "failed to group endpoints by AZ: AZ \"zone-a\" is missing endpoint with ordinal 0", + }, + "error: AZ count != RF (too few AZs)": { + inputEndpoints: []Endpoint{ep0a, ep1a}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "number of AZs (1) must equal replication factor (2)", + }, + "error: AZ count != RF (too many AZs)": { + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, ep1b, ep0c, ep1c}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + expectConstructorError: true, + constructorErrorContains: "number of AZs (3) must equal replication factor (2)", + }, + "constructor success with unbalanced AZs (uses common subset)": { + inputEndpoints: []Endpoint{ep0a, ep1a, ep0b}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + expectConstructorError: false, + }, + "error: GetN index out of bounds (n >= numEndpoints)": { + inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, + replicationFactor: 2, + sectionsPerNode: SectionsPerNode, + tenant: "tenant1", + ts: tsForReplicaTest, + n: 4, + expectConstructorError: false, + expectGetNError: true, + getNErrorContains: "insufficient nodes; have 4, want 5", + }, + } + + for tcName, tc := range testCases { + t.Run(tcName, func(t *testing.T) { + hashRing, err := newAlignedKetamaHashring(tc.inputEndpoints, tc.sectionsPerNode, tc.replicationFactor) + + if tc.expectConstructorError { + require.Error(t, err, "Expected constructor error") + require.Nil(t, hashRing, "Hashring should be nil on constructor error") + if tc.constructorErrorContains != "" { + require.Contains(t, err.Error(), tc.constructorErrorContains, "Constructor error message mismatch") + } + return + } + + require.NoError(t, err, "Expected constructor to succeed") + require.NotNil(t, hashRing, "Hashring should not be nil on successful construction") + + if tc.ts == nil && !tc.expectGetNError { + return + } + if tc.ts == nil && tc.expectGetNError { + tc.ts = &prompb.TimeSeries{Labels: []labelpb.ZLabel{{Name: "dummy", Value: "dummy"}}} + } + + result, getNErr := hashRing.GetN(tc.tenant, tc.ts, tc.n) + if tc.expectGetNError { + require.Error(t, getNErr, "Expected GetN error") + if tc.getNErrorContains != "" { + require.Contains(t, getNErr.Error(), tc.getNErrorContains, "GetN error message mismatch") + } + } else { + require.NoError(t, getNErr, "Expected GetN to succeed") + testutil.Equals(t, tc.expectedEndpoint, result, "GetN returned unexpected endpoint") + } + }) + } +} + +func TestAlignedKetamaHashringReplicaOrdinals(t *testing.T) { + t.Parallel() + + var endpoints []Endpoint + for i := 0; i < 20; i++ { + endpoints = append(endpoints, Endpoint{Address: podDNS("pod", i), AZ: "zone-a"}) + } + for i := 0; i < 20; i++ { + endpoints = append(endpoints, Endpoint{Address: podDNS("pod", i), AZ: "zone-b"}) + } + for i := 0; i < 20; i++ { + endpoints = append(endpoints, Endpoint{Address: podDNS("pod", i), AZ: "zone-c"}) + } + replicationFactor := uint64(3) + sectionsPerNode := 10 + + hashRing, err := newAlignedKetamaHashring(endpoints, sectionsPerNode, replicationFactor) + require.NoError(t, err, "Aligned hashring constructor failed") + require.NotNil(t, hashRing, "Hashring should not be nil") + require.NotEmpty(t, hashRing.sections, "Hashring should contain sections") + + // Verify that all replicas within a section have the same ordinal. 
+ for i, s := range hashRing.sections { + if len(s.replicas) == 0 { + continue + } + + expectedOrdinal := -1 + + for replicaNum, replicaIndex := range s.replicas { + require.Less(t, int(replicaIndex), len(hashRing.endpoints), + "Section %d (hash %d), Replica %d: index %d out of bounds for endpoints list (len %d)", + i, s.hash, replicaNum, replicaIndex, len(hashRing.endpoints)) + + endpoint := hashRing.endpoints[replicaIndex] + ordinal, err := strutil.ExtractPodOrdinal(endpoint.Address) + require.NoError(t, err, + "Section %d (hash %d), Replica %d: failed to extract ordinal from address %s", + i, s.hash, replicaNum, endpoint.Address) + + if expectedOrdinal == -1 { + expectedOrdinal = ordinal + } else { + require.Equal(t, expectedOrdinal, ordinal, + "Section %d (hash %d), Replica %d (%s): Mismatched ordinal. Expected %d, got %d. Replicas in section: %v", + i, s.hash, replicaNum, endpoint.Address, expectedOrdinal, ordinal, s.replicas) + } + } + if len(s.replicas) > 0 { + require.NotEqual(t, -1, expectedOrdinal, "Section %d (hash %d): Failed to determine expected ordinal for replicas %v", i, s.hash, s.replicas) + } + } +} diff --git a/pkg/receive/hashring.go b/pkg/receive/hashring.go index d812aa23c61..19fd19262fb 100644 --- a/pkg/receive/hashring.go +++ b/pkg/receive/hashring.go @@ -13,7 +13,6 @@ import ( "sync" "github.com/cespare/xxhash" - "github.com/thanos-io/thanos/pkg/strutil" "golang.org/x/exp/slices" "github.com/go-kit/log" @@ -22,6 +21,7 @@ import ( "github.com/pkg/errors" "github.com/thanos-io/thanos/pkg/store/labelpb" + "github.com/thanos-io/thanos/pkg/store/storepb/prompb" ) @@ -178,143 +178,6 @@ func newKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFac }, nil } -// groupByAZ groups endpoints by Availability Zone and sorts them by their inferred ordinal in a k8s statefulset. -// It returns a 2D slice where each inner slice represents an AZ (sorted alphabetically) -// and contains endpoints sorted by ordinal. All inner slices are truncated to the -// length of the largest common sequence of ordinals starting from 0 across all AZs. -func groupByAZ(endpoints []Endpoint) ([][]Endpoint, error) { - if len(endpoints) == 0 { - return nil, errors.New("no endpoints provided") - } - - // Group endpoints by AZ and then by ordinal. - azEndpoints := make(map[string]map[int]Endpoint) - for _, ep := range endpoints { - ordinal, err := strutil.ExtractPodOrdinal(ep.Address) - if err != nil { - return nil, errors.Wrapf(err, "failed to extract ordinal from address %s", ep.Address) - } - if _, ok := azEndpoints[ep.AZ]; !ok { - azEndpoints[ep.AZ] = make(map[int]Endpoint) - } - if _, exists := azEndpoints[ep.AZ][ordinal]; exists { - return nil, fmt.Errorf("duplicate endpoint ordinal %d for address %s in AZ %s", ordinal, ep.Address, ep.AZ) - } - azEndpoints[ep.AZ][ordinal] = ep - } - - // Get sorted list of AZ names. - sortedAZs := make([]string, 0, len(azEndpoints)) - for az := range azEndpoints { - sortedAZs = append(sortedAZs, az) - } - sort.Strings(sortedAZs) - - // Determine the maximum common ordinal across all AZs. 
-	maxCommonOrdinal := -1
-	for i := 0; ; i++ {
-		presentInAllAZs := true
-		for _, az := range sortedAZs {
-			if _, ok := azEndpoints[az][i]; !ok {
-				presentInAllAZs = false
-				if i == 0 {
-					return nil, fmt.Errorf("AZ %q is missing endpoint with ordinal 0", az)
-				}
-				break
-			}
-		}
-		if !presentInAllAZs {
-			maxCommonOrdinal = i - 1
-			break
-		}
-	}
-	if maxCommonOrdinal < 0 {
-		return nil, errors.New("no common endpoints with ordinal 0 found across all AZs")
-	}
-	numAZs := len(sortedAZs)
-	result := make([][]Endpoint, numAZs)
-	for i, az := range sortedAZs {
-		result[i] = make([]Endpoint, 0, maxCommonOrdinal+1)
-		for j := 0; j <= maxCommonOrdinal; j++ {
-			result[i] = append(result[i], azEndpoints[az][j])
-		}
-	}
-	return result, nil
-}
-
-// newAlignedKetamaHashring creates a Ketama hash ring where replicas are strictly aligned across Availability Zones.
-// Each section on the hash ring corresponds to a primary endpoint (taken from the first AZ) and its
-// aligned replicas in other AZs (endpoints with the same ordinal). The hash for a section is calculated
-// based *only* on the primary endpoint's address.
-func newAlignedKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) {
-	if replicationFactor == 0 {
-		return nil, errors.New("replication factor cannot be zero")
-	}
-	if sectionsPerNode <= 0 {
-		return nil, errors.New("sections per node must be positive")
-	}
-	groupedEndpoints, err := groupByAZ(endpoints)
-	if err != nil {
-		return nil, errors.Wrap(err, "failed to group endpoints by AZ")
-	}
-	numAZs := len(groupedEndpoints)
-	if numAZs == 0 {
-		return nil, errors.New("no endpoint groups found after grouping by AZ")
-	}
-	if uint64(numAZs) != replicationFactor {
-		return nil, fmt.Errorf("number of AZs (%d) must equal replication factor (%d)", numAZs, replicationFactor)
-	}
-	numEndpointsPerAZ := len(groupedEndpoints[0])
-	if numEndpointsPerAZ == 0 {
-		return nil, errors.New("AZ groups are empty after grouping")
-	}
-	totalEndpoints := numAZs * numEndpointsPerAZ
-	flatEndpoints := make([]Endpoint, 0, totalEndpoints)
-	for azIndex := 0; azIndex < numAZs; azIndex++ {
-		flatEndpoints = append(flatEndpoints, groupedEndpoints[azIndex]...)
-	}
-	hasher := xxhash.New()
-	ringSections := make(sections, 0, numEndpointsPerAZ*sectionsPerNode)
-
-	// Iterate through primary endpoints (those in the first AZ) to define sections.
-	for primaryOrdinalIndex := 0; primaryOrdinalIndex < numEndpointsPerAZ; primaryOrdinalIndex++ {
-		primaryEndpoint := groupedEndpoints[0][primaryOrdinalIndex]
-		for sectionIndex := 1; sectionIndex <= sectionsPerNode; sectionIndex++ {
-			hasher.Reset()
-			_, _ = hasher.Write([]byte(primaryEndpoint.Address + ":" + strconv.Itoa(sectionIndex)))
-			sectionHash := hasher.Sum64()
-			sec := &section{
-				hash:          sectionHash,
-				az:            primaryEndpoint.AZ,
-				endpointIndex: uint64(primaryOrdinalIndex),
-				replicas:      make([]uint64, 0, replicationFactor),
-			}
-
-			// Find indices of all replicas (including primary) in the flat list and verify alignment.
- for azIndex := 0; azIndex < numAZs; azIndex++ { - replicaFlatIndex := azIndex*numEndpointsPerAZ + primaryOrdinalIndex - replicaEndpoint := flatEndpoints[replicaFlatIndex] - replicaOrdinal, err := strutil.ExtractPodOrdinal(replicaEndpoint.Address) - if err != nil { - return nil, errors.Wrapf(err, "failed to extract ordinal from replica endpoint %s in AZ %s", replicaEndpoint.Address, replicaEndpoint.AZ) - } - if replicaOrdinal != primaryOrdinalIndex { - return nil, fmt.Errorf("ordinal mismatch for primary endpoint %s (ordinal %d): replica %s in AZ %s has ordinal %d", - primaryEndpoint.Address, primaryOrdinalIndex, replicaEndpoint.Address, replicaEndpoint.AZ, replicaOrdinal) - } - sec.replicas = append(sec.replicas, uint64(replicaFlatIndex)) - } - ringSections = append(ringSections, sec) - } - } - sort.Sort(ringSections) - return &ketamaHashring{ - endpoints: flatEndpoints, - sections: ringSections, - numEndpoints: uint64(totalEndpoints), - }, nil -} - func (k *ketamaHashring) Nodes() []Endpoint { return k.endpoints } diff --git a/pkg/receive/hashring_test.go b/pkg/receive/hashring_test.go index 1a4f9ed2eb4..ef3b94e1941 100644 --- a/pkg/receive/hashring_test.go +++ b/pkg/receive/hashring_test.go @@ -6,14 +6,11 @@ package receive import ( "fmt" "math" - "sort" - "strconv" "strings" "testing" "github.com/efficientgo/core/testutil" "github.com/stretchr/testify/require" - "github.com/thanos-io/thanos/pkg/strutil" "github.com/prometheus/prometheus/model/labels" @@ -305,245 +302,6 @@ func TestKetamaHashringGet(t *testing.T) { } } -func TestAlignedKetamaHashringGet(t *testing.T) { - t.Parallel() - - ep0a := Endpoint{Address: podDNS("pod", 0), AZ: "zone-a"} - ep1a := Endpoint{Address: podDNS("pod", 1), AZ: "zone-a"} - ep0b := Endpoint{Address: podDNS("pod", 0), AZ: "zone-b"} - ep1b := Endpoint{Address: podDNS("pod", 1), AZ: "zone-b"} - ep0c := Endpoint{Address: podDNS("pod", 0), AZ: "zone-c"} - ep1c := Endpoint{Address: podDNS("pod", 1), AZ: "zone-c"} - invalidEp := Endpoint{Address: "invalid-address", AZ: "zone-a"} - duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0), AZ: "zone-a"} - - tsForReplicaTest := &prompb.TimeSeries{ - Labels: []labelpb.ZLabel{{Name: "test", Value: "replica-routing"}}, - } - - testCases := map[string]struct { - inputEndpoints []Endpoint - replicationFactor uint64 - sectionsPerNode int - - tenant string - ts *prompb.TimeSeries - n uint64 - expectedEndpoint Endpoint - - expectConstructorError bool - constructorErrorContains string - expectGetNError bool - getNErrorContains string - }{ - "valid 2 AZs, RF=2, get replica 0": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, - replicationFactor: 2, - sectionsPerNode: SectionsPerNode, - tenant: "tenant1", - ts: tsForReplicaTest, - n: 0, - expectedEndpoint: ep0a, - expectConstructorError: false, - expectGetNError: false, - }, - "valid 2 AZs, RF=2, get replica 1": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, - replicationFactor: 2, - sectionsPerNode: SectionsPerNode, - tenant: "tenant1", - ts: tsForReplicaTest, - n: 1, - expectedEndpoint: ep0b, - expectConstructorError: false, - expectGetNError: false, - }, - "valid 3 AZs, RF=3, get replica 0": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, - replicationFactor: 3, - sectionsPerNode: SectionsPerNode, - tenant: "tenant1", - ts: tsForReplicaTest, - n: 0, - expectedEndpoint: ep0a, - expectConstructorError: false, - expectGetNError: false, - }, - "valid 3 AZs, RF=3, get replica 1": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, 
ep1c}, - replicationFactor: 3, - sectionsPerNode: SectionsPerNode, - tenant: "tenant1", - ts: tsForReplicaTest, - n: 1, - expectedEndpoint: ep0b, - expectConstructorError: false, - expectGetNError: false, - }, - "valid 3 AZs, RF=3, get replica 2": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, - replicationFactor: 3, - sectionsPerNode: SectionsPerNode, - tenant: "tenant1", - ts: tsForReplicaTest, - n: 2, - expectedEndpoint: ep0c, - expectConstructorError: false, - expectGetNError: false, - }, - "error: empty input": { - inputEndpoints: []Endpoint{}, - replicationFactor: 1, - sectionsPerNode: SectionsPerNode, - expectConstructorError: true, - constructorErrorContains: "no endpoints provided", - }, - "error: invalid address": { - inputEndpoints: []Endpoint{ep0a, invalidEp, ep0b}, - replicationFactor: 2, - sectionsPerNode: SectionsPerNode, - expectConstructorError: true, - constructorErrorContains: "failed to extract ordinal from address invalid-address", - }, - "error: duplicate ordinal": { - inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, - replicationFactor: 2, - sectionsPerNode: SectionsPerNode, - expectConstructorError: true, - constructorErrorContains: "duplicate endpoint", - }, - "error: missing ordinal 0": { - inputEndpoints: []Endpoint{ep1a, ep1b}, - replicationFactor: 2, - sectionsPerNode: SectionsPerNode, - expectConstructorError: true, - constructorErrorContains: "failed to group endpoints by AZ: AZ \"zone-a\" is missing endpoint with ordinal 0", - }, - "error: AZ count != RF (too few AZs)": { - inputEndpoints: []Endpoint{ep0a, ep1a}, - replicationFactor: 2, - sectionsPerNode: SectionsPerNode, - expectConstructorError: true, - constructorErrorContains: "number of AZs (1) must equal replication factor (2)", - }, - "error: AZ count != RF (too many AZs)": { - inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, ep1b, ep0c, ep1c}, - replicationFactor: 2, - sectionsPerNode: SectionsPerNode, - expectConstructorError: true, - constructorErrorContains: "number of AZs (3) must equal replication factor (2)", - }, - "constructor success with unbalanced AZs (uses common subset)": { - inputEndpoints: []Endpoint{ep0a, ep1a, ep0b}, - replicationFactor: 2, - sectionsPerNode: SectionsPerNode, - expectConstructorError: false, - }, - "error: GetN index out of bounds (n >= numEndpoints)": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, - replicationFactor: 2, - sectionsPerNode: SectionsPerNode, - tenant: "tenant1", - ts: tsForReplicaTest, - n: 4, - expectConstructorError: false, - expectGetNError: true, - getNErrorContains: "insufficient nodes; have 4, want 5", - }, - } - - for tcName, tc := range testCases { - t.Run(tcName, func(t *testing.T) { - hashRing, err := newAlignedKetamaHashring(tc.inputEndpoints, tc.sectionsPerNode, tc.replicationFactor) - - if tc.expectConstructorError { - require.Error(t, err, "Expected constructor error") - require.Nil(t, hashRing, "Hashring should be nil on constructor error") - if tc.constructorErrorContains != "" { - require.Contains(t, err.Error(), tc.constructorErrorContains, "Constructor error message mismatch") - } - return - } - - require.NoError(t, err, "Expected constructor to succeed") - require.NotNil(t, hashRing, "Hashring should not be nil on successful construction") - - if tc.ts == nil && !tc.expectGetNError { - return - } - if tc.ts == nil && tc.expectGetNError { - tc.ts = &prompb.TimeSeries{Labels: []labelpb.ZLabel{{Name: "dummy", Value: "dummy"}}} - } - - result, getNErr := hashRing.GetN(tc.tenant, tc.ts, tc.n) - if 
tc.expectGetNError { - require.Error(t, getNErr, "Expected GetN error") - if tc.getNErrorContains != "" { - require.Contains(t, getNErr.Error(), tc.getNErrorContains, "GetN error message mismatch") - } - } else { - require.NoError(t, getNErr, "Expected GetN to succeed") - testutil.Equals(t, tc.expectedEndpoint, result, "GetN returned unexpected endpoint") - } - }) - } -} - -func TestAlignedKetamaHashringReplicaOrdinals(t *testing.T) { - t.Parallel() - - endpoints := []Endpoint{ - {Address: podDNS("pod", 0), AZ: "zone-a"}, - {Address: podDNS("pod", 1), AZ: "zone-a"}, - {Address: podDNS("pod", 2), AZ: "zone-a"}, - {Address: podDNS("pod", 0), AZ: "zone-b"}, - {Address: podDNS("pod", 1), AZ: "zone-b"}, - {Address: podDNS("pod", 2), AZ: "zone-b"}, - {Address: podDNS("pod", 0), AZ: "zone-c"}, - {Address: podDNS("pod", 1), AZ: "zone-c"}, - {Address: podDNS("pod", 2), AZ: "zone-c"}, - } - replicationFactor := uint64(3) - sectionsPerNode := 10 - - hashRing, err := newAlignedKetamaHashring(endpoints, sectionsPerNode, replicationFactor) - require.NoError(t, err, "Aligned hashring constructor failed") - require.NotNil(t, hashRing, "Hashring should not be nil") - require.NotEmpty(t, hashRing.sections, "Hashring should contain sections") - - // Verify that all replicas within a section have the same ordinal. - for i, s := range hashRing.sections { - if len(s.replicas) == 0 { - continue - } - - expectedOrdinal := -1 - - for replicaNum, replicaIndex := range s.replicas { - require.Less(t, int(replicaIndex), len(hashRing.endpoints), - "Section %d (hash %d), Replica %d: index %d out of bounds for endpoints list (len %d)", - i, s.hash, replicaNum, replicaIndex, len(hashRing.endpoints)) - - endpoint := hashRing.endpoints[replicaIndex] - ordinal, err := strutil.ExtractPodOrdinal(endpoint.Address) - require.NoError(t, err, - "Section %d (hash %d), Replica %d: failed to extract ordinal from address %s", - i, s.hash, replicaNum, endpoint.Address) - - if expectedOrdinal == -1 { - expectedOrdinal = ordinal - } else { - require.Equal(t, expectedOrdinal, ordinal, - "Section %d (hash %d), Replica %d (%s): Mismatched ordinal. Expected %d, got %d. Replicas in section: %v", - i, s.hash, replicaNum, endpoint.Address, expectedOrdinal, ordinal, s.replicas) - } - } - if len(s.replicas) > 0 { - require.NotEqual(t, -1, expectedOrdinal, "Section %d (hash %d): Failed to determine expected ordinal for replicas %v", i, s.hash, s.replicas) - } - } -} - func TestKetamaHashringBadConfigIsRejected(t *testing.T) { t.Parallel() @@ -913,129 +671,6 @@ func TestInvalidAZHashringCfg(t *testing.T) { } } -// podDNS creates a DNS-like string for testing endpoint addresses. -func podDNS(name string, ordinal int) string { - return name + "-" + strconv.Itoa(ordinal) + ".test-svc.test-namespace.svc.cluster.local" -} - -func TestGroupByAZ(t *testing.T) { - // Test setup endpoints. - ep0a := Endpoint{Address: podDNS("pod", 0), AZ: "zone-a"} - ep1a := Endpoint{Address: podDNS("pod", 1), AZ: "zone-a"} - ep2a := Endpoint{Address: podDNS("pod", 2), AZ: "zone-a"} - ep0b := Endpoint{Address: podDNS("pod", 0), AZ: "zone-b"} - ep1b := Endpoint{Address: podDNS("pod", 1), AZ: "zone-b"} - ep0c := Endpoint{Address: podDNS("pod", 0), AZ: "zone-c"} - ep1c := Endpoint{Address: podDNS("pod", 1), AZ: "zone-c"} - invalidEp := Endpoint{Address: "invalid-address-format", AZ: "zone-a"} - duplicateEp0a := Endpoint{Address: podDNS("anotherpod", 0), AZ: "zone-a"} // Same ordinal (0) as ep0a in zone-a. 
- - testCases := map[string]struct { - inputEndpoints []Endpoint - expectedResult [][]Endpoint - expectError bool - errorContains string - }{ - "error on empty input": { - inputEndpoints: []Endpoint{}, - expectedResult: nil, - expectError: true, - errorContains: "no endpoints provided", - }, - "single AZ, multiple endpoints": { - inputEndpoints: []Endpoint{ep1a, ep0a, ep2a}, - expectedResult: [][]Endpoint{ - {ep0a, ep1a, ep2a}, - }, - expectError: false, - }, - "multiple AZs, balanced and ordered": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b}, - expectedResult: [][]Endpoint{ - {ep0a, ep1a}, - {ep0b, ep1b}, - }, - expectError: false, - }, - "multiple AZs, different counts, stops at first missing ordinal > 0": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep2a, ep0c}, - expectedResult: [][]Endpoint{ - {ep0a}, - {ep0b}, - {ep0c}, - }, - expectError: false, - }, - "error if ordinal 0 missing in any AZ": { - inputEndpoints: []Endpoint{ep1a, ep2a, ep1b}, - expectedResult: nil, - expectError: true, - errorContains: "missing endpoint with ordinal 0", - }, - "error if ordinal 0 missing in only one AZ": { - inputEndpoints: []Endpoint{ep0a, ep1a, ep1b}, - expectedResult: nil, - expectError: true, - errorContains: `AZ "zone-b" is missing endpoint with ordinal 0`, - }, - "error on invalid address format": { - inputEndpoints: []Endpoint{ep0a, invalidEp, ep0b}, - expectedResult: nil, - expectError: true, - errorContains: "failed to extract ordinal from address invalid-address-format", - }, - "error on duplicate ordinal within an AZ": { - inputEndpoints: []Endpoint{ep0a, ep1a, ep0b, duplicateEp0a}, - expectedResult: nil, - expectError: true, - errorContains: "duplicate endpoint ordinal 0 for address " + duplicateEp0a.Address + " in AZ zone-a", - }, - "AZ sorting check": { - inputEndpoints: []Endpoint{ep0b, ep0c, ep0a}, - expectedResult: [][]Endpoint{ - {ep0a}, - {ep0b}, - {ep0c}, - }, - expectError: false, - }, - "multiple AZs, stops correctly when next ordinal missing everywhere": { - inputEndpoints: []Endpoint{ep1a, ep0b, ep0a, ep1b, ep0c, ep1c}, - expectedResult: [][]Endpoint{ - {ep0a, ep1a}, - {ep0b, ep1b}, - {ep0c, ep1c}, - }, - expectError: false, - }, - } - - for tcName, tc := range testCases { - t.Run(tcName, func(t *testing.T) { - result, err := groupByAZ(tc.inputEndpoints) - - if tc.expectError { - testutil.NotOk(t, err) - if tc.errorContains != "" { - testutil.Assert(t, strings.Contains(err.Error(), tc.errorContains), "Expected error message to contain '%s', but got: %v", tc.errorContains, err) - } - testutil.Assert(t, result == nil, "Expected nil result on error, got: %v", result) - } else { - testutil.Ok(t, err) - testutil.Equals(t, tc.expectedResult, result) - - // Verify outer slice (AZs) is sorted alphabetically. - if err == nil && len(result) > 1 { - azOrderCorrect := sort.SliceIsSorted(result, func(i, j int) bool { - return result[i][0].AZ < result[j][0].AZ - }) - testutil.Assert(t, azOrderCorrect, "Outer slice is not sorted by AZ") - } - } - }) - } -} - func makeSeries() []prompb.TimeSeries { numSeries := 10000 series := make([]prompb.TimeSeries, numSeries) From 4c2ae877b1370fb0c73ffbf01493857d28c0de5c Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Fri, 2 May 2025 17:04:20 -0700 Subject: [PATCH 37/40] Revert "update prometheus fork to include more upstream fixes" This reverts commit eafcb9db54c53a1f85e2b1ea841e6a43468d5ebd. 
--- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 6652c21447c..cd1f8d888fe 100644 --- a/go.mod +++ b/go.mod @@ -290,7 +290,7 @@ replace ( github.com/bradfitz/gomemcache => github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab // Use Prometheus fork to include some fixes - github.com/prometheus/prometheus => github.com/yuchen-db/prometheus v0.0.0-20250430210933-b16c6463ce79 + github.com/prometheus/prometheus => github.com/yuchen-db/prometheus v0.0.0-20250424212119-34c3b01ad3f9 // Pin kuberesolver/v5 to support new grpc version. Need to upgrade kuberesolver version on weaveworks/common. github.com/sercand/kuberesolver/v4 => github.com/sercand/kuberesolver/v5 v5.1.1 diff --git a/go.sum b/go.sum index 0d61422bdb3..688bdcd6574 100644 --- a/go.sum +++ b/go.sum @@ -2295,8 +2295,8 @@ github.com/weaveworks/promrus v1.2.0/go.mod h1:SaE82+OJ91yqjrE1rsvBWVzNZKcHYFtMU github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/xhit/go-str2duration v1.2.0/go.mod h1:3cPSlfZlUHVlneIVfePFWcJZsuwf+P1v2SRTV4cUmp4= -github.com/yuchen-db/prometheus v0.0.0-20250430210933-b16c6463ce79 h1:t7f383QN0aQKHijBMKbpOQBmrFwRqNA0mAzKa5zr74w= -github.com/yuchen-db/prometheus v0.0.0-20250430210933-b16c6463ce79/go.mod h1:GGS7QlWKCqCbcEzWsVahYIfQwiGhcExkarHyLJTsv6I= +github.com/yuchen-db/prometheus v0.0.0-20250424212119-34c3b01ad3f9 h1:hNgEopLMPL0Ol+MOif0rqebCYAtqiBeR54lmt+GyhsA= +github.com/yuchen-db/prometheus v0.0.0-20250424212119-34c3b01ad3f9/go.mod h1:GGS7QlWKCqCbcEzWsVahYIfQwiGhcExkarHyLJTsv6I= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= From 53266763c242a5266c5bb12d6395d3005ca77a4e Mon Sep 17 00:00:00 2001 From: Yi Jin Date: Mon, 12 May 2025 17:03:10 -0700 Subject: [PATCH 38/40] [ES-1458485] lower no compaction limit Signed-off-by: Yi Jin --- pkg/compact/overlapping.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/compact/overlapping.go b/pkg/compact/overlapping.go index b3cb6e232d5..acf2685e15f 100644 --- a/pkg/compact/overlapping.go +++ b/pkg/compact/overlapping.go @@ -23,7 +23,7 @@ const ( overlappingReason = "blocks-overlapping" symbolTableSizeExceedsError = "symbol table size exceeds" - symbolTableSizeLimit = 1024 * 1024 + symbolTableSizeLimit = 512 * 1024 // lower this limits ) type OverlappingCompactionLifecycleCallback struct { From b91ac6bfba9cfc3cef78d17aa899f55866c34f17 Mon Sep 17 00:00:00 2001 From: "HC Zhu (DB)" Date: Sun, 1 Jun 2025 08:04:47 -0700 Subject: [PATCH 39/40] Add a command flag for lazyRetrievalMaxBufferedResponses in receive --- cmd/thanos/receive.go | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index 0f15c86b2e1..bde6319fec0 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -414,9 +414,13 @@ func runReceive( return errors.Wrap(err, "setup gRPC server") } + if conf.lazyRetrievalMaxBufferedResponses <= 0 { + return errors.New("--receive.lazy-retrieval-max-buffered-responses must be > 0") + } options := []store.ProxyStoreOption{ store.WithProxyStoreDebugLogging(debugLogging), store.WithoutDedup(), + 
store.WithLazyRetrievalMaxBufferedResponsesForProxy(conf.lazyRetrievalMaxBufferedResponses), } if matcherConverter != nil { options = append(options, store.WithProxyStoreMatcherConverter(matcherConverter)) @@ -996,11 +1000,12 @@ type receiveConfig struct { asyncForwardWorkerCount uint - numTopMetricsPerTenant int - topMetricsMinimumCardinality uint64 - topMetricsUpdateInterval time.Duration - matcherConverterCacheCapacity int - maxPendingGrpcWriteRequests int + numTopMetricsPerTenant int + topMetricsMinimumCardinality uint64 + topMetricsUpdateInterval time.Duration + matcherConverterCacheCapacity int + maxPendingGrpcWriteRequests int + lazyRetrievalMaxBufferedResponses int featureList *[]string } @@ -1170,6 +1175,8 @@ func (rc *receiveConfig) registerFlag(cmd extkingpin.FlagClause) { cmd.Flag("receive.max-pending-grcp-write-requests", "Reject right away gRPC write requests when this number of requests are pending. Value 0 disables this feature."). Default("0").IntVar(&rc.maxPendingGrpcWriteRequests) rc.featureList = cmd.Flag("enable-feature", "Comma separated experimental feature names to enable. The current list of features is "+metricNamesFilter+".").Default("").Strings() + cmd.Flag("receive.lazy-retrieval-max-buffered-responses", "The lazy retrieval strategy can buffer up to this number of responses. This is to limit the memory usage. This flag takes effect only when the lazy retrieval strategy is enabled."). + Default("20").IntVar(&rc.lazyRetrievalMaxBufferedResponses) } // determineMode returns the ReceiverMode that this receiver is configured to run in. From 77bd1b8c23aeb7aaf693c3b551acb3a70db28f9f Mon Sep 17 00:00:00 2001 From: Yuchen Wang Date: Mon, 2 Jun 2025 17:49:44 -0700 Subject: [PATCH 40/40] fix dedup data loss --- pkg/dedup/quorum_iter.go | 95 +++++++-- pkg/dedup/quorum_iter_test.go | 370 +++++++++++++++++++++++++++++++++- 2 files changed, 445 insertions(+), 20 deletions(-) diff --git a/pkg/dedup/quorum_iter.go b/pkg/dedup/quorum_iter.go index fcef658c52d..c5d8dc30177 100644 --- a/pkg/dedup/quorum_iter.go +++ b/pkg/dedup/quorum_iter.go @@ -18,21 +18,33 @@ type quorumSeries struct { lset labels.Labels replicas []storage.Series - isCounter bool + disablePenalty bool + isCounter bool } func NewQuorumSeries(lset labels.Labels, replicas []storage.Series, f string) storage.Series { return &quorumSeries{ - lset: lset, - replicas: replicas, + lset: lset, + replicas: replicas, + disablePenalty: true, // Default to no penalty for receiver-only setups + isCounter: isCounter(f), + } +} - isCounter: isCounter(f), +// NewQuorumSeriesWithPenalty creates a quorum series with configurable penalty behavior +func NewQuorumSeriesWithPenalty(lset labels.Labels, replicas []storage.Series, f string, disablePenalty bool) storage.Series { + return &quorumSeries{ + lset: lset, + replicas: replicas, + disablePenalty: disablePenalty, + isCounter: isCounter(f), } } func (m *quorumSeries) Labels() labels.Labels { return m.lset } + func (m *quorumSeries) Iterator(_ chunkenc.Iterator) chunkenc.Iterator { iters := make([]adjustableSeriesIterator, 0, len(m.replicas)) oks := make([]bool, 0, len(m.replicas)) @@ -48,10 +60,11 @@ func (m *quorumSeries) Iterator(_ chunkenc.Iterator) chunkenc.Iterator { oks = append(oks, ok) } return &quorumSeriesIterator{ - iters: iters, - oks: oks, - lastT: math.MinInt64, - lastIter: nil, // behavior is undefined if At() is called before Next(), here we panic if it happens. 
+ iters: iters, + oks: oks, + lastT: math.MinInt64, + lastIter: nil, // behavior is undefined if At() is called before Next(), here we panic if it happens. + disablePenalty: m.disablePenalty, } } @@ -89,17 +102,22 @@ type quorumSeriesIterator struct { lastT int64 lastV float64 lastIter adjustableSeriesIterator + + disablePenalty bool } func (m *quorumSeriesIterator) Next() chunkenc.ValueType { - // m.lastIter points to the last iterator that has the latest timestamp. - // m.lastT always aligns with m.lastIter unless when m.lastIter is nil. - // m.lastIter is nil only in the following cases: - // 1. Next()/Seek() is never called. m.lastT is math.MinInt64 in this case. - // 2. The iterator runs out of values. m.lastT is the last timestamp in this case. + if m.disablePenalty { + return m.nextWithoutPenalty() + } + return m.nextWithPenalty() +} + +func (m *quorumSeriesIterator) nextWithPenalty() chunkenc.ValueType { + // Original penalty-based algorithm for backward compatibility minT := int64(math.MaxInt64) var lastIter adjustableSeriesIterator - quoramValue := NewQuorumValuePicker(0.0) + quorumValue := NewQuorumValuePicker(0.0) for i, it := range m.iters { if !m.oks[i] { continue @@ -114,9 +132,9 @@ func (m *quorumSeriesIterator) Next() chunkenc.ValueType { if t < minT { minT = t lastIter = it - quoramValue = NewQuorumValuePicker(v) + quorumValue = NewQuorumValuePicker(v) } else if t == minT { - if quoramValue.addValue(v) { + if quorumValue.addValue(v) { lastIter = it } } @@ -126,7 +144,48 @@ func (m *quorumSeriesIterator) Next() chunkenc.ValueType { if m.lastIter == nil { return chunkenc.ValNone } - m.lastV = quoramValue.currentValue + m.lastV = quorumValue.currentValue + m.lastT = minT + return chunkenc.ValFloat +} + +func (m *quorumSeriesIterator) nextWithoutPenalty() chunkenc.ValueType { + // Find minimum timestamp across all active iterators without applying penalties + minT := int64(math.MaxInt64) + var lastIter adjustableSeriesIterator + quorumValue := NewQuorumValuePicker(0.0) + + for i, it := range m.iters { + if !m.oks[i] { + continue + } + t, v := it.At() + if t <= m.lastT { + // Move to next value if current is not newer + m.oks[i] = it.Next() != chunkenc.ValNone + if m.oks[i] { + it.adjustAtValue(m.lastV) + t, v = it.At() + } else { + continue + } + } + if t < minT { + minT = t + lastIter = it + quorumValue = NewQuorumValuePicker(v) + } else if t == minT { + if quorumValue.addValue(v) { + lastIter = it + } + } + } + + m.lastIter = lastIter + if m.lastIter == nil { + return chunkenc.ValNone + } + m.lastV = quorumValue.currentValue m.lastT = minT return chunkenc.ValFloat } @@ -143,7 +202,7 @@ func (m *quorumSeriesIterator) Seek(t int64) chunkenc.ValueType { } func (m *quorumSeriesIterator) At() (t int64, v float64) { - return m.lastIter.At() + return m.lastT, m.lastV } func (m *quorumSeriesIterator) AtHistogram(h *histogram.Histogram) (int64, *histogram.Histogram) { diff --git a/pkg/dedup/quorum_iter_test.go b/pkg/dedup/quorum_iter_test.go index 1b77c00ce2e..9b59e461f00 100644 --- a/pkg/dedup/quorum_iter_test.go +++ b/pkg/dedup/quorum_iter_test.go @@ -11,6 +11,7 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb/chunkenc" + "github.com/prometheus/prometheus/util/annotations" ) func TestIteratorEdgeCases(t *testing.T) { @@ -391,15 +392,61 @@ func TestMergedSeriesIterator(t *testing.T) { }, }, }, + { + name: "All replicas have different non-overlapping timestamps", + input: []series{ + { + 
lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 20000, f: 2.0}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 30000, f: 3.0}}, + }, + }, + exp: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 20000, f: 2.0}, {t: 30000, f: 3.0}}, + }, + }, + }, } { t.Run(tcase.name, func(t *testing.T) { - // If it is a counter then pass a function which expects a counter. // If it is a counter then pass a function which expects a counter. f := "" if tcase.isCounter { f = "rate" } - dedupSet := NewSeriesSet(&mockedSeriesSet{series: tcase.input}, f, AlgorithmQuorum) + + // For tests that expect penalty behavior, use penalty-based algorithm + expectsPenalty := tcase.name == "ignore sampling interval too small" || + tcase.name == "Regression test against 2401" || + tcase.name == "Regression test with no counter adjustment" + + var dedupSet storage.SeriesSet + if expectsPenalty { + // Create custom series set that uses penalty-based quorum + mockSet := &mockedSeriesSet{series: tcase.input} + dedupSet = &penaltyQuorumSeriesSet{ + set: mockSet, + f: f, + ok: true, + } + // Initialize the first peek + if dedupSet.(*penaltyQuorumSeriesSet).set.Next() { + dedupSet.(*penaltyQuorumSeriesSet).peek = dedupSet.(*penaltyQuorumSeriesSet).set.At() + } else { + dedupSet.(*penaltyQuorumSeriesSet).ok = false + } + } else { + dedupSet = NewSeriesSet(&mockedSeriesSet{series: tcase.input}, f, AlgorithmQuorum) + } + var ats []storage.Series for dedupSet.Next() { ats = append(ats, dedupSet.At()) @@ -415,3 +462,322 @@ func TestMergedSeriesIterator(t *testing.T) { }) } } + +func TestQuorumDataLossScenarios(t *testing.T) { + cases := []struct { + name string + input []series + expected []series + }{ + { + name: "One replica has unique data at beginning should not be lost", + input: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 1000, f: 1.0}, {t: 15000, f: 2.0}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.5}, {t: 20000, f: 2.5}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 30000, f: 3.0}}, + }, + }, + expected: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 1000, f: 1.0}, {t: 10000, f: 1.5}, {t: 15000, f: 2.0}, {t: 20000, f: 2.5}, {t: 30000, f: 3.0}}, + }, + }, + }, + { + name: "One replica has unique data in middle gap should not be lost", + input: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 40000, f: 4.0}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 20000, f: 2.0}, {t: 30000, f: 3.0}, {t: 40000, f: 4.0}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 40000, f: 4.0}}, + }, + }, + expected: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 20000, f: 2.0}, {t: 30000, f: 3.0}, {t: 40000, f: 4.0}}, + }, + }, + }, + { + name: "Single replica with data when others are empty should not be lost", + input: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 20000, f: 2.0}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: 
[]sample{}, + }, + }, + expected: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 20000, f: 2.0}}, + }, + }, + }, + { + name: "All replicas have different non-overlapping timestamps", + input: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 20000, f: 2.0}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 30000, f: 3.0}}, + }, + }, + expected: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 20000, f: 2.0}, {t: 30000, f: 3.0}}, + }, + }, + }, + { + name: "Penalty logic should not cause data loss with small gaps", + input: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 12000, f: 1.2}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 11000, f: 1.1}, {t: 12000, f: 1.2}}, + }, + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 12000, f: 1.2}}, + }, + }, + expected: []series{ + { + lset: labels.FromStrings("metric", "test"), + samples: []sample{{t: 10000, f: 1.0}, {t: 11000, f: 1.1}, {t: 12000, f: 1.2}}, + }, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + dedupSet := NewSeriesSet(&mockedSeriesSet{series: tc.input}, "", AlgorithmQuorum) + var result []storage.Series + for dedupSet.Next() { + result = append(result, dedupSet.At()) + } + testutil.Ok(t, dedupSet.Err()) + testutil.Equals(t, len(tc.expected), len(result)) + + for i, s := range result { + testutil.Equals(t, tc.expected[i].lset, s.Labels()) + actual := expandSeries(t, s.Iterator(nil)) + testutil.Equals(t, tc.expected[i].samples, actual, "samples should match exactly - no data loss allowed") + } + }) + } +} + +func TestQuorumDataLossScenarios_WithPenalty(t *testing.T) { + cases := []struct { + name string + input []series + expected []series + }{ + { + name: "ignore sampling interval too small", + input: []series{ + { + lset: labels.Labels{{Name: "a", Value: "1"}}, + samples: []sample{ + {10000, 8.0}, + {20000, 9.0}, + {50001, 9 + 1.0}, + {60000, 9 + 2.0}, + {70000, 9 + 3.0}, + {80000, 9 + 4.0}, + {90000, 9 + 5.0}, + {100000, 9 + 6.0}, + }, + }, { + lset: labels.Labels{{Name: "a", Value: "1"}}, + samples: []sample{ + {10001, 8.0}, // Penalty 5000 will be added. + {45001, 8 + 1.0}, // Smaller timestamp, this will be chosen. CurrValue = 8.5 which is smaller than last chosen value. 
+ {55001, 8 + 2.0}, + {65001, 8 + 3.0}, + }, + }, + }, + expected: []series{ + { + lset: labels.Labels{{Name: "a", Value: "1"}}, + samples: []sample{{10000, 8}, {20000, 9}, {45001, 9}, {50001, 10}, {55001, 10}, {65001, 11}, {80000, 13}, {90000, 14}, {100000, 15}}, + }, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + // Use penalty-based algorithm + mockSet := &mockedSeriesSet{series: tc.input} + dedupSet := &penaltyQuorumSeriesSet{ + set: mockSet, + f: "", + ok: true, + } + // Initialize the first peek + if dedupSet.set.Next() { + dedupSet.peek = dedupSet.set.At() + } else { + dedupSet.ok = false + } + + var result []storage.Series + for dedupSet.Next() { + result = append(result, dedupSet.At()) + } + testutil.Ok(t, dedupSet.Err()) + testutil.Equals(t, len(tc.expected), len(result)) + + for i, s := range result { + testutil.Equals(t, tc.expected[i].lset, s.Labels()) + actual := expandSeries(t, s.Iterator(nil)) + testutil.Equals(t, tc.expected[i].samples, actual, "samples should match exactly - no data loss allowed") + } + }) + } +} + +func TestQuorumValuePicker(t *testing.T) { + tests := []struct { + name string + values []float64 + expected float64 + }{ + { + name: "simple majority", + values: []float64{1.0, 1.0, 2.0}, + expected: 1.0, + }, + { + name: "no clear majority", + values: []float64{1.0, 2.0, 3.0}, + expected: 3.0, // Last value wins when no majority + }, + { + name: "single value", + values: []float64{42.0}, + expected: 42.0, + }, + { + name: "all same values", + values: []float64{5.0, 5.0, 5.0}, + expected: 5.0, + }, + { + name: "alternating values", + values: []float64{1.0, 2.0, 1.0, 2.0, 1.0}, + expected: 1.0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if len(tt.values) == 0 { + return + } + picker := NewQuorumValuePicker(tt.values[0]) + for i := 1; i < len(tt.values); i++ { + picker.addValue(tt.values[i]) + } + testutil.Equals(t, tt.expected, picker.currentValue) + }) + } +} + +type penaltyQuorumSeriesSet struct { + set storage.SeriesSet + f string + + replicas []storage.Series + lset labels.Labels + peek storage.Series + ok bool +} + +func (s *penaltyQuorumSeriesSet) Next() bool { + if !s.ok { + return false + } + s.replicas = s.replicas[:0] + + // Set the label set we are currently gathering to the peek element. + s.lset = s.peek.Labels() + s.replicas = append(s.replicas[:0], s.peek) + + return s.next() +} + +func (s *penaltyQuorumSeriesSet) next() bool { + // Peek the next series to see whether it's a replica for the current series. + s.ok = s.set.Next() + if !s.ok { + // There's no next series, the current replicas are the last element. + return len(s.replicas) > 0 + } + s.peek = s.set.At() + nextLset := s.peek.Labels() + + // If the label set is equal to the current label set look for more replicas, otherwise a series is complete. + if !labels.Equal(s.lset, nextLset) { + return true + } + + s.replicas = append(s.replicas, s.peek) + return s.next() +} + +func (s *penaltyQuorumSeriesSet) At() storage.Series { + if len(s.replicas) == 1 { + return s.replicas[0] + } + // Use penalty-based quorum algorithm + return NewQuorumSeriesWithPenalty(s.lset, s.replicas, s.f, false) +} + +func (s *penaltyQuorumSeriesSet) Err() error { + return s.set.Err() +} + +func (s *penaltyQuorumSeriesSet) Warnings() annotations.Annotations { + return s.set.Warnings() +}
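Note on the dedup change in the last patch: NewQuorumSeries now defaults to the no-penalty path, where Next() always advances to the smallest timestamp strictly greater than the last emitted one across all live replica iterators, so samples present in only one replica (for example during a gap in the other replicas) are still emitted; the penalty-based path remains available via NewQuorumSeriesWithPenalty(..., false) for backward compatibility. The standalone sketch below mirrors that min-timestamp union merge on plain sample slices to make the ordering concrete. It is an illustration only, not the package's iterator API; the sample type and mergeWithoutPenalty helper are names introduced here, and the real iterator additionally uses the quorum value picker to choose among replicas that agree on a timestamp.

package main

import (
	"fmt"
	"math"
)

// sample is a minimal (timestamp, value) pair used only for this sketch.
type sample struct {
	t int64
	f float64
}

// mergeWithoutPenalty emits the union of timestamps across replicas in
// ascending order, always advancing to the smallest timestamp not yet
// emitted. No penalty is applied, so a sample present in a single replica
// is kept. Value selection at equal timestamps is simplified here: the
// first replica seen wins (the real iterator uses the quorum value picker).
func mergeWithoutPenalty(replicas [][]sample) []sample {
	idx := make([]int, len(replicas)) // per-replica cursor
	var out []sample
	var lastT int64 = math.MinInt64 // nothing emitted yet
	for {
		var (
			minT  int64 = math.MaxInt64
			minV  float64
			found bool
		)
		for i, r := range replicas {
			// Skip values at or before the last emitted timestamp.
			for idx[i] < len(r) && r[idx[i]].t <= lastT {
				idx[i]++
			}
			if idx[i] >= len(r) {
				continue // this replica is exhausted
			}
			if s := r[idx[i]]; s.t < minT {
				minT, minV, found = s.t, s.f, true
			}
		}
		if !found {
			return out
		}
		out = append(out, sample{t: minT, f: minV})
		lastT = minT
	}
}

func main() {
	// Replica b has samples at t=20000 and t=30000 that the other replicas
	// miss; the union merge keeps them instead of skipping ahead.
	a := []sample{{10000, 1.0}, {40000, 4.0}}
	b := []sample{{10000, 1.0}, {20000, 2.0}, {30000, 3.0}, {40000, 4.0}}
	c := []sample{{10000, 1.0}, {40000, 4.0}}
	fmt.Println(mergeWithoutPenalty([][]sample{a, b, c}))
}

Running this prints [{10000 1} {20000 2} {30000 3} {40000 4}], which matches the expected samples in the "One replica has unique data in middle gap should not be lost" test case above.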