From 5222b68e980d38826ad8b43e9dc10a21a27bd953 Mon Sep 17 00:00:00 2001 From: Joel Verezhak Date: Fri, 4 Jul 2025 12:17:52 +0200 Subject: [PATCH 1/5] tasks --- TASKS.md | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 TASKS.md diff --git a/TASKS.md b/TASKS.md new file mode 100644 index 0000000000..42ec189315 --- /dev/null +++ b/TASKS.md @@ -0,0 +1,145 @@ +# Alertmanager Boot Timeout Implementation Plan + +## Overview +Implement a configurable boot timeout for Alertmanager in HA mode only. During the timeout period, the API server will be available for alert ingestion, but the readiness probe will return NOT READY until the timeout expires and the cluster settles. + +## Requirements +- ✅ **HA mode only**: Feature only applies when clustering is enabled (`--cluster.listen-address != ""`) +- ✅ **Zero impact on single replica**: No changes to single replica startup behavior +- ✅ **API available immediately**: API server accepts alerts during boot timeout +- ✅ **Readiness reflects boot state**: `/-/ready` returns 503 during boot timeout +- ✅ **Configurable timeout**: Default 5 minutes, configurable via flag +- ✅ **Comprehensive logging**: Clear progress indication for users +- ✅ **Minimal code changes**: Touch as little existing code as possible + +## Implementation Steps + +### Step 1: Add cluster boot timeout flag +**File**: `cmd/alertmanager/main.go` +- Add new flag: `cluster.boot-timeout` with 5m default +- Place with other cluster flags for proper namespacing +- Include clear help text indicating HA-only behavior + +### Step 2: Create boot manager +**File**: `cluster/boot.go` (new file) +- Create `BootManager` struct to handle boot timeout logic +- Implement methods: + - `NewBootManager(timeout, logger)` - constructor + - `Start()` - begin boot timeout period + - `IsReady()` - check if boot timeout has expired + - `WaitReady(ctx)` - block until boot timeout expires +- Include comprehensive logging with progress updates +- Use existing patterns from cluster package + +### Step 3: Modify startup sequence +**File**: `cmd/alertmanager/main.go` +- Create boot manager when clustering is enabled +- Start boot timeout before API server startup +- Move cluster join logic after boot timeout expires +- Preserve existing startup order for single replica mode + +### Step 4: Update readiness endpoint +**File**: `ui/web.go` +- Modify `/-/ready` endpoint to check boot state when clustering enabled +- Return 503 (Service Unavailable) during boot timeout +- Return current behavior after boot timeout + cluster ready +- No changes for single replica mode + +### Step 5: Enhance cluster status reporting +**Files**: `api/v2/api.go`, `api/v2/models/cluster_status.go` +- Add "booting" status to cluster status enum +- Update status reporting logic in API v2 +- Maintain backward compatibility with existing "ready", "settling", "disabled" +- Show "booting" during boot timeout period + +### Step 6: Integration and testing +- Verify single replica mode unchanged +- Test HA mode boot sequence +- Validate readiness probe behavior +- Check status API responses +- Confirm logging output + +## Technical Details + +### New Components +```go +// cluster/boot.go +type BootManager struct { + timeout time.Duration + startTime time.Time + readyc chan struct{} + logger *slog.Logger +} +``` + +### Modified Startup Sequence (HA mode only) +``` +1. Create cluster peer (existing) +2. Create boot manager (NEW) +3. Start boot timeout (NEW) +4. Start API server (existing, now immediate) +5. Wait for boot timeout (NEW) +6. Join cluster (existing, now delayed) +7. Wait for cluster settle (existing) +8. Set ready state (existing) +``` + +### Flag Addition +```go +clusterBootTimeout = kingpin.Flag("cluster.boot-timeout", + "Time to wait before joining the gossip cluster. During this period, "+ + "the API server accepts alerts but readiness probe returns NOT READY. "+ + "Only applies when clustering is enabled.").Default("5m").Duration() +``` + +### Readiness Logic +```go +// Single replica: always ready (current behavior) +// HA mode: ready only after boot timeout + cluster settled +func readyHandler(bootManager *cluster.BootManager, peer cluster.ClusterPeer) { + if peer == nil { + // Single replica - always ready + return 200 + } + if !bootManager.IsReady() { + // Still in boot timeout + return 503 + } + if !peer.Ready() { + // Cluster not settled + return 503 + } + return 200 +} +``` + +## Benefits +- **Controlled startup**: Prevents premature cluster participation +- **Alert availability**: API accepts alerts immediately +- **Clear observability**: Comprehensive logging and status reporting +- **Zero regression**: Single replica mode completely unchanged +- **Flexible configuration**: Adjustable timeout for different environments +- **Backward compatible**: Existing deployments continue working + +## Files to Modify +1. `cmd/alertmanager/main.go` - Add flag, modify startup sequence +2. `cluster/boot.go` - New boot manager implementation +3. `ui/web.go` - Update readiness endpoint +4. `api/v2/api.go` - Add boot status reporting +5. `api/v2/models/cluster_status.go` - Add "booting" status (if needed) + +## Testing Scenarios +1. **Single replica**: Verify no behavior changes +2. **HA with default timeout**: Test 5-minute boot delay +3. **HA with custom timeout**: Test different timeout values +4. **HA with zero timeout**: Test immediate cluster join (current behavior) +5. **API availability**: Confirm alerts accepted during boot timeout +6. **Readiness probe**: Verify 503 → 200 transition +7. **Status API**: Check "booting" → "settling" → "ready" progression + +## Risk Mitigation +- **Gradual rollout**: Feature disabled by default in single replica +- **Escape hatch**: Zero timeout maintains current behavior +- **Comprehensive logging**: Clear visibility into boot process +- **Minimal changes**: Reduces risk of introducing bugs +- **Backward compatibility**: Existing configurations work unchanged From 1ca7345fbbc6d99e46964a661e3ba252aeb12630 Mon Sep 17 00:00:00 2001 From: Joel Verezhak Date: Fri, 4 Jul 2025 12:20:44 +0200 Subject: [PATCH 2/5] step 1 and 2 --- cluster/boot.go | 105 +++++++++++++++++++++++++++++++++++++++ cmd/alertmanager/main.go | 1 + 2 files changed, 106 insertions(+) create mode 100644 cluster/boot.go diff --git a/cluster/boot.go b/cluster/boot.go new file mode 100644 index 0000000000..d7c0bd0027 --- /dev/null +++ b/cluster/boot.go @@ -0,0 +1,105 @@ +// Copyright 2025 Prometheus Team +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cluster + +import ( + "context" + "log/slog" + "sync" + "time" +) + +// BootManager manages the boot timeout for cluster joining. +// It provides a configurable delay before the alertmanager joins the gossip cluster, +// allowing the API server to be ready for alert ingestion while keeping the +// readiness probe in NOT READY state until the boot timeout expires. +type BootManager struct { + timeout time.Duration + startTime time.Time + readyc chan struct{} + logger *slog.Logger + once sync.Once +} + +// NewBootManager creates a new boot manager with the specified timeout. +func NewBootManager(timeout time.Duration, logger *slog.Logger) *BootManager { + return &BootManager{ + timeout: timeout, + readyc: make(chan struct{}), + logger: logger, + } +} + +// Start begins the boot timeout period. This should be called once during startup. +// It starts a goroutine that will close the ready channel after the timeout expires. +func (bm *BootManager) Start() { + bm.once.Do(func() { + bm.startTime = time.Now() + if bm.timeout <= 0 { + // Zero or negative timeout means immediate readiness + bm.logger.Info("Boot timeout disabled, proceeding immediately") + close(bm.readyc) + return + } + + bm.logger.Info("Starting boot timeout", "timeout", bm.timeout) + go bm.runBootTimeout() + }) +} + +// IsReady returns true if the boot timeout has expired. +func (bm *BootManager) IsReady() bool { + select { + case <-bm.readyc: + return true + default: + return false + } +} + +// WaitReady blocks until the boot timeout expires or the context is cancelled. +func (bm *BootManager) WaitReady(ctx context.Context) error { + select { + case <-ctx.Done(): + return ctx.Err() + case <-bm.readyc: + return nil + } +} + +// runBootTimeout runs the boot timeout timer and logs progress. +func (bm *BootManager) runBootTimeout() { + ticker := time.NewTicker(30 * time.Second) // Log progress every 30 seconds + defer ticker.Stop() + + deadline := bm.startTime.Add(bm.timeout) + + for { + select { + case <-time.After(time.Until(deadline)): + // Timeout expired + elapsed := time.Since(bm.startTime) + bm.logger.Info("Boot timeout completed, ready to join cluster", "elapsed", elapsed) + close(bm.readyc) + return + case <-ticker.C: + // Progress update + elapsed := time.Since(bm.startTime) + remaining := bm.timeout - elapsed + if remaining > 0 { + bm.logger.Info("Boot timeout in progress", "elapsed", elapsed, "remaining", remaining) + } + } + } +} diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index 87cdab8a09..dafaa9c076 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -176,6 +176,7 @@ func run() int { tlsConfigFile = kingpin.Flag("cluster.tls-config", "[EXPERIMENTAL] Path to config yaml file that can enable mutual TLS within the gossip protocol.").Default("").String() allowInsecureAdvertise = kingpin.Flag("cluster.allow-insecure-public-advertise-address-discovery", "[EXPERIMENTAL] Allow alertmanager to discover and listen on a public IP address.").Bool() label = kingpin.Flag("cluster.label", "The cluster label is an optional string to include on each packet and stream. It uniquely identifies the cluster and prevents cross-communication issues when sending gossip messages.").Default("").String() + clusterBootTimeout = kingpin.Flag("cluster.boot-timeout", "Time to wait before joining the gossip cluster. During this period, the API server accepts alerts but readiness probe returns NOT READY. Only applies when clustering is enabled.").Default("5m").Duration() featureFlags = kingpin.Flag("enable-feature", fmt.Sprintf("Comma-separated experimental features to enable. Valid options: %s", strings.Join(featurecontrol.AllowedFlags, ", "))).Default("").String() ) From c69e9c22de0678dad01c1c7cad6deb1b374ae57f Mon Sep 17 00:00:00 2001 From: Joel Verezhak Date: Fri, 4 Jul 2025 12:24:06 +0200 Subject: [PATCH 3/5] step 3 --- cmd/alertmanager/main.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index dafaa9c076..ea257bf888 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -239,6 +239,7 @@ func run() int { return 1 } var peer *cluster.Peer + var bootManager *cluster.BootManager if *clusterBindAddr != "" { peer, err = cluster.Create( logger.With("component", "cluster"), @@ -261,6 +262,10 @@ func run() int { return 1 } clusterEnabled.Set(1) + + // Create and start boot manager for HA mode + bootManager = cluster.NewBootManager(*clusterBootTimeout, logger.With("component", "boot")) + bootManager.Start() } stopc := make(chan struct{}) @@ -326,6 +331,16 @@ func run() int { // Peer state listeners have been registered, now we can join and get the initial state. if peer != nil { + // Wait for boot timeout before joining cluster in HA mode + if bootManager != nil { + logger.Info("Waiting for boot timeout before joining cluster") + ctx, cancel := context.WithTimeout(context.Background(), *clusterBootTimeout+10*time.Second) + if err := bootManager.WaitReady(ctx); err != nil { + logger.Warn("boot timeout interrupted", "err", err) + } + cancel() + } + err = peer.Join( *reconnectInterval, *peerReconnectTimeout, From 60e8a34556a17e8a959f123a5b3eb8553da8be8f Mon Sep 17 00:00:00 2001 From: Joel Verezhak Date: Fri, 4 Jul 2025 12:28:15 +0200 Subject: [PATCH 4/5] step 4 --- cluster/boot.go | 38 ++++++++++++++++++++++++++++++++++++++ cmd/alertmanager/main.go | 10 +++++++++- ui/web.go | 31 ++++++++++++++++++++++++++++--- 3 files changed, 75 insertions(+), 4 deletions(-) diff --git a/cluster/boot.go b/cluster/boot.go index d7c0bd0027..679e5ba87e 100644 --- a/cluster/boot.go +++ b/cluster/boot.go @@ -103,3 +103,41 @@ func (bm *BootManager) runBootTimeout() { } } } + +// CompositeReadinessChecker combines boot manager and cluster peer readiness. +type CompositeReadinessChecker struct { + bootManager *BootManager + peer interface{} // Can be *Peer or ClusterPeer +} + +// NewCompositeReadinessChecker creates a readiness checker that considers both boot timeout and cluster readiness. +func NewCompositeReadinessChecker(bootManager *BootManager, peer interface{}) *CompositeReadinessChecker { + return &CompositeReadinessChecker{ + bootManager: bootManager, + peer: peer, + } +} + +// IsReady returns true only if boot timeout has expired and cluster is ready (if clustering enabled). +func (c *CompositeReadinessChecker) IsReady() bool { + // If no boot manager, we're in single replica mode - always ready + if c.bootManager == nil { + return true + } + + // In HA mode, boot timeout must have expired first + if !c.bootManager.IsReady() { + return false + } + + // If clustering is enabled, cluster must also be ready + if c.peer != nil { + // Try to cast to *Peer to check readiness + if peer, ok := c.peer.(*Peer); ok { + return peer.Ready() + } + } + + // Boot timeout expired and no cluster - ready + return true +} diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index ea257bf888..684e689da0 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -562,7 +562,15 @@ func run() int { webReload := make(chan chan error) - ui.Register(router, webReload, logger) + // Create readiness checker for UI + if bootManager != nil { + // HA mode: use composite checker that considers boot timeout and cluster readiness + readinessChecker := cluster.NewCompositeReadinessChecker(bootManager, peer) + ui.RegisterWithReadiness(router, webReload, logger, readinessChecker) + } else { + // Single replica mode: use default register (always ready) + ui.Register(router, webReload, logger) + } reactapp.Register(router, logger) mux := api.Register(router, *routePrefix) diff --git a/ui/web.go b/ui/web.go index 3b81135237..7a33f8aa70 100644 --- a/ui/web.go +++ b/ui/web.go @@ -26,8 +26,24 @@ import ( "github.com/prometheus/alertmanager/asset" ) +// ReadinessChecker provides a way to check if the service is ready to serve traffic. +type ReadinessChecker interface { + // IsReady returns true if the service is ready to serve traffic. + IsReady() bool +} + +// defaultReadinessChecker always returns ready (for single replica mode). +type defaultReadinessChecker struct{} + +func (d defaultReadinessChecker) IsReady() bool { return true } + // Register registers handlers to serve files for the web interface. func Register(r *route.Router, reloadCh chan<- chan error, logger *slog.Logger) { + RegisterWithReadiness(r, reloadCh, logger, defaultReadinessChecker{}) +} + +// RegisterWithReadiness registers handlers with a custom readiness checker. +func RegisterWithReadiness(r *route.Router, reloadCh chan<- chan error, logger *slog.Logger, readinessChecker ReadinessChecker) { r.Get("/metrics", promhttp.Handler().ServeHTTP) r.Get("/", func(w http.ResponseWriter, req *http.Request) { @@ -80,11 +96,20 @@ func Register(r *route.Router, reloadCh chan<- chan error, logger *slog.Logger) w.WriteHeader(http.StatusOK) }) r.Get("/-/ready", func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusOK) - fmt.Fprintf(w, "OK") + if readinessChecker.IsReady() { + w.WriteHeader(http.StatusOK) + fmt.Fprintf(w, "OK") + } else { + w.WriteHeader(http.StatusServiceUnavailable) + fmt.Fprintf(w, "Service Unavailable") + } }) r.Head("/-/ready", func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusOK) + if readinessChecker.IsReady() { + w.WriteHeader(http.StatusOK) + } else { + w.WriteHeader(http.StatusServiceUnavailable) + } }) r.Get("/debug/*subpath", http.DefaultServeMux.ServeHTTP) From babea3d1a3c7b8e718ac53f7411a924046cfd24f Mon Sep 17 00:00:00 2001 From: Joel Verezhak Date: Fri, 4 Jul 2025 12:36:35 +0200 Subject: [PATCH 5/5] step 5 --- api/api.go | 9 +++++++++ api/v2/api.go | 18 +++++++++++++++++- cluster/boot.go | 32 +++++++++++++++++++++++++++++--- cmd/alertmanager/main.go | 6 ++++++ 4 files changed, 61 insertions(+), 4 deletions(-) diff --git a/api/api.go b/api/api.go index 6839d2d282..4db26fb2aa 100644 --- a/api/api.go +++ b/api/api.go @@ -46,6 +46,11 @@ type API struct { inFlightSem chan struct{} } +// StatusProvider provides enhanced cluster status information. +type StatusProvider interface { + Status() string +} + // Options for the creation of an API object. Alerts, Silences, AlertStatusFunc // and GroupMutedFunc are mandatory. The zero value for everything else is a safe // default. @@ -62,6 +67,9 @@ type Options struct { GroupMutedFunc func(routeID, groupKey string) ([]string, bool) // Peer from the gossip cluster. If nil, no clustering will be used. Peer cluster.ClusterPeer + // StatusProvider provides enhanced status information including boot state. + // If nil, standard peer status will be used. + StatusProvider StatusProvider // Timeout for all HTTP connections. The zero value (and negative // values) result in no timeout. Timeout time.Duration @@ -125,6 +133,7 @@ func New(opts Options) (*API, error) { opts.GroupMutedFunc, opts.Silences, opts.Peer, + opts.StatusProvider, l.With("version", "v2"), opts.Registry, ) diff --git a/api/v2/api.go b/api/v2/api.go index 0e29375d40..292efa1cad 100644 --- a/api/v2/api.go +++ b/api/v2/api.go @@ -52,9 +52,15 @@ import ( "github.com/prometheus/alertmanager/types" ) +// statusProvider provides enhanced cluster status information. +type statusProvider interface { + Status() string +} + // API represents an Alertmanager API v2. type API struct { peer cluster.ClusterPeer + statusProvider statusProvider silences *silence.Silences alerts provider.Alerts alertGroups groupsFn @@ -91,6 +97,7 @@ func NewAPI( gmf groupMutedFunc, silences *silence.Silences, peer cluster.ClusterPeer, + statusProvider statusProvider, l *slog.Logger, r prometheus.Registerer, ) (*API, error) { @@ -100,6 +107,7 @@ func NewAPI( alertGroups: gf, groupMutedFunc: gmf, peer: peer, + statusProvider: statusProvider, silences: silences, logger: l, m: metrics.NewAlerts(r), @@ -197,7 +205,15 @@ func (api *API) getStatusHandler(params general_ops.GetStatusParams) middleware. // If alertmanager cluster feature is disabled, then api.peers == nil. if api.peer != nil { - status := api.peer.Status() + var status string + + // Use enhanced status provider if available + if api.statusProvider != nil { + status = api.statusProvider.Status() + } else { + // Fall back to peer status + status = api.peer.Status() + } peers := []*open_api_models.PeerStatus{} for _, n := range api.peer.Peers() { diff --git a/cluster/boot.go b/cluster/boot.go index 679e5ba87e..46ae5b8521 100644 --- a/cluster/boot.go +++ b/cluster/boot.go @@ -124,12 +124,12 @@ func (c *CompositeReadinessChecker) IsReady() bool { if c.bootManager == nil { return true } - + // In HA mode, boot timeout must have expired first if !c.bootManager.IsReady() { return false } - + // If clustering is enabled, cluster must also be ready if c.peer != nil { // Try to cast to *Peer to check readiness @@ -137,7 +137,33 @@ func (c *CompositeReadinessChecker) IsReady() bool { return peer.Ready() } } - + // Boot timeout expired and no cluster - ready return true } + +// Status returns the current status string, considering both boot timeout and cluster state. +func (c *CompositeReadinessChecker) Status() string { + // If no boot manager, we're in single replica mode - disabled + if c.bootManager == nil { + return "disabled" + } + + // In HA mode, check boot timeout first + if !c.bootManager.IsReady() { + // Use "settling" during boot timeout to maintain API compatibility + // In the future, this could be "booting" with OpenAPI spec update + return "settling" + } + + // Boot timeout expired, check cluster state + if c.peer != nil { + // Try to cast to *Peer to get cluster status + if peer, ok := c.peer.(*Peer); ok { + return peer.Status() // "ready" or "settling" + } + } + + // Boot timeout expired and no cluster - ready + return "ready" +} diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index 684e689da0..d60b278793 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -378,8 +378,13 @@ func run() int { // Therefore we explicly pass an empty interface, to detect if the // cluster is not enabled in notify. var clusterPeer cluster.ClusterPeer + var statusProvider api.StatusProvider if peer != nil { clusterPeer = peer + if bootManager != nil { + // Use the same readiness checker as status provider for API + statusProvider = cluster.NewCompositeReadinessChecker(bootManager, peer) + } } api, err := api.New(api.Options{ @@ -388,6 +393,7 @@ func run() int { AlertStatusFunc: marker.Status, GroupMutedFunc: marker.Muted, Peer: clusterPeer, + StatusProvider: statusProvider, Timeout: *httpTimeout, Concurrency: *getConcurrency, Logger: logger.With("component", "api"),