Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion comp/core/autodiscovery/autodiscoveryimpl/autoconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,11 @@ func newAutoConfig(deps dependencies) autodiscovery.Component {

// createNewAutoConfig creates an AutoConfig instance (without starting).
func createNewAutoConfig(schedulerController *scheduler.Controller, secretResolver secrets.Component, wmeta option.Option[workloadmeta.Component], taggerComp tagger.Component, logs logComp.Component, telemetryComp telemetry.Component, filterStore workloadfilter.Component, hp option.Option[healthplatform.Component]) *AutoConfig {
cfgMgr := newReconcilingConfigManager(secretResolver)
var hpComp healthplatform.Component
Comment thread
mwdd146980 marked this conversation as resolved.
if h, ok := hp.Get(); ok {
hpComp = h
}
cfgMgr := newReconcilingConfigManager(secretResolver, hpComp)
ac := &AutoConfig{
configPollers: make([]*configPoller, 0, 9),
listenerCandidates: make(map[string]*listenerCandidate),
Expand Down
58 changes: 56 additions & 2 deletions comp/core/autodiscovery/autodiscoveryimpl/configmgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,15 @@ import (
"maps"
"sync"

healthplatformpayload "github.com/DataDog/agent-payload/v5/healthplatform"

"github.com/DataDog/datadog-agent/comp/core/autodiscovery/configresolver"
"github.com/DataDog/datadog-agent/comp/core/autodiscovery/integration"
"github.com/DataDog/datadog-agent/comp/core/autodiscovery/listeners"
"github.com/DataDog/datadog-agent/comp/core/autodiscovery/providers/names"
"github.com/DataDog/datadog-agent/comp/core/autodiscovery/providers/types"
secrets "github.com/DataDog/datadog-agent/comp/core/secrets/def"
healthplatformdef "github.com/DataDog/datadog-agent/comp/healthplatform/def"
checkid "github.com/DataDog/datadog-agent/pkg/collector/check/id"
"github.com/DataDog/datadog-agent/pkg/util/log"
)
Expand Down Expand Up @@ -105,12 +109,13 @@ type reconcilingConfigManager struct {
scheduledConfigs map[string]integration.Config

secretResolver secrets.Component
healthPlatform healthplatformdef.Component
}

var _ configManager = &reconcilingConfigManager{}

// newReconcilingConfigManager creates a new, empty reconcilingConfigManager.
func newReconcilingConfigManager(secretResolver secrets.Component) configManager {
func newReconcilingConfigManager(secretResolver secrets.Component, healthPlatform healthplatformdef.Component) configManager {
return &reconcilingConfigManager{
activeConfigs: map[string]integration.Config{},
activeServices: map[string]serviceAndADIDs{},
Expand All @@ -119,6 +124,7 @@ func newReconcilingConfigManager(secretResolver secrets.Component) configManager
serviceResolutions: map[string]map[string]string{},
scheduledConfigs: map[string]integration.Config{},
secretResolver: secretResolver,
healthPlatform: healthPlatform,
}
}

Expand Down Expand Up @@ -368,6 +374,10 @@ func (cm *reconcilingConfigManager) reconcileService(svcID string) integration.C
if _, found = expectedResolutions[templateDigest]; !found {
changes.UnscheduleConfig(cm.scheduledConfigs[resolvedDigest])
delete(existingResolutions, templateDigest)
// Clear any health issue for this template+service pair
if tpl, ok := cm.activeConfigs[templateDigest]; ok {
cm.clearTemplateResolutionFailureByID(tpl.Name, tpl.Digest(), svcID)
}
}
}

Expand Down Expand Up @@ -401,8 +411,9 @@ func (cm *reconcilingConfigManager) resolveTemplateForService(tpl integration.Co
config, err := configresolver.Resolve(tpl, svc)
if err != nil {
msg := fmt.Sprintf("error resolving template %s for service %s: %v", tpl.Name, svc.GetServiceID(), err)
log.Debug(msg)
log.Errorf("autodiscovery: skipping config - %s", msg)
errorStats.setResolveWarning(tpl.Name, msg)
cm.reportTemplateResolutionFailure(tpl, svc, err)
return tpl, false
}
resolvedConfig, err := decryptConfig(config, cm.secretResolver, digest)
Expand All @@ -412,9 +423,52 @@ func (cm *reconcilingConfigManager) resolveTemplateForService(tpl integration.Co
return config, false
}
errorStats.removeResolveWarnings(tpl.Name)
cm.clearTemplateResolutionFailure(tpl, svc)
Comment thread
mwdd146980 marked this conversation as resolved.
return resolvedConfig, true
}

// reportTemplateResolutionFailure reports a template resolution failure to the health platform.
func (cm *reconcilingConfigManager) reportTemplateResolutionFailure(tpl integration.Config, svc listeners.Service, err error) {
if cm.healthPlatform == nil {
return
}
Comment thread
mwdd146980 marked this conversation as resolved.
checkID := "ad-template:" + tpl.Name + ":" + svc.GetServiceID() + ":" + tpl.Digest()
report := &healthplatformpayload.IssueReport{
IssueId: healthplatformdef.ADMisconfigurationIssueID,
Context: map[string]string{
"entityName": tpl.Name + " (" + svc.GetServiceID() + ")",
"errorMessage": err.Error(),
"errorSource": string(types.TemplateResolutionSource),
},
}
if reportErr := cm.healthPlatform.ReportIssue(checkID, healthplatformdef.ADMisconfigurationCheckName, report); reportErr != nil {
log.Debugf("Failed to report template resolution issue: %v", reportErr)
}
}

// clearTemplateResolutionFailure clears a previously reported template
// resolution failure for the given template and service.
//
// It delegates to clearTemplateResolutionFailureByID so the health-issue key
// is assembled in a single place for both clearing paths.
func (cm *reconcilingConfigManager) clearTemplateResolutionFailure(tpl integration.Config, svc listeners.Service) {
	// Bail out early so the template digest is not computed when there is no
	// health platform to talk to.
	if cm.healthPlatform == nil {
		return
	}
	cm.clearTemplateResolutionFailureByID(tpl.Name, tpl.Digest(), svc.GetServiceID())
}

// clearTemplateResolutionFailureByID clears a health issue identified purely by
// strings: the template name, the template digest and the service ID.
// It exists for deletion paths where the service object may no longer be
// available.
//
// The call is a no-op when cm.healthPlatform is nil. Otherwise a nil report is
// submitted under the key
// "ad-template:<template name>:<service ID>:<template digest>"; any error
// from the health platform is only logged at debug level.
func (cm *reconcilingConfigManager) clearTemplateResolutionFailureByID(tplName, tplDigest, svcID string) {
	hp := cm.healthPlatform
	if hp == nil {
		return
	}

	const keyPrefix = "ad-template:"
	issueKey := keyPrefix + tplName + ":" + svcID + ":" + tplDigest

	clearErr := hp.ReportIssue(issueKey, healthplatformdef.ADMisconfigurationCheckName, nil)
	if clearErr == nil {
		return
	}
	log.Debugf("Failed to clear template resolution issue %s: %v", issueKey, clearErr)
}

// applyChanges applies the given changes to cm.scheduledConfigs
//
// This method must be called with cm.m locked.
Expand Down
80 changes: 79 additions & 1 deletion comp/core/autodiscovery/autodiscoveryimpl/configmgr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ import (
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"

healthplatformpayload "github.com/DataDog/agent-payload/v5/healthplatform"

"github.com/DataDog/datadog-agent/comp/core/autodiscovery/integration"
"github.com/DataDog/datadog-agent/comp/core/autodiscovery/listeners"
"github.com/DataDog/datadog-agent/comp/core/autodiscovery/providers/names"
healthplatformmock "github.com/DataDog/datadog-agent/comp/healthplatform/mock"
checkid "github.com/DataDog/datadog-agent/pkg/collector/check/id"
"github.com/DataDog/datadog-agent/pkg/util/testutil"
)
Expand Down Expand Up @@ -542,7 +545,82 @@ func TestReconcilingConfigManagement(t *testing.T) {
mockResolver := MockSecretResolver{}
suite.Run(t, &ReconcilingConfigManagerSuite{
ConfigManagerSuite{factory: func() configManager {
return newReconcilingConfigManager(&mockResolver)
return newReconcilingConfigManager(&mockResolver, nil)
}},
})
}

// dummyServiceWithExtraConfigError is a dummyService whose GetExtraConfig
// always returns an error. It embeds dummyService, so every other method is
// inherited unchanged; tests use it to force a template resolution failure.
type dummyServiceWithExtraConfigError struct {
	dummyService
}

// GetExtraConfig always fails: it returns an empty value together with an
// error stating that the requested key is not supported.
func (s *dummyServiceWithExtraConfigError) GetExtraConfig(key string) (string, error) {
	unsupported := fmt.Errorf("extra config %q is not supported", key)
	return "", unsupported
}

func TestResolveTemplateForService_ReportsToHealthPlatform(t *testing.T) {
mockResolver := MockSecretResolver{}
hp := healthplatformmock.Mock(t)

cm := newReconcilingConfigManager(&mockResolver, hp).(*reconcilingConfigManager)

tpl := integration.Config{
Name: "postgres",
ADIdentifiers: []string{"postgres"},
Instances: []integration.Data{integration.Data("host: %%host%%\ntags:\n - dbid:%%extra_dbinstanceidentifier%%")},
Provider: "file",
Source: "file:/etc/datadog-agent/conf.d/postgres.d/conf.yaml",
}

svc := &dummyServiceWithExtraConfigError{
dummyService: dummyService{
ID: "docker://abc123",
ADIdentifiers: []string{"postgres"},
Hosts: map[string]string{"main": "myhost"},
},
}

_, ok := cm.resolveTemplateForService(tpl, svc)
assert.False(t, ok, "resolveTemplateForService should return false on resolution failure")

count, issues := hp.GetAllIssues()
assert.Equal(t, 1, count, "expected 1 health issue to be reported")
expectedCheckID := "ad-template:postgres:docker://abc123:" + tpl.Digest()
issue := issues[expectedCheckID]
require.NotNil(t, issue, "expected health issue at checkID %s", expectedCheckID)
assert.Equal(t, "ad-misconfiguration", issue.Id)
}

// TestResolveTemplateForService_ClearsHealthPlatformOnSuccess verifies that a
// health issue already recorded for a template+service pair is removed once
// the template resolves successfully for that service.
func TestResolveTemplateForService_ClearsHealthPlatformOnSuccess(t *testing.T) {
	mockResolver := MockSecretResolver{}
	hp := healthplatformmock.Mock(t)

	cm := newReconcilingConfigManager(&mockResolver, hp).(*reconcilingConfigManager)

	tpl := integration.Config{
		Name:          "redis",
		ADIdentifiers: []string{"redis"},
		LogsConfig:    []byte("source: %%host%%"),
	}

	svc := &dummyService{
		ID:            "docker://def456",
		ADIdentifiers: []string{"redis"},
		Hosts:         map[string]string{"main": "myhost"},
	}

	// Pre-populate a health issue using the same checkID format the code uses.
	// The returned error is asserted rather than dropped so that a broken
	// setup fails here with a clear message instead of at the count check.
	checkID := "ad-template:redis:docker://def456:" + tpl.Digest()
	err := hp.ReportIssue(checkID, "redis", &healthplatformpayload.IssueReport{
		IssueId: "ad-misconfiguration",
		Context: map[string]string{"entityName": "redis"},
	})
	require.NoError(t, err, "pre-populating the health issue should succeed")
	count, _ := hp.GetAllIssues()
	require.Equal(t, 1, count)

	_, ok := cm.resolveTemplateForService(tpl, svc)
	assert.True(t, ok)

	count, _ = hp.GetAllIssues()
	assert.Equal(t, 0, count, "health issue should be cleared after successful resolution")
}
2 changes: 2 additions & 0 deletions comp/core/autodiscovery/providers/types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ const (
ContainerLabelSource ErrorSource = "container_label"
// PodAnnotationSource indicates errors from Kubernetes pod annotations (ad.datadoghq.com/*)
PodAnnotationSource ErrorSource = "pod_annotation"
// TemplateResolutionSource indicates errors from template variable resolution failures
TemplateResolutionSource ErrorSource = "template_resolution"
)

// ConfigProviderFactory is any function capable to create a ConfigProvider instance
Expand Down
21 changes: 18 additions & 3 deletions comp/healthplatform/impl/issues/admisconfig/issue.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,12 @@ const (
impactMsg = "Metrics, and logs may not be collected due to misconfigured autodiscovery settings"
)

// containerLabelSource matches the string value of types.ContainerLabelSource
// to avoid a cross-package import. The value is passed as a string in the issue context.
const containerLabelSource = "container_label"
// These constants match the string values of types.ErrorSource to avoid a
// cross-package import. The values are passed as strings in the issue context.
const (
containerLabelSource = "container_label"
templateResolutionSource = "template_resolution"
)

type issueContent struct {
title string
Expand Down Expand Up @@ -93,6 +96,18 @@ func (t *ADMisconfigurationIssue) BuildIssue(context map[string]string) (*health
func buildSourceSpecificContent(entityName, errorMessage, errorSource string) issueContent {
title := fmt.Sprintf("AD Misconfiguration on '%s'", entityName)
switch errorSource {
case templateResolutionSource:
return issueContent{
title: title,
description: "Autodiscovery template resolution error: " + errorMessage,
summary: "Verify that all template variables are supported by the autodiscovery listener for this service",
steps: []*healthplatform.RemediationStep{
{Order: 1, Text: "Check that all template variables (%%var%%) are supported by the listener type for this service"},
{Order: 2, Text: "Review the AD identifiers and ensure they match the correct listener (e.g., RDS vs Aurora have different supported variables)"},
{Order: 3, Text: "Run 'datadog-agent configcheck' to see all configuration resolution warnings"},
{Order: 4, Text: "See docs: https://docs.datadoghq.com/containers/guide/template_variables/"},
},
}
case containerLabelSource:
return issueContent{
title: title,
Expand Down
11 changes: 11 additions & 0 deletions comp/healthplatform/impl/issues/admisconfig/issue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,17 @@ func TestBuildIssue(t *testing.T) {
expectedDescSub: "container label error",
expectedStepCount: 3,
},
{
name: "template resolution error",
context: map[string]string{
"entityName": "postgres (docker://abc123)",
"errorMessage": "failed to get extra info for service docker://abc123, skipping config - extra config \"dbinstanceidentifier\" is not supported",
"errorSource": "template_resolution",
},
expectedTitle: "AD Misconfiguration on 'postgres (docker://abc123)'",
expectedDescSub: "template resolution error",
expectedStepCount: 4,
},
{
name: "empty context defaults to pod annotation remediation",
context: map[string]string{},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
enhancements:
- |
Autodiscovery template resolution failures are now logged at ERROR level
instead of DEBUG, making them visible without enabling debug logging.
Additionally, when the health platform is enabled, these failures are
reported as AD misconfiguration health events with actionable remediation
steps, providing proactive visibility when an autodiscovered check config
is silently skipped due to unsupported template variables.
Loading