Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions autorca_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from autorca_core.model.graph import Service, Dependency, IncidentNode
from autorca_core.reasoning.loop import run_rca, RCARunResult
from autorca_core.logging import configure_logging, get_logger
from autorca_core.config import ThresholdConfig

__all__ = [
"Event",
Expand All @@ -26,4 +27,5 @@
"RCARunResult",
"configure_logging",
"get_logger",
"ThresholdConfig",
]
126 changes: 126 additions & 0 deletions autorca_core/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""
Configuration module for AutoRCA-Core.

Provides configurable thresholds and settings for anomaly detection and RCA analysis.
"""

import os
from dataclasses import dataclass, field

Check failure on line 8 in autorca_core/config.py

View workflow job for this annotation

GitHub Actions / Code Quality Checks

Ruff (F401)

autorca_core/config.py:8:36: F401 `dataclasses.field` imported but unused
from typing import Optional

Check failure on line 9 in autorca_core/config.py

View workflow job for this annotation

GitHub Actions / Code Quality Checks

Ruff (F401)

autorca_core/config.py:9:20: F401 `typing.Optional` imported but unused


@dataclass
class ThresholdConfig:
    """
    Configurable thresholds for anomaly detection.

    These thresholds control when incidents are detected from observability data.
    Different environments have different baselines, so these can be tuned accordingly.

    Attributes:
        error_spike_count: Minimum number of errors to consider a spike.
        error_spike_window_seconds: Time window for error spike detection.
        latency_spike_ms: Latency threshold in milliseconds.
        latency_spike_count: Minimum number of high-latency samples.
        resource_exhaustion_percent: Resource usage percentage threshold.
        resource_exhaustion_count: Minimum number of high-usage samples.
        change_correlation_seconds: Time window for correlating changes with incidents.
    """

    # Error detection
    error_spike_count: int = 3
    error_spike_window_seconds: int = 300  # 5 minutes

    # Latency detection
    latency_spike_ms: float = 1000.0
    latency_spike_count: int = 2

    # Resource detection
    resource_exhaustion_percent: float = 90.0
    resource_exhaustion_count: int = 2

    # Correlation windows
    change_correlation_seconds: int = 600  # 10 minutes

    @classmethod
    def from_env(cls) -> "ThresholdConfig":
        """
        Load thresholds from environment variables.

        Unset variables fall back to the dataclass defaults. Note that
        `os.getenv` defaults are passed as strings (its documented contract)
        and converted uniformly afterwards.

        Environment variables:
            AUTORCA_ERROR_SPIKE_COUNT: Error spike count threshold
            AUTORCA_ERROR_SPIKE_WINDOW: Error spike window in seconds
            AUTORCA_LATENCY_SPIKE_MS: Latency spike threshold in ms
            AUTORCA_LATENCY_SPIKE_COUNT: Latency spike count threshold
            AUTORCA_RESOURCE_EXHAUSTION_PERCENT: Resource exhaustion percentage
            AUTORCA_RESOURCE_EXHAUSTION_COUNT: Resource exhaustion count threshold
            AUTORCA_CHANGE_CORRELATION_SECONDS: Change correlation window in seconds

        Returns:
            ThresholdConfig instance with values from environment

        Raises:
            ValueError: If an environment variable is set to a non-numeric value.
        """
        return cls(
            error_spike_count=int(os.getenv("AUTORCA_ERROR_SPIKE_COUNT", "3")),
            error_spike_window_seconds=int(os.getenv("AUTORCA_ERROR_SPIKE_WINDOW", "300")),
            latency_spike_ms=float(os.getenv("AUTORCA_LATENCY_SPIKE_MS", "1000.0")),
            latency_spike_count=int(os.getenv("AUTORCA_LATENCY_SPIKE_COUNT", "2")),
            resource_exhaustion_percent=float(os.getenv("AUTORCA_RESOURCE_EXHAUSTION_PERCENT", "90.0")),
            resource_exhaustion_count=int(os.getenv("AUTORCA_RESOURCE_EXHAUSTION_COUNT", "2")),
            change_correlation_seconds=int(os.getenv("AUTORCA_CHANGE_CORRELATION_SECONDS", "600")),
        )

    @classmethod
    def strict(cls) -> "ThresholdConfig":
        """
        Create a strict configuration (more sensitive to anomalies).

        Lower counts/thresholds trigger detection sooner; the wider change
        correlation window attributes more incidents to recent changes.

        Returns:
            ThresholdConfig with strict thresholds
        """
        return cls(
            error_spike_count=2,
            error_spike_window_seconds=180,  # 3 minutes
            latency_spike_ms=500.0,
            latency_spike_count=1,
            resource_exhaustion_percent=80.0,
            resource_exhaustion_count=1,
            change_correlation_seconds=900,  # 15 minutes
        )

    @classmethod
    def relaxed(cls) -> "ThresholdConfig":
        """
        Create a relaxed configuration (less sensitive to anomalies).

        Higher counts/thresholds require stronger signals before detection;
        the narrower correlation window links fewer incidents to changes.

        Returns:
            ThresholdConfig with relaxed thresholds
        """
        return cls(
            error_spike_count=5,
            error_spike_window_seconds=600,  # 10 minutes
            latency_spike_ms=2000.0,
            latency_spike_count=3,
            resource_exhaustion_percent=95.0,
            resource_exhaustion_count=3,
            change_correlation_seconds=300,  # 5 minutes
        )

    def to_dict(self) -> dict:
        """Serialize all threshold fields to a plain dictionary."""
        return {
            "error_spike_count": self.error_spike_count,
            "error_spike_window_seconds": self.error_spike_window_seconds,
            "latency_spike_ms": self.latency_spike_ms,
            "latency_spike_count": self.latency_spike_count,
            "resource_exhaustion_percent": self.resource_exhaustion_percent,
            "resource_exhaustion_count": self.resource_exhaustion_count,
            "change_correlation_seconds": self.change_correlation_seconds,
        }

32 changes: 18 additions & 14 deletions autorca_core/graph_engine/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
Builds a ServiceGraph from logs, metrics, traces, and config changes.
"""

from typing import List, Dict, Set, Tuple
from typing import List, Dict, Set, Tuple, Optional
from datetime import datetime, timedelta

Check failure on line 8 in autorca_core/graph_engine/builder.py

View workflow job for this annotation

GitHub Actions / Code Quality Checks

Ruff (F401)

autorca_core/graph_engine/builder.py:8:32: F401 `datetime.timedelta` imported but unused

Check failure on line 8 in autorca_core/graph_engine/builder.py

View workflow job for this annotation

GitHub Actions / Code Quality Checks

Ruff (F401)

autorca_core/graph_engine/builder.py:8:22: F401 `datetime.datetime` imported but unused
from collections import defaultdict

from autorca_core.model.events import LogEvent, MetricPoint, Span, ConfigChange
Expand All @@ -17,6 +17,7 @@
IncidentNode,
IncidentType,
)
from autorca_core.config import ThresholdConfig


class GraphBuilder:
Expand All @@ -29,9 +30,10 @@
- Incident nodes (detected anomalies/errors)
"""

def __init__(self):
def __init__(self, thresholds: Optional[ThresholdConfig] = None):
self.graph = ServiceGraph()
self._service_metadata: Dict[str, Dict] = defaultdict(dict)
self.thresholds = thresholds or ThresholdConfig()

def add_logs(self, logs: List[LogEvent]) -> None:
"""
Expand Down Expand Up @@ -154,17 +156,17 @@
for log in error_logs:
by_service[log.service].append(log)

# Detect spikes (simple threshold: 3+ errors)
# Detect spikes using configurable thresholds
for service, service_errors in by_service.items():
if len(service_errors) >= 3:
if len(service_errors) >= self.thresholds.error_spike_count:
# Sort by timestamp
service_errors.sort(key=lambda e: e.timestamp)
first_error = service_errors[0]
last_error = service_errors[-1]

# If errors span less than 5 minutes, it's a spike
# If errors span less than threshold window, it's a spike
time_span = (last_error.timestamp - first_error.timestamp).total_seconds()
if time_span <= 300: # 5 minutes
if time_span <= self.thresholds.error_spike_window_seconds:
evidence = [f"Error: {e.message}" for e in service_errors[:5]] # Show first 5
self.graph.add_incident(IncidentNode(
service=service,
Expand Down Expand Up @@ -193,11 +195,11 @@
# Sort by timestamp
metric_points.sort(key=lambda m: m.timestamp)

# Simple threshold-based detection
# Simple threshold-based detection using configurable thresholds
if 'latency' in metric_name.lower() or 'duration' in metric_name.lower():
# Detect latency spike (value > 1000ms)
high_latency = [m for m in metric_points if m.value > 1000]
if len(high_latency) >= 2:
# Detect latency spike using configured threshold
high_latency = [m for m in metric_points if m.value > self.thresholds.latency_spike_ms]
if len(high_latency) >= self.thresholds.latency_spike_count:
self.graph.add_incident(IncidentNode(
service=service,
incident_type=IncidentType.LATENCY_SPIKE,
Expand All @@ -208,9 +210,9 @@
))

elif 'cpu' in metric_name.lower() or 'memory' in metric_name.lower():
# Detect resource exhaustion (value > 90%)
high_usage = [m for m in metric_points if m.value > 90]
if len(high_usage) >= 2:
# Detect resource exhaustion using configured threshold
high_usage = [m for m in metric_points if m.value > self.thresholds.resource_exhaustion_percent]
if len(high_usage) >= self.thresholds.resource_exhaustion_count:
self.graph.add_incident(IncidentNode(
service=service,
incident_type=IncidentType.RESOURCE_EXHAUSTION,
Expand Down Expand Up @@ -247,6 +249,7 @@
metrics: List[MetricPoint] = None,
traces: List[Span] = None,
configs: List[ConfigChange] = None,
thresholds: Optional[ThresholdConfig] = None,
) -> ServiceGraph:
"""
Convenience function to build a ServiceGraph from observability data.
Expand All @@ -256,11 +259,12 @@
metrics: Metric data points
traces: Trace spans
configs: Config/deployment changes
thresholds: Optional threshold configuration for anomaly detection

Returns:
Constructed ServiceGraph
"""
builder = GraphBuilder()
builder = GraphBuilder(thresholds=thresholds)

if logs:
builder.add_logs(logs)
Expand Down
18 changes: 17 additions & 1 deletion autorca_core/reasoning/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from autorca_core.reasoning.rules import apply_rules, RootCauseCandidate
from autorca_core.reasoning.llm import LLMInterface, DummyLLM
from autorca_core.logging import get_logger
from autorca_core.config import ThresholdConfig

logger = get_logger(__name__)

Expand Down Expand Up @@ -75,6 +76,7 @@ def run_rca(
primary_symptom: str,
data_sources: DataSourcesConfig,
llm: Optional[LLMInterface] = None,
thresholds: Optional[ThresholdConfig] = None,
) -> RCARunResult:
"""
Run root cause analysis.
Expand All @@ -92,6 +94,7 @@ def run_rca(
primary_symptom: Description of the symptom (e.g., "Checkout API 500 errors")
data_sources: Configuration for data sources
llm: Optional LLM interface for enhanced analysis
thresholds: Optional threshold configuration for anomaly detection

Returns:
RCARunResult with root cause candidates and analysis
Expand Down Expand Up @@ -139,6 +142,7 @@ def run_rca(
logger.info(f" Loaded {len(configs)} config changes")

# Step 2: Build service graph
    logger.info("Building service graph...")
    graph = build_service_graph(logs=logs, metrics=metrics, traces=traces, configs=configs, thresholds=thresholds)
    logger.info(f"  Graph: {len(graph.services)} services, {len(graph.dependencies)} dependencies, {len(graph.incidents)} incidents")

    # Step 3: Run rule-based analysis
    logger.info("Applying RCA rules...")
    candidates = apply_rules(graph, thresholds=thresholds)
    logger.info(f"  Identified {len(candidates)} root cause candidates")

# Step 4: Generate summary using LLM
logger.info("Generating RCA summary...")
Expand Down Expand Up @@ -197,6 +211,7 @@ def run_rca_from_files(
configs_path: Optional[str] = None,
primary_symptom: str = "Unknown incident",
window_minutes: int = 60,
thresholds: Optional[ThresholdConfig] = None,
) -> RCARunResult:
"""
Convenience function to run RCA from file paths.
Expand All @@ -210,6 +225,7 @@ def run_rca_from_files(
configs_path: Path to configs file/directory
primary_symptom: Description of the symptom
window_minutes: Size of the analysis window in minutes
thresholds: Optional threshold configuration for anomaly detection

Returns:
RCARunResult
Expand Down Expand Up @@ -245,4 +261,4 @@ def run_rca_from_files(
configs_dir=configs_path,
)

return run_rca((time_from, time_to), primary_symptom, sources)
return run_rca((time_from, time_to), primary_symptom, sources, thresholds=thresholds)
17 changes: 11 additions & 6 deletions autorca_core/reasoning/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
Simple, deterministic rules for identifying root causes without requiring an LLM.
"""

from typing import List, Dict, Set
from typing import List, Dict, Set, Optional
from dataclasses import dataclass
from datetime import timedelta

from autorca_core.model.graph import ServiceGraph, IncidentNode, IncidentType
from autorca_core.graph_engine.queries import GraphQueries, CausalChain
from autorca_core.config import ThresholdConfig


@dataclass
Expand Down Expand Up @@ -44,7 +45,7 @@ def to_dict(self) -> Dict:
}


def apply_rules(graph: ServiceGraph) -> List[RootCauseCandidate]:
def apply_rules(graph: ServiceGraph, thresholds: Optional[ThresholdConfig] = None) -> List[RootCauseCandidate]:
"""
Apply rule-based heuristics to identify root cause candidates.

Expand All @@ -57,15 +58,19 @@ def apply_rules(graph: ServiceGraph) -> List[RootCauseCandidate]:

Args:
graph: ServiceGraph with incidents and dependencies
thresholds: Optional threshold configuration for correlation windows

Returns:
List of RootCauseCandidate objects, sorted by confidence (highest first)
"""
if thresholds is None:
thresholds = ThresholdConfig()

candidates: List[RootCauseCandidate] = []
queries = GraphQueries(graph)

# Rule 1: Recent deployments/config changes
candidates.extend(_rule_recent_changes(graph, queries))
candidates.extend(_rule_recent_changes(graph, queries, thresholds))

# Rule 2: Resource exhaustion
candidates.extend(_rule_resource_exhaustion(graph, queries))
Expand All @@ -85,7 +90,7 @@ def apply_rules(graph: ServiceGraph) -> List[RootCauseCandidate]:
return candidates


def _rule_recent_changes(graph: ServiceGraph, queries: GraphQueries) -> List[RootCauseCandidate]:
def _rule_recent_changes(graph: ServiceGraph, queries: GraphQueries, thresholds: ThresholdConfig) -> List[RootCauseCandidate]:
"""
Rule: Services with recent deployments or config changes are strong root cause candidates.
"""
Expand All @@ -111,11 +116,11 @@ def _rule_recent_changes(graph: ServiceGraph, queries: GraphQueries) -> List[Roo
if i.incident_type not in (IncidentType.DEPLOYMENT, IncidentType.CONFIG_CHANGE)
]

# Check if other incidents occurred shortly after the change
# Check if other incidents occurred shortly after the change using configurable threshold
for change in change_incidents:
nearby_incidents = [
i for i in other_incidents
if abs((i.timestamp - change.timestamp).total_seconds()) < 600 # Within 10 minutes
if abs((i.timestamp - change.timestamp).total_seconds()) < thresholds.change_correlation_seconds
]

if nearby_incidents:
Expand Down
Loading