Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions autorca_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from autorca_core.model.graph import Service, Dependency, IncidentNode
from autorca_core.reasoning.loop import run_rca, RCARunResult
from autorca_core.logging import configure_logging, get_logger
from autorca_core.config import ThresholdConfig

__all__ = [
"Event",
Expand All @@ -26,4 +27,5 @@
"RCARunResult",
"configure_logging",
"get_logger",
"ThresholdConfig",
]
126 changes: 126 additions & 0 deletions autorca_core/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""
Configuration module for AutoRCA-Core.

Provides configurable thresholds and settings for anomaly detection and RCA analysis.
"""

import os
from dataclasses import dataclass, field

Check failure on line 8 in autorca_core/config.py

View workflow job for this annotation

GitHub Actions / Code Quality Checks

Ruff (F401)

autorca_core/config.py:8:36: F401 `dataclasses.field` imported but unused
from typing import Optional

Check failure on line 9 in autorca_core/config.py

View workflow job for this annotation

GitHub Actions / Code Quality Checks

Ruff (F401)

autorca_core/config.py:9:20: F401 `typing.Optional` imported but unused


@dataclass
class ThresholdConfig:
    """
    Configurable thresholds for anomaly detection.

    These thresholds control when incidents are detected from observability data.
    Different environments have different baselines, so these can be tuned accordingly.

    Attributes:
        error_spike_count: Minimum number of errors to consider a spike.
        error_spike_window_seconds: Time window for error spike detection.
        latency_spike_ms: Latency threshold in milliseconds.
        latency_spike_count: Minimum number of high-latency samples.
        resource_exhaustion_percent: Resource usage percentage threshold.
        resource_exhaustion_count: Minimum number of high-usage samples.
        change_correlation_seconds: Time window for correlating changes with incidents.
    """

    # Error detection
    error_spike_count: int = 3
    error_spike_window_seconds: int = 300  # 5 minutes

    # Latency detection
    latency_spike_ms: float = 1000.0
    latency_spike_count: int = 2

    # Resource detection
    resource_exhaustion_percent: float = 90.0
    resource_exhaustion_count: int = 2

    # Correlation windows
    change_correlation_seconds: int = 600  # 10 minutes

    @classmethod
    def from_env(cls) -> "ThresholdConfig":
        """
        Load thresholds from environment variables.

        Unset variables fall back to the dataclass defaults. Note that
        `os.getenv` defaults are passed as strings (its documented contract)
        and converted uniformly afterwards.

        Environment variables:
            AUTORCA_ERROR_SPIKE_COUNT: Error spike count threshold
            AUTORCA_ERROR_SPIKE_WINDOW: Error spike window in seconds
            AUTORCA_LATENCY_SPIKE_MS: Latency spike threshold in ms
            AUTORCA_LATENCY_SPIKE_COUNT: Latency spike count threshold
            AUTORCA_RESOURCE_EXHAUSTION_PERCENT: Resource exhaustion percentage
            AUTORCA_RESOURCE_EXHAUSTION_COUNT: Resource exhaustion count threshold
            AUTORCA_CHANGE_CORRELATION_SECONDS: Change correlation window in seconds

        Returns:
            ThresholdConfig instance with values from environment

        Raises:
            ValueError: If an environment variable is set to a non-numeric value.
        """
        return cls(
            error_spike_count=int(os.getenv("AUTORCA_ERROR_SPIKE_COUNT", "3")),
            error_spike_window_seconds=int(os.getenv("AUTORCA_ERROR_SPIKE_WINDOW", "300")),
            latency_spike_ms=float(os.getenv("AUTORCA_LATENCY_SPIKE_MS", "1000.0")),
            latency_spike_count=int(os.getenv("AUTORCA_LATENCY_SPIKE_COUNT", "2")),
            resource_exhaustion_percent=float(os.getenv("AUTORCA_RESOURCE_EXHAUSTION_PERCENT", "90.0")),
            resource_exhaustion_count=int(os.getenv("AUTORCA_RESOURCE_EXHAUSTION_COUNT", "2")),
            change_correlation_seconds=int(os.getenv("AUTORCA_CHANGE_CORRELATION_SECONDS", "600")),
        )

    @classmethod
    def strict(cls) -> "ThresholdConfig":
        """
        Create a strict configuration (more sensitive to anomalies).

        Lower counts/thresholds trigger detection sooner; the wider change
        correlation window attributes more incidents to recent changes.

        Returns:
            ThresholdConfig with strict thresholds
        """
        return cls(
            error_spike_count=2,
            error_spike_window_seconds=180,  # 3 minutes
            latency_spike_ms=500.0,
            latency_spike_count=1,
            resource_exhaustion_percent=80.0,
            resource_exhaustion_count=1,
            change_correlation_seconds=900,  # 15 minutes
        )

    @classmethod
    def relaxed(cls) -> "ThresholdConfig":
        """
        Create a relaxed configuration (less sensitive to anomalies).

        Higher counts/thresholds require stronger signals before detection;
        the narrower correlation window links fewer incidents to changes.

        Returns:
            ThresholdConfig with relaxed thresholds
        """
        return cls(
            error_spike_count=5,
            error_spike_window_seconds=600,  # 10 minutes
            latency_spike_ms=2000.0,
            latency_spike_count=3,
            resource_exhaustion_percent=95.0,
            resource_exhaustion_count=3,
            change_correlation_seconds=300,  # 5 minutes
        )

    def to_dict(self) -> dict:
        """Serialize all threshold fields to a plain dictionary."""
        return {
            "error_spike_count": self.error_spike_count,
            "error_spike_window_seconds": self.error_spike_window_seconds,
            "latency_spike_ms": self.latency_spike_ms,
            "latency_spike_count": self.latency_spike_count,
            "resource_exhaustion_percent": self.resource_exhaustion_percent,
            "resource_exhaustion_count": self.resource_exhaustion_count,
            "change_correlation_seconds": self.change_correlation_seconds,
        }

32 changes: 18 additions & 14 deletions autorca_core/graph_engine/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
Builds a ServiceGraph from logs, metrics, traces, and config changes.
"""

from typing import List, Dict, Set, Tuple
from typing import List, Dict, Set, Tuple, Optional
from datetime import datetime, timedelta

Check failure on line 8 in autorca_core/graph_engine/builder.py

View workflow job for this annotation

GitHub Actions / Code Quality Checks

Ruff (F401)

autorca_core/graph_engine/builder.py:8:32: F401 `datetime.timedelta` imported but unused

Check failure on line 8 in autorca_core/graph_engine/builder.py

View workflow job for this annotation

GitHub Actions / Code Quality Checks

Ruff (F401)

autorca_core/graph_engine/builder.py:8:22: F401 `datetime.datetime` imported but unused
from collections import defaultdict

from autorca_core.model.events import LogEvent, MetricPoint, Span, ConfigChange
Expand All @@ -17,6 +17,7 @@
IncidentNode,
IncidentType,
)
from autorca_core.config import ThresholdConfig


class GraphBuilder:
Expand All @@ -29,9 +30,10 @@
- Incident nodes (detected anomalies/errors)
"""

def __init__(self):
def __init__(self, thresholds: Optional[ThresholdConfig] = None):
self.graph = ServiceGraph()
self._service_metadata: Dict[str, Dict] = defaultdict(dict)
self.thresholds = thresholds or ThresholdConfig()

def add_logs(self, logs: List[LogEvent]) -> None:
"""
Expand Down Expand Up @@ -154,17 +156,17 @@
for log in error_logs:
by_service[log.service].append(log)

# Detect spikes (simple threshold: 3+ errors)
# Detect spikes using configurable thresholds
for service, service_errors in by_service.items():
if len(service_errors) >= 3:
if len(service_errors) >= self.thresholds.error_spike_count:
# Sort by timestamp
service_errors.sort(key=lambda e: e.timestamp)
first_error = service_errors[0]
last_error = service_errors[-1]

# If errors span less than 5 minutes, it's a spike
# If errors span less than threshold window, it's a spike
time_span = (last_error.timestamp - first_error.timestamp).total_seconds()
if time_span <= 300: # 5 minutes
if time_span <= self.thresholds.error_spike_window_seconds:
evidence = [f"Error: {e.message}" for e in service_errors[:5]] # Show first 5
self.graph.add_incident(IncidentNode(
service=service,
Expand Down Expand Up @@ -193,11 +195,11 @@
# Sort by timestamp
metric_points.sort(key=lambda m: m.timestamp)

# Simple threshold-based detection
# Simple threshold-based detection using configurable thresholds
if 'latency' in metric_name.lower() or 'duration' in metric_name.lower():
# Detect latency spike (value > 1000ms)
high_latency = [m for m in metric_points if m.value > 1000]
if len(high_latency) >= 2:
# Detect latency spike using configured threshold
high_latency = [m for m in metric_points if m.value > self.thresholds.latency_spike_ms]
if len(high_latency) >= self.thresholds.latency_spike_count:
self.graph.add_incident(IncidentNode(
service=service,
incident_type=IncidentType.LATENCY_SPIKE,
Expand All @@ -208,9 +210,9 @@
))

elif 'cpu' in metric_name.lower() or 'memory' in metric_name.lower():
# Detect resource exhaustion (value > 90%)
high_usage = [m for m in metric_points if m.value > 90]
if len(high_usage) >= 2:
# Detect resource exhaustion using configured threshold
high_usage = [m for m in metric_points if m.value > self.thresholds.resource_exhaustion_percent]
if len(high_usage) >= self.thresholds.resource_exhaustion_count:
self.graph.add_incident(IncidentNode(
service=service,
incident_type=IncidentType.RESOURCE_EXHAUSTION,
Expand Down Expand Up @@ -247,6 +249,7 @@
metrics: List[MetricPoint] = None,
traces: List[Span] = None,
configs: List[ConfigChange] = None,
thresholds: Optional[ThresholdConfig] = None,
) -> ServiceGraph:
"""
Convenience function to build a ServiceGraph from observability data.
Expand All @@ -256,11 +259,12 @@
metrics: Metric data points
traces: Trace spans
configs: Config/deployment changes
thresholds: Optional threshold configuration for anomaly detection

Returns:
Constructed ServiceGraph
"""
builder = GraphBuilder()
builder = GraphBuilder(thresholds=thresholds)

if logs:
builder.add_logs(logs)
Expand Down
18 changes: 17 additions & 1 deletion autorca_core/reasoning/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from autorca_core.reasoning.rules import apply_rules, RootCauseCandidate
from autorca_core.reasoning.llm import LLMInterface, DummyLLM
from autorca_core.logging import get_logger
from autorca_core.config import ThresholdConfig

logger = get_logger(__name__)

Expand Down Expand Up @@ -75,6 +76,7 @@ def run_rca(
primary_symptom: str,
data_sources: DataSourcesConfig,
llm: Optional[LLMInterface] = None,
thresholds: Optional[ThresholdConfig] = None,
) -> RCARunResult:
"""
Run root cause analysis.
Expand All @@ -92,6 +94,7 @@ def run_rca(
primary_symptom: Description of the symptom (e.g., "Checkout API 500 errors")
data_sources: Configuration for data sources
llm: Optional LLM interface for enhanced analysis
thresholds: Optional threshold configuration for anomaly detection

Returns:
RCARunResult with root cause candidates and analysis
Expand Down Expand Up @@ -139,6 +142,7 @@ def run_rca(
logger.info(f" Loaded {len(configs)} config changes")

# Step 2: Build service graph
    logger.info("Building service graph...")
    graph = build_service_graph(logs=logs, metrics=metrics, traces=traces, configs=configs, thresholds=thresholds)
    logger.info(f"  Graph: {len(graph.services)} services, {len(graph.dependencies)} dependencies, {len(graph.incidents)} incidents")

    # Step 3: Run rule-based analysis
    logger.info("Applying RCA rules...")
    candidates = apply_rules(graph, thresholds=thresholds)
    logger.info(f"  Identified {len(candidates)} root cause candidates")

# Step 4: Generate summary using LLM
logger.info("Generating RCA summary...")
Expand Down Expand Up @@ -197,6 +211,7 @@ def run_rca_from_files(
configs_path: Optional[str] = None,
primary_symptom: str = "Unknown incident",
window_minutes: int = 60,
thresholds: Optional[ThresholdConfig] = None,
) -> RCARunResult:
"""
Convenience function to run RCA from file paths.
Expand All @@ -210,6 +225,7 @@ def run_rca_from_files(
configs_path: Path to configs file/directory
primary_symptom: Description of the symptom
window_minutes: Size of the analysis window in minutes
thresholds: Optional threshold configuration for anomaly detection

Returns:
RCARunResult
Expand Down Expand Up @@ -245,4 +261,4 @@ def run_rca_from_files(
configs_dir=configs_path,
)

return run_rca((time_from, time_to), primary_symptom, sources)
return run_rca((time_from, time_to), primary_symptom, sources, thresholds=thresholds)
17 changes: 11 additions & 6 deletions autorca_core/reasoning/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
Simple, deterministic rules for identifying root causes without requiring an LLM.
"""

from typing import List, Dict, Set
from typing import List, Dict, Set, Optional
from dataclasses import dataclass
from datetime import timedelta

from autorca_core.model.graph import ServiceGraph, IncidentNode, IncidentType
from autorca_core.graph_engine.queries import GraphQueries, CausalChain
from autorca_core.config import ThresholdConfig


@dataclass
Expand Down Expand Up @@ -44,7 +45,7 @@ def to_dict(self) -> Dict:
}


def apply_rules(graph: ServiceGraph) -> List[RootCauseCandidate]:
def apply_rules(graph: ServiceGraph, thresholds: Optional[ThresholdConfig] = None) -> List[RootCauseCandidate]:
"""
Apply rule-based heuristics to identify root cause candidates.

Expand All @@ -57,15 +58,19 @@ def apply_rules(graph: ServiceGraph) -> List[RootCauseCandidate]:

Args:
graph: ServiceGraph with incidents and dependencies
thresholds: Optional threshold configuration for correlation windows

Returns:
List of RootCauseCandidate objects, sorted by confidence (highest first)
"""
if thresholds is None:
thresholds = ThresholdConfig()

candidates: List[RootCauseCandidate] = []
queries = GraphQueries(graph)

# Rule 1: Recent deployments/config changes
candidates.extend(_rule_recent_changes(graph, queries))
candidates.extend(_rule_recent_changes(graph, queries, thresholds))

# Rule 2: Resource exhaustion
candidates.extend(_rule_resource_exhaustion(graph, queries))
Expand All @@ -85,7 +90,7 @@ def apply_rules(graph: ServiceGraph) -> List[RootCauseCandidate]:
return candidates


def _rule_recent_changes(graph: ServiceGraph, queries: GraphQueries) -> List[RootCauseCandidate]:
def _rule_recent_changes(graph: ServiceGraph, queries: GraphQueries, thresholds: ThresholdConfig) -> List[RootCauseCandidate]:
"""
Rule: Services with recent deployments or config changes are strong root cause candidates.
"""
Expand All @@ -111,11 +116,11 @@ def _rule_recent_changes(graph: ServiceGraph, queries: GraphQueries) -> List[Roo
if i.incident_type not in (IncidentType.DEPLOYMENT, IncidentType.CONFIG_CHANGE)
]

# Check if other incidents occurred shortly after the change
# Check if other incidents occurred shortly after the change using configurable threshold
for change in change_incidents:
nearby_incidents = [
i for i in other_incidents
if abs((i.timestamp - change.timestamp).total_seconds()) < 600 # Within 10 minutes
if abs((i.timestamp - change.timestamp).total_seconds()) < thresholds.change_correlation_seconds
]

if nearby_incidents:
Expand Down
Loading