elastic
diff --git a/.env b/.env
Lines changed: 2 additions & 0 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/.scannerwork/.sonar_lock b/.scannerwork/.sonar_lock
diff --git a/.scannerwork/report-task.txt b/.scannerwork/report-task.txt
Lines changed: 6 additions & 0 deletions
diff --git a/.serena/.gitignore b/.serena/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/.serena/project.yml b/.serena/project.yml
Lines changed: 67 additions & 0 deletions
diff --git a/01-introduction.md b/01-introduction.md
Lines changed: 198 additions & 0 deletions
@@ -0,0 +1,2 @@
+SONARQUBE_TOKEN=<redacted>
+SONARQUBE_URL="https://sonar.elastic.dev"
@@ -29,6 +29,7 @@ generated-resources/
 
 # python environment stuff
 **/env/*
+**/venv/*
 *.pyc
 
 # testing stuff
 
@@ -0,0 +1,6 @@
+projectKey=elastic_ml-cpp_271ade36-31fc-4c6b-966e-80245560ad14
+serverUrl=https://sonar.elastic.dev
+serverVersion=10.4.1.88267
+dashboardUrl=https://sonar.elastic.dev/dashboard?id=elastic_ml-cpp_271ade36-31fc-4c6b-966e-80245560ad14
+ceTaskId=8a6dc50c-b755-468d-8ab1-321c74008419
+ceTaskUrl=https://sonar.elastic.dev/api/ce/task?id=8a6dc50c-b755-468d-8ab1-321c74008419
@@ -0,0 +1 @@
+/cache
@@ -0,0 +1,67 @@
+# language of the project (csharp, python, rust, java, typescript, go, cpp, or ruby)
+#  * For C, use cpp
+#  * For JavaScript, use typescript
+# Special requirements:
+#  * csharp: Requires the presence of a .sln file in the project folder.
+language: cpp
+
+# whether to use the project's gitignore file to ignore files
+# Added on 2025-04-07
+ignore_all_files_in_gitignore: true
+# list of additional paths to ignore
+# same syntax as gitignore, so you can use * and **
+# Was previously called `ignored_dirs`; please update your config if you are still using that name.
+# Added (renamed) on 2025-04-07
+ignored_paths: []
+
+# whether the project is in read-only mode
+# If set to true, all editing tools will be disabled and attempts to use them will result in an error
+# Added on 2025-04-18
+read_only: false
+
+# list of tool names to exclude. We recommend not excluding any tools, see the readme for more details.
+# Below is the complete list of tools for convenience.
+# To make sure you have the latest list of tools, and to view their descriptions, 
+# execute `uv run scripts/print_tool_overview.py`.
+#
+#  * `activate_project`: Activates a project by name.
+#  * `check_onboarding_performed`: Checks whether project onboarding was already performed.
+#  * `create_text_file`: Creates/overwrites a file in the project directory.
+#  * `delete_lines`: Deletes a range of lines within a file.
+#  * `delete_memory`: Deletes a memory from Serena's project-specific memory store.
+#  * `execute_shell_command`: Executes a shell command.
+#  * `find_referencing_code_snippets`: Finds code snippets in which the symbol at the given location is referenced.
+#  * `find_referencing_symbols`: Finds symbols that reference the symbol at the given location (optionally filtered by type).
+#  * `find_symbol`: Performs a global (or local) search for symbols with/containing a given name/substring (optionally filtered by type).
+#  * `get_current_config`: Prints the current configuration of the agent, including the active and available projects, tools, contexts, and modes.
+#  * `get_symbols_overview`: Gets an overview of the top-level symbols defined in a given file.
+#  * `initial_instructions`: Gets the initial instructions for the current project.
+#     Should only be used in settings where the system prompt cannot be set,
+#     e.g. in clients you have no control over, like Claude Desktop.
+#  * `insert_after_symbol`: Inserts content after the end of the definition of a given symbol.
+#  * `insert_at_line`: Inserts content at a given line in a file.
+#  * `insert_before_symbol`: Inserts content before the beginning of the definition of a given symbol.
+#  * `list_dir`: Lists files and directories in the given directory (optionally with recursion).
+#  * `list_memories`: Lists memories in Serena's project-specific memory store.
+#  * `onboarding`: Performs onboarding (identifying the project structure and essential tasks, e.g. for testing or building).
+#  * `prepare_for_new_conversation`: Provides instructions for preparing for a new conversation (in order to continue with the necessary context).
+#  * `read_file`: Reads a file within the project directory.
+#  * `read_memory`: Reads the memory with the given name from Serena's project-specific memory store.
+#  * `remove_project`: Removes a project from the Serena configuration.
+#  * `replace_lines`: Replaces a range of lines within a file with new content.
+#  * `replace_symbol_body`: Replaces the full definition of a symbol.
+#  * `restart_language_server`: Restarts the language server, may be necessary when edits not through Serena happen.
+#  * `search_for_pattern`: Performs a search for a pattern in the project.
+#  * `summarize_changes`: Provides instructions for summarizing the changes made to the codebase.
+#  * `switch_modes`: Activates modes by providing a list of their names.
+#  * `think_about_collected_information`: Thinking tool for pondering the completeness of collected information.
+#  * `think_about_task_adherence`: Thinking tool for determining whether the agent is still on track with the current task.
+#  * `think_about_whether_you_are_done`: Thinking tool for determining whether the task is truly completed.
+#  * `write_memory`: Writes a named memory (for future reference) to Serena's project-specific memory store.
+excluded_tools: []
+
+# initial prompt for the project. It will always be given to the LLM upon activating the project
+# (unlike the memories, which are loaded on demand).
+initial_prompt: ""
+
+project_name: "ml-cpp"
@@ -0,0 +1,198 @@
+# ML-CPP: Elastic Machine Learning Core
+
+## Overview
+
+The ML-CPP repository contains the C++ core implementation of Elastic's Machine Learning capabilities, providing high-performance analytics for anomaly detection, data frame analytics, and PyTorch model inference within the Elastic Stack.
+
+## Purpose and Scope
+
+This codebase implements the computational engine for:
+
+- **Time Series Anomaly Detection**: Real-time detection of anomalies in time series data using statistical models
+- **Data Frame Analytics**: Supervised learning (classification/regression) and unsupervised learning (outlier detection) on structured data
+- **PyTorch Model Inference**: High-performance inference for custom PyTorch models
+- **Data Categorization**: Automatic categorization of log messages and text data
+
+## High-Level Architecture
+
+The system follows a layered architecture with clear separation of concerns:
+
+```mermaid
+graph TB
+    subgraph "Executables (bin/)"
+        B[controller] -->|spawns| A[autodetect]
+        B -->|spawns| C[data_frame_analyzer]
+        B -->|spawns| D[pytorch_inference]
+        B -->|spawns| E[categorize]
+        B -->|spawns| F[normalize]
+    end
+    
+    subgraph "API Layer (lib/api/)"
+        G[CAnomalyJob] --> H[CDataFrameAnalyzer]
+        I[CIoManager] --> J[CPersistenceManager]
+    end
+    
+    subgraph "Model Layer (lib/model/)"
+        K[CAnomalyDetector] --> L[CDataGatherer]
+        M[CModelFactory] --> N[CResourceMonitor]
+    end
+    
+    subgraph "Mathematics (lib/maths/)"
+        O[Time Series] --> P[Analytics]
+        Q[Common] --> R[Linear Algebra]
+    end
+    
+    subgraph "Core (lib/core/)"
+        S[CLogger] --> T[CDataFrame]
+        U[CMemoryUsage] --> V[CStatePersistInserter]
+    end
+    
+    A --> G
+    C --> H
+    G --> K
+    H --> M
+    K --> O
+    M --> Q
+    O --> S
+    Q --> S
+```
+
+## Key Design Principles
+
+### 1. Memory-Conscious Design
+- **Resource Monitoring**: Continuous tracking of memory usage with configurable limits
+- **Memory Circuit Breakers**: Automatic process termination when memory limits are exceeded
+- **Efficient Data Structures**: Specialized containers for time series and sparse data
+
+### 2. State Management
+- **Persistence**: Complete model state can be saved and restored
+- **Incremental Updates**: Models update incrementally as new data arrives
+- **Fault Tolerance**: Robust handling of state corruption and version mismatches
+
+### 3. Performance Optimization
+- **Parallel Processing**: Multi-threaded execution where beneficial
+- **SIMD Operations**: Vectorized mathematical operations
+- **Memory Pooling**: Efficient memory allocation patterns
+- **Caching**: Strategic caching of expensive computations
+
+### 4. Extensibility
+- **Plugin Architecture**: Modular design for different model types
+- **Factory Pattern**: Dynamic model creation based on configuration
+- **Interface-Based Design**: Clear abstractions for different components
+
+## Core Components
+
+### Executables (`bin/`)
+
+| Executable | Purpose | Key Features |
+|------------|---------|--------------|
+| `autodetect` | Time series anomaly detection | Real-time processing, multiple detector types |
+| `controller` | Process management | Spawns and manages other ML processes |
+| `data_frame_analyzer` | Supervised/unsupervised learning | Boosted trees, outlier detection |
+| `pytorch_inference` | PyTorch model inference | Custom model support, batch processing |
+| `categorize` | Text categorization | Tokenization, pattern matching |
+| `normalize` | Data normalization | Feature scaling, outlier handling |
+
+### Core Libraries
+
+#### `lib/core/` - Fundamental Utilities
+- **Logging**: Multi-level logging with named pipe support
+- **I/O Management**: Efficient data streaming and parsing
+- **Memory Management**: Usage tracking and circuit breakers
+- **State Persistence**: Serialization and restoration
+- **Concurrency**: Thread-safe operations and synchronization
+
+#### `lib/maths/` - Mathematical Foundation
+- **Common**: Statistical functions, linear algebra, probability distributions
+- **Time Series**: Seasonal decomposition, trend analysis, forecasting
+- **Analytics**: Boosted trees, clustering, feature importance
+
+#### `lib/model/` - Anomaly Detection Models
+- **Detectors**: Individual and population-based anomaly detection
+- **Data Gatherers**: Time series data collection and bucketing
+- **Model Factory**: Dynamic model creation and management
+- **Resource Monitoring**: Memory and CPU usage tracking
+
+#### `lib/api/` - High-Level API
+- **Job Management**: Configuration and lifecycle management
+- **Data Processing**: Input parsing and output formatting
+- **Persistence**: State management and restoration
+- **I/O Coordination**: Stream management and error handling
+
+## Data Flow Overview
+
+```mermaid
+sequenceDiagram
+    participant ES as Elasticsearch
+    participant C as Controller
+    participant A as Autodetect
+    participant M as Model
+    participant O as Output
+    
+    ES->>C: Start job
+    C->>A: Spawn process
+    A->>M: Initialize model
+    ES->>A: Stream data
+    A->>M: Process records
+    M->>M: Update model
+    M->>A: Generate results
+    A->>O: Write output
+    O->>ES: Return results
+    A->>A: Persist state
+```
+
+## Key Algorithms
+
+### Time Series Anomaly Detection
+- **Statistical Models**: Normal, Poisson, and Gamma distributions
+- **Seasonal Decomposition**: Automatic detection of seasonal patterns
+- **Change Point Detection**: Identification of regime changes
+- **Population Analysis**: Multi-dimensional anomaly detection
+
+### Data Frame Analytics
+- **Boosted Trees**: Gradient boosting for classification and regression
+- **Outlier Detection**: Distance-based and density-based methods
+- **Feature Engineering**: Automatic feature selection and encoding
+- **Cross-Validation**: Model validation and hyperparameter tuning
+
+### PyTorch Integration
+- **Model Loading**: TorchScript model deserialization
+- **Inference Pipeline**: Batch processing and result formatting
+- **Memory Management**: Efficient tensor operations
+- **Security**: Sandboxed execution environment
+
+## Performance Characteristics
+
+- **Memory Efficiency**: Sub-linear memory growth with data size
+- **CPU Optimization**: SIMD operations and parallel processing
+- **I/O Efficiency**: Streaming data processing with minimal buffering
+- **Scalability**: Horizontal scaling through process spawning
+
+## Development Philosophy
+
+The codebase emphasizes:
+
+1. **Correctness**: Extensive testing and validation
+2. **Performance**: Optimized for production workloads
+3. **Maintainability**: Clear interfaces and documentation
+4. **Reliability**: Robust error handling and recovery
+5. **Security**: Sandboxed execution and input validation
+
+## Getting Started
+
+For developers new to the codebase:
+
+1. **Start with Core**: Understand `lib/core/` utilities and abstractions
+2. **Explore Models**: Study `lib/model/` for anomaly detection concepts
+3. **Examine APIs**: Review `lib/api/` for high-level interfaces
+4. **Run Examples**: Use the executables with sample data
+5. **Read Tests**: Unit tests provide excellent usage examples
+
+## Next Steps
+
+- [Architecture Details](02-architecture.md) - Deep dive into system design
+- [Core Libraries](03-core-libraries.md) - Fundamental utilities and abstractions
+- [Mathematical Foundation](04-mathematics.md) - Algorithms and statistical methods
+- [Model Layer](05-model-layer.md) - Anomaly detection implementation
+
+
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+SONARQUBE_TOKEN=<redacted>`
	`2`	`+SONARQUBE_URL="https://sonar.elastic.dev"`