From 4f64565c24cb9f143faa70b64e1b76f30530b40b Mon Sep 17 00:00:00 2001 From: Barak Amar Date: Thu, 17 Jul 2025 19:34:37 +0300 Subject: [PATCH 1/2] wip --- docs/mkdocs.yml | 39 +- .../_templates/api-reference-template.md | 207 ++ .../_templates/code-example-template.md | 134 + .../_templates/content-categorization.md | 307 ++ .../_templates/cross-reference-conventions.md | 260 ++ .../_templates/documentation-standards.md | 254 ++ .../python/_templates/tutorial-template.md | 226 ++ .../python/boto3/configuration.md | 497 +++ docs/src/integrations/python/boto3/index.md | 154 + .../python/boto3/s3-operations.md | 775 +++++ .../integrations/python/boto3/s3-router.md | 934 +++++ .../python/boto3/troubleshooting.md | 1178 +++++++ .../python/generated-sdk/api-reference.md | 1265 +++++++ .../python/generated-sdk/direct-access.md | 378 ++ .../python/generated-sdk/examples.md | 867 +++++ .../python/generated-sdk/index.md | 383 ++ .../integrations/python/getting-started.md | 511 +++ .../python/high-level-sdk/advanced.md | 1284 +++++++ .../high-level-sdk/branches-and-commits.md | 743 ++++ .../high-level-sdk/imports-and-exports.md | 746 ++++ .../python/high-level-sdk/index.md | 166 + .../python/high-level-sdk/objects-and-io.md | 709 ++++ .../python/high-level-sdk/quickstart.md | 371 ++ .../python/high-level-sdk/repositories.md | 587 ++++ .../python/high-level-sdk/transactions.md | 962 +++++ docs/src/integrations/python/index.md | 244 ++ .../python/lakefs-spec/filesystem-api.md | 346 ++ .../integrations/python/lakefs-spec/index.md | 398 +++ .../python/lakefs-spec/integrations.md | 478 +++ .../python/lakefs-spec/transactions.md | 491 +++ .../python/reference/api-comparison.md | 374 ++ .../python/reference/best-practices.md | 1033 ++++++ .../python/reference/changelog.md | 14 + .../integrations/python/reference/index.md | 83 + .../python/reference/troubleshooting.md | 1026 ++++++ .../python/tutorials/data-science-workflow.md | 14 + .../python/tutorials/etl-pipeline.md | 3097 +++++++++++++++++ .../integrations/python/tutorials/index.md | 70 + .../tutorials/ml-experiment-tracking.md | 3018 ++++++++++++++++ 39 files changed, 24622 insertions(+), 1 deletion(-) create mode 100644 docs/src/integrations/python/_templates/api-reference-template.md create mode 100644 docs/src/integrations/python/_templates/code-example-template.md create mode 100644 docs/src/integrations/python/_templates/content-categorization.md create mode 100644 docs/src/integrations/python/_templates/cross-reference-conventions.md create mode 100644 docs/src/integrations/python/_templates/documentation-standards.md create mode 100644 docs/src/integrations/python/_templates/tutorial-template.md create mode 100644 docs/src/integrations/python/boto3/configuration.md create mode 100644 docs/src/integrations/python/boto3/index.md create mode 100644 docs/src/integrations/python/boto3/s3-operations.md create mode 100644 docs/src/integrations/python/boto3/s3-router.md create mode 100644 docs/src/integrations/python/boto3/troubleshooting.md create mode 100644 docs/src/integrations/python/generated-sdk/api-reference.md create mode 100644 docs/src/integrations/python/generated-sdk/direct-access.md create mode 100644 docs/src/integrations/python/generated-sdk/examples.md create mode 100644 docs/src/integrations/python/generated-sdk/index.md create mode 100644 docs/src/integrations/python/getting-started.md create mode 100644 docs/src/integrations/python/high-level-sdk/advanced.md create mode 100644 
docs/src/integrations/python/high-level-sdk/branches-and-commits.md create mode 100644 docs/src/integrations/python/high-level-sdk/imports-and-exports.md create mode 100644 docs/src/integrations/python/high-level-sdk/index.md create mode 100644 docs/src/integrations/python/high-level-sdk/objects-and-io.md create mode 100644 docs/src/integrations/python/high-level-sdk/quickstart.md create mode 100644 docs/src/integrations/python/high-level-sdk/repositories.md create mode 100644 docs/src/integrations/python/high-level-sdk/transactions.md create mode 100644 docs/src/integrations/python/index.md create mode 100644 docs/src/integrations/python/lakefs-spec/filesystem-api.md create mode 100644 docs/src/integrations/python/lakefs-spec/index.md create mode 100644 docs/src/integrations/python/lakefs-spec/integrations.md create mode 100644 docs/src/integrations/python/lakefs-spec/transactions.md create mode 100644 docs/src/integrations/python/reference/api-comparison.md create mode 100644 docs/src/integrations/python/reference/best-practices.md create mode 100644 docs/src/integrations/python/reference/changelog.md create mode 100644 docs/src/integrations/python/reference/index.md create mode 100644 docs/src/integrations/python/reference/troubleshooting.md create mode 100644 docs/src/integrations/python/tutorials/data-science-workflow.md create mode 100644 docs/src/integrations/python/tutorials/etl-pipeline.md create mode 100644 docs/src/integrations/python/tutorials/index.md create mode 100644 docs/src/integrations/python/tutorials/ml-experiment-tracking.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index ee773e2e6b8..5418b87f8ff 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -214,7 +214,44 @@ nav: - Apache Airflow: integrations/airflow.md - Airbyte: integrations/airbyte.md - Dev & Tools: - - Python: integrations/python.md + - Python: + - Overview: integrations/python/index.md + - Getting Started: integrations/python/getting-started.md + - High-Level SDK: + - Overview: integrations/python/high-level-sdk/index.md + - Quickstart: integrations/python/high-level-sdk/quickstart.md + - Repositories: integrations/python/high-level-sdk/repositories.md + - Branches & Commits: integrations/python/high-level-sdk/branches-and-commits.md + - Objects & I/O: integrations/python/high-level-sdk/objects-and-io.md + - Imports & Exports: integrations/python/high-level-sdk/imports-and-exports.md + - Transactions: integrations/python/high-level-sdk/transactions.md + - Advanced Features: integrations/python/high-level-sdk/advanced.md + - Generated SDK: + - Overview: integrations/python/generated-sdk/index.md + - API Reference: integrations/python/generated-sdk/api-reference.md + - Examples: integrations/python/generated-sdk/examples.md + - Direct Access: integrations/python/generated-sdk/direct-access.md + - lakefs-spec: + - Overview: integrations/python/lakefs-spec/index.md + - Filesystem API: integrations/python/lakefs-spec/filesystem-api.md + - Integrations: integrations/python/lakefs-spec/integrations.md + - Transactions: integrations/python/lakefs-spec/transactions.md + - Boto3: + - Overview: integrations/python/boto3/index.md + - Configuration: integrations/python/boto3/configuration.md + - S3 Operations: integrations/python/boto3/s3-operations.md + - S3 Router: integrations/python/boto3/s3-router.md + - Tutorials: + - Overview: integrations/python/tutorials/index.md + - Data Science Workflow: integrations/python/tutorials/data-science-workflow.md + - ETL Pipeline: 
integrations/python/tutorials/etl-pipeline.md + - ML Experiment Tracking: integrations/python/tutorials/ml-experiment-tracking.md + - Reference: + - Overview: integrations/python/reference/index.md + - API Comparison: integrations/python/reference/api-comparison.md + - Best Practices: integrations/python/reference/best-practices.md + - Troubleshooting: integrations/python/reference/troubleshooting.md + - Changelog: integrations/python/reference/changelog.md - AWS CLI: integrations/aws_cli.md - Git: integrations/git.md - R: integrations/r.md diff --git a/docs/src/integrations/python/_templates/api-reference-template.md b/docs/src/integrations/python/_templates/api-reference-template.md new file mode 100644 index 00000000000..6a261b39488 --- /dev/null +++ b/docs/src/integrations/python/_templates/api-reference-template.md @@ -0,0 +1,207 @@ +# API Reference Template + +Use this template for consistent API documentation throughout the Python documentation. + +## Method Documentation Format + +```markdown +#### `method_name(param1, param2=None, **kwargs)` + +Brief description of the method and its purpose. Explain what it does and when to use it. + +**Parameters:** +- `param1` (str): Description of the parameter, including valid values or constraints +- `param2` (Optional[int], default=None): Description of optional parameter with default value +- `**kwargs`: Additional keyword arguments passed to underlying implementation + +**Returns:** +- `ReturnType`: Description of what the method returns, including type information + +**Raises:** +- `SpecificException`: When this specific exception occurs and why +- `ValueError`: When invalid parameters are provided +- `ConnectionError`: When connection to lakeFS fails + +**Example:** +```python +# Basic usage +result = obj.method_name("required_value") + +# With optional parameters +result = obj.method_name("required_value", param2=42, extra_option=True) + +# Error handling +try: + result = obj.method_name("value") +except SpecificException as e: + print(f"Operation failed: {e}") +``` + +**See Also:** +- [Related Method](../path/to/related-method.md) +- [Usage Guide](../path/to/usage-guide.md) +``` + +## Class Documentation Format + +```markdown +### ClassName + +Brief description of the class and its purpose. + +**Initialization:** +```python +obj = ClassName(param1, param2=default_value) +``` + +**Parameters:** +- `param1` (str): Required parameter description +- `param2` (Optional[type], default=default_value): Optional parameter description + +**Attributes:** +- `attribute_name` (type): Description of public attribute +- `another_attr` (type): Description of another attribute + +**Methods:** +- [`method1()`](#method1): Brief description +- [`method2()`](#method2): Brief description + +**Example:** +```python +# Create instance +obj = ClassName("required_param") + +# Use methods +result = obj.method1() +``` +``` + +## Exception Documentation Format + +```markdown +### ExceptionName + +Description of when this exception is raised and what it indicates. 
+ +**Inheritance:** `BaseException` → `CustomException` → `ExceptionName` + +**Attributes:** +- `message` (str): Error message +- `error_code` (Optional[int]): Specific error code if applicable + +**Example:** +```python +try: + # Operation that might raise this exception + result = risky_operation() +except ExceptionName as e: + print(f"Error: {e.message}") + if e.error_code: + print(f"Code: {e.error_code}") +``` +``` + +## Type Hints and Annotations + +Use consistent type hints throughout documentation: + +```python +from typing import Optional, List, Dict, Union, Any + +def example_function( + required_param: str, + optional_param: Optional[int] = None, + list_param: List[str] = None, + dict_param: Dict[str, Any] = None +) -> Union[str, None]: + """ + Example function with proper type hints. + + Args: + required_param: Required string parameter + optional_param: Optional integer parameter + list_param: List of strings + dict_param: Dictionary with string keys + + Returns: + String result or None if operation fails + """ + pass +``` + +## SDK-Specific API Documentation + +### High-Level SDK API Format +Focus on the simplified, Pythonic interface: + +```markdown +#### `repository(name)` + +Get a repository object for the specified repository name. + +**Parameters:** +- `name` (str): Repository name + +**Returns:** +- `Repository`: Repository object for performing operations + +**Example:** +```python +repo = lakefs.repository("my-repo") +``` +``` + +### Generated SDK API Format +Include full OpenAPI-based documentation: + +```markdown +#### `create_repository(repository_creation)` + +Create a new repository using the Generated SDK. + +**Parameters:** +- `repository_creation` (RepositoryCreation): Repository creation object containing: + - `name` (str): Repository name + - `storage_namespace` (str): Storage namespace URI + - `default_branch` (Optional[str]): Default branch name + +**Returns:** +- `Repository`: Created repository object + +**Raises:** +- `ConflictException`: Repository already exists +- `ValidationException`: Invalid parameters + +**Example:** +```python +from lakefs_sdk import RepositoryCreation + +repo_creation = RepositoryCreation( + name="my-repo", + storage_namespace="s3://bucket/path" +) +repo = repositories_api.create_repository(repository_creation=repo_creation) +``` +``` + +## Documentation Standards + +### Consistency Guidelines +1. **Parameter Names**: Use consistent parameter names across similar operations +2. **Return Types**: Always specify return types clearly +3. **Error Handling**: Document all possible exceptions +4. **Examples**: Provide practical, runnable examples +5. **Cross-References**: Link to related documentation + +### Writing Style +- Use active voice +- Be concise but complete +- Explain the "why" not just the "what" +- Include practical use cases +- Use consistent terminology + +### Code Quality +- All examples must be syntactically correct +- Use realistic parameter values +- Include error handling where appropriate +- Show both basic and advanced usage patterns \ No newline at end of file diff --git a/docs/src/integrations/python/_templates/code-example-template.md b/docs/src/integrations/python/_templates/code-example-template.md new file mode 100644 index 00000000000..e202282432b --- /dev/null +++ b/docs/src/integrations/python/_templates/code-example-template.md @@ -0,0 +1,134 @@ +# Code Example Template + +Use this template for consistent code examples throughout the Python documentation. 
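For orientation, here is a minimal sketch of the kind of example these templates are meant to produce, following the High-Level SDK pattern used later on this page. The repository name and object path are hypothetical placeholders, and the snippet assumes a locally configured lakeFS client:

```python
import lakefs

# Hypothetical repository, branch, and object path used purely for illustration
repo = lakefs.repository("example-repo")
branch = repo.branch("main")
obj = branch.object("docs/hello.txt")

# Show the operation: upload content to the object on this branch
result = obj.upload(data="hello from the code example template")
print(result)
```

The sections below define the structure such examples should follow.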
+ +## Basic Code Example Format + +```markdown +### Operation Name + +Brief description of what this operation does and when to use it. + +```python +# Complete, runnable example with clear comments +import lakefs + +# Setup (if needed) +client = lakefs.Client(host="http://localhost:8000", ...) + +# Main operation with descriptive variable names +result = client.operation_name(parameter="value") +print(result) +``` + +**Expected Output:** +``` +Expected output here (when applicable) +``` + +**Key Points:** +- Important concept 1 +- Important concept 2 +- When to use this pattern + +**See Also:** +- [Related Topic](../path/to/related.md) +- [Advanced Usage](../path/to/advanced.md) +``` + +## Code Example Guidelines + +### Code Style +- Use clear, descriptive variable names +- Include necessary imports at the top +- Add comments explaining non-obvious operations +- Show complete, runnable examples when possible +- Use consistent indentation (4 spaces) + +### Content Structure +- Start with a brief description +- Provide the code example +- Show expected output when relevant +- List key points or concepts +- Include cross-references to related topics + +### Error Handling Examples +```python +try: + # Operation that might fail + result = client.risky_operation() + print(f"Success: {result}") +except SpecificException as e: + print(f"Specific error: {e}") + # Handle specific error case +except Exception as e: + print(f"General error: {e}") + # Handle general error case +``` + +### Multi-Step Examples +For complex operations, break into clear steps: + +```python +# Step 1: Setup +client = lakefs.Client(...) + +# Step 2: Prepare data +data = prepare_data() + +# Step 3: Execute operation +result = client.complex_operation(data) + +# Step 4: Process results +processed_result = process_results(result) +``` + +## SDK-Specific Templates + +### High-Level SDK Example +```python +import lakefs + +# Use High-Level SDK patterns +repo = lakefs.repository("my-repo") +branch = repo.branch("main") +obj = branch.object("path/to/file.txt") + +# Show the operation +result = obj.upload(data="content") +``` + +### Generated SDK Example +```python +import lakefs_sdk +from lakefs_sdk.client import LakeFSClient + +# Initialize Generated SDK client +client = LakeFSClient(configuration=lakefs_sdk.Configuration(...)) +api = lakefs_sdk.ObjectsApi(client) + +# Show the operation +result = api.upload_object(...) +``` + +### lakefs-spec Example +```python +from lakefs_spec import LakeFSFileSystem + +# Initialize filesystem +fs = LakeFSFileSystem() + +# Show the operation +fs.write_text("lakefs://repo/branch/file.txt", "content") +``` + +### Boto3 Example +```python +import boto3 + +# Initialize Boto3 client +s3 = boto3.client('s3', endpoint_url='http://localhost:8000', ...) + +# Show the operation +s3.put_object(Bucket='repo', Key='branch/file.txt', Body=b'content') +``` \ No newline at end of file diff --git a/docs/src/integrations/python/_templates/content-categorization.md b/docs/src/integrations/python/_templates/content-categorization.md new file mode 100644 index 00000000000..e8ee6243b64 --- /dev/null +++ b/docs/src/integrations/python/_templates/content-categorization.md @@ -0,0 +1,307 @@ +# Content Categorization and Discoverability + +This document provides a comprehensive categorization system for the Python documentation to improve search and discoverability. 
+ +## Content Organization by Difficulty + +### Beginner Level +**Target Audience:** New to lakeFS, basic Python knowledge +**Learning Path:** Start here → Intermediate → Advanced + +#### Getting Started Content +- [Python SDK Overview](../index.md) - Compare all SDK options +- [Getting Started Guide](../getting-started.md) - Installation and setup +- [High-Level SDK Quickstart](../high-level-sdk/quickstart.md) - Basic operations +- [Repository Management](../high-level-sdk/repositories.md) - Repository basics + +#### SDK-Specific Beginner Content +- [High-Level SDK Overview](../high-level-sdk/index.md) - Architecture and concepts +- [lakefs-spec Overview](../lakefs-spec/index.md) - Filesystem interface +- [lakefs-spec Filesystem API](../lakefs-spec/filesystem-api.md) - Basic file operations +- [lakefs-spec Integrations](../lakefs-spec/integrations.md) - Data science libraries +- [Boto3 Overview](../boto3/index.md) - S3-compatible operations +- [Boto3 Configuration](../boto3/configuration.md) - Setup and authentication + +#### Tutorial Content +- [Tutorial Index](../tutorials/index.md) - Learning resources overview +- [Data Science Tutorial](../tutorials/data-science-workflow.md) - End-to-end workflow + +### Intermediate Level +**Target Audience:** Familiar with lakeFS basics, need advanced features +**Learning Path:** Build on beginner knowledge + +#### Advanced Operations +- [Branches and Commits](../high-level-sdk/branches-and-commits.md) - Version control +- [Objects and I/O](../high-level-sdk/objects-and-io.md) - File operations +- [Transaction Handling](../high-level-sdk/transactions.md) - Atomic operations +- [Data Imports and Exports](../high-level-sdk/imports-and-exports.md) - Bulk operations + +#### SDK Integration +- [Generated SDK Overview](../generated-sdk/index.md) - Direct API access +- [Generated SDK Examples](../generated-sdk/examples.md) - Usage patterns +- [Direct Access from High-Level SDK](../generated-sdk/direct-access.md) - Hybrid usage +- [lakefs-spec Transactions](../lakefs-spec/transactions.md) - Atomic filesystem operations +- [Boto3 S3 Router](../boto3/s3-router.md) - Hybrid workflows + +#### Tutorials and Examples +- [ETL Pipeline Tutorial](../tutorials/etl-pipeline.md) - Data pipeline patterns +- [ML Experiment Tracking](../tutorials/ml-experiment-tracking.md) - Model versioning + +#### Reference Materials +- [API Comparison](../reference/api-comparison.md) - Feature comparison +- [Troubleshooting Guide](../reference/troubleshooting.md) - Common issues +- [Boto3 Troubleshooting](../boto3/troubleshooting.md) - S3 compatibility issues + +### Advanced Level +**Target Audience:** Production deployments, performance optimization +**Learning Path:** Master advanced patterns and optimization + +#### Production and Performance +- [Advanced Features](../high-level-sdk/advanced.md) - Optimization techniques +- [Best Practices](../reference/best-practices.md) - Production deployment +- [Generated SDK API Reference](../generated-sdk/api-reference.md) - Complete API access + +#### Reference and Maintenance +- [Reference Index](../reference/index.md) - All reference materials +- [Changelog](../reference/changelog.md) - Version updates + +## Content Organization by Use Cases + +### Getting Started +**Goal:** Learn lakeFS basics and choose the right SDK + +#### Primary Resources +- [Python SDK Overview](../index.md) - SDK comparison and selection +- [Getting Started Guide](../getting-started.md) - Installation and authentication +- [SDK Decision 
Matrix](../index.md#sdk-selection-decision-matrix) - Choose the right SDK + +#### Next Steps +- [High-Level SDK Quickstart](../high-level-sdk/quickstart.md) - Basic operations +- [Tutorial Collection](../tutorials/index.md) - Real-world examples + +### Data Science Workflows +**Goal:** Use lakeFS for data analysis, ML, and research + +#### Primary Resources +- [lakefs-spec Overview](../lakefs-spec/index.md) - Filesystem interface +- [Data Science Integrations](../lakefs-spec/integrations.md) - pandas, dask, etc. +- [Data Science Tutorial](../tutorials/data-science-workflow.md) - End-to-end workflow + +#### Supporting Resources +- [Filesystem API](../lakefs-spec/filesystem-api.md) - File operations +- [lakefs-spec Transactions](../lakefs-spec/transactions.md) - Atomic operations +- [ML Experiment Tracking](../tutorials/ml-experiment-tracking.md) - Model versioning + +### Data Engineering and ETL +**Goal:** Build production data pipelines + +#### Primary Resources +- [High-Level SDK Overview](../high-level-sdk/index.md) - Comprehensive SDK +- [ETL Pipeline Tutorial](../tutorials/etl-pipeline.md) - Pipeline patterns +- [Transaction Handling](../high-level-sdk/transactions.md) - Atomic operations + +#### Supporting Resources +- [Data Imports and Exports](../high-level-sdk/imports-and-exports.md) - Bulk operations +- [Objects and I/O](../high-level-sdk/objects-and-io.md) - File handling +- [Best Practices](../reference/best-practices.md) - Production guidance + +### S3 Migration +**Goal:** Migrate existing S3 workflows to lakeFS + +#### Primary Resources +- [Boto3 Overview](../boto3/index.md) - S3-compatible interface +- [Boto3 Configuration](../boto3/configuration.md) - Setup guide +- [S3 Operations](../boto3/s3-operations.md) - S3-compatible patterns + +#### Supporting Resources +- [Boto3 S3 Router](../boto3/s3-router.md) - Hybrid workflows +- [Boto3 Troubleshooting](../boto3/troubleshooting.md) - Migration issues +- [API Comparison](../reference/api-comparison.md) - Feature differences + +### Direct API Access +**Goal:** Custom integrations and advanced operations + +#### Primary Resources +- [Generated SDK Overview](../generated-sdk/index.md) - Direct API access +- [Generated SDK API Reference](../generated-sdk/api-reference.md) - Complete API +- [Generated SDK Examples](../generated-sdk/examples.md) - Usage patterns + +#### Supporting Resources +- [Direct Access from High-Level SDK](../generated-sdk/direct-access.md) - Hybrid usage +- [Advanced Features](../high-level-sdk/advanced.md) - Optimization techniques + +### Production Deployment +**Goal:** Deploy lakeFS applications in production + +#### Primary Resources +- [Best Practices](../reference/best-practices.md) - Production guidance +- [Advanced Features](../high-level-sdk/advanced.md) - Performance optimization +- [Troubleshooting Guide](../reference/troubleshooting.md) - Issue resolution + +#### Supporting Resources +- [API Comparison](../reference/api-comparison.md) - Choose the right SDK +- [Reference Index](../reference/index.md) - All reference materials + +## Content Organization by Topics + +### SDK Comparison and Selection +- [Python SDK Overview](../index.md) +- [API Comparison](../reference/api-comparison.md) +- [Getting Started Guide](../getting-started.md) + +### Authentication and Configuration +- [Getting Started Guide](../getting-started.md#authentication-and-configuration) +- [Boto3 Configuration](../boto3/configuration.md) +- [Best Practices](../reference/best-practices.md#security) + +### Repository Management +- 
[Repository Management](../high-level-sdk/repositories.md) +- [High-Level SDK Overview](../high-level-sdk/index.md) +- [Generated SDK API Reference](../generated-sdk/api-reference.md) + +### Version Control Operations +- [Branches and Commits](../high-level-sdk/branches-and-commits.md) +- [Transaction Handling](../high-level-sdk/transactions.md) +- [lakefs-spec Transactions](../lakefs-spec/transactions.md) + +### Object and File Operations +- [Objects and I/O](../high-level-sdk/objects-and-io.md) +- [Filesystem API](../lakefs-spec/filesystem-api.md) +- [S3 Operations](../boto3/s3-operations.md) + +### Data Import and Export +- [Data Imports and Exports](../high-level-sdk/imports-and-exports.md) +- [ETL Pipeline Tutorial](../tutorials/etl-pipeline.md) +- [Best Practices](../reference/best-practices.md#data-operations) + +### Data Science Integration +- [Data Science Integrations](../lakefs-spec/integrations.md) +- [Data Science Tutorial](../tutorials/data-science-workflow.md) +- [ML Experiment Tracking](../tutorials/ml-experiment-tracking.md) + +### Performance and Optimization +- [Advanced Features](../high-level-sdk/advanced.md) +- [Best Practices](../reference/best-practices.md#performance) +- [API Comparison](../reference/api-comparison.md#performance-characteristics) + +### Troubleshooting and Support +- [Troubleshooting Guide](../reference/troubleshooting.md) +- [Boto3 Troubleshooting](../boto3/troubleshooting.md) +- [Reference Index](../reference/index.md) + +## Content Organization by Audience + +### Data Scientists and Analysts +**Primary Interests:** Data analysis, ML workflows, Jupyter notebooks + +#### Recommended Learning Path +1. [Python SDK Overview](../index.md) - Understand options +2. [lakefs-spec Overview](../lakefs-spec/index.md) - Filesystem interface +3. [Data Science Integrations](../lakefs-spec/integrations.md) - pandas, dask +4. [Data Science Tutorial](../tutorials/data-science-workflow.md) - End-to-end example +5. [ML Experiment Tracking](../tutorials/ml-experiment-tracking.md) - Model versioning + +#### Key Resources +- [Filesystem API](../lakefs-spec/filesystem-api.md) - File operations +- [lakefs-spec Transactions](../lakefs-spec/transactions.md) - Atomic operations +- [High-Level SDK](../high-level-sdk/index.md) - Alternative for advanced features + +### Data Engineers +**Primary Interests:** ETL pipelines, production systems, data operations + +#### Recommended Learning Path +1. [Python SDK Overview](../index.md) - Compare options +2. [High-Level SDK Overview](../high-level-sdk/index.md) - Comprehensive SDK +3. [High-Level SDK Quickstart](../high-level-sdk/quickstart.md) - Basic operations +4. [ETL Pipeline Tutorial](../tutorials/etl-pipeline.md) - Pipeline patterns +5. [Best Practices](../reference/best-practices.md) - Production deployment + +#### Key Resources +- [Transaction Handling](../high-level-sdk/transactions.md) - Atomic operations +- [Data Imports and Exports](../high-level-sdk/imports-and-exports.md) - Bulk operations +- [Advanced Features](../high-level-sdk/advanced.md) - Performance optimization +- [Troubleshooting Guide](../reference/troubleshooting.md) - Issue resolution + +### Python Developers +**Primary Interests:** API integration, custom applications, SDK usage + +#### Recommended Learning Path +1. [Getting Started Guide](../getting-started.md) - Setup and authentication +2. [High-Level SDK Quickstart](../high-level-sdk/quickstart.md) - Basic operations +3. [Generated SDK Overview](../generated-sdk/index.md) - Direct API access +4. 
[API Comparison](../reference/api-comparison.md) - Feature comparison +5. [Advanced Features](../high-level-sdk/advanced.md) - Optimization + +#### Key Resources +- [Generated SDK Examples](../generated-sdk/examples.md) - Usage patterns +- [Direct Access from High-Level SDK](../generated-sdk/direct-access.md) - Hybrid usage +- [Generated SDK API Reference](../generated-sdk/api-reference.md) - Complete API + +### System Administrators and DevOps +**Primary Interests:** Deployment, configuration, troubleshooting + +#### Recommended Learning Path +1. [Getting Started Guide](../getting-started.md) - Installation and setup +2. [Best Practices](../reference/best-practices.md) - Production deployment +3. [Troubleshooting Guide](../reference/troubleshooting.md) - Issue resolution +4. [API Comparison](../reference/api-comparison.md) - Choose the right SDK + +#### Key Resources +- [Reference Index](../reference/index.md) - All reference materials +- [Changelog](../reference/changelog.md) - Version updates +- [Advanced Features](../high-level-sdk/advanced.md) - Performance tuning + +### Legacy S3 Users +**Primary Interests:** S3 migration, compatibility, minimal code changes + +#### Recommended Learning Path +1. [Boto3 Overview](../boto3/index.md) - S3-compatible interface +2. [Boto3 Configuration](../boto3/configuration.md) - Setup guide +3. [S3 Operations](../boto3/s3-operations.md) - Migration patterns +4. [Boto3 S3 Router](../boto3/s3-router.md) - Hybrid workflows +5. [Boto3 Troubleshooting](../boto3/troubleshooting.md) - Common issues + +#### Key Resources +- [API Comparison](../reference/api-comparison.md) - Feature differences +- [High-Level SDK](../high-level-sdk/index.md) - Advanced features alternative + +## Search and Discoverability Features + +### Metadata Tags for Search +Each documentation page includes structured metadata: + +```yaml +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "beginner|intermediate|advanced" +use_cases: ["data-science", "etl", "s3-migration", "direct-api", etc.] +topics: ["authentication", "repositories", "objects", "transactions", etc.] +audience: ["data-scientists", "data-engineers", "developers", etc.] +``` + +### Content Filtering +Content can be filtered by: +- **SDK Type**: Show only High-Level SDK, Generated SDK, lakefs-spec, or Boto3 content +- **Difficulty Level**: Filter by beginner, intermediate, or advanced content +- **Use Case**: Filter by specific use cases like data science, ETL, migration +- **Audience**: Filter by target audience like data scientists, engineers, developers + +### Cross-Reference System +Comprehensive "See Also" sections provide: +- **Related Topics**: Links to related concepts and operations +- **Learning Path**: Suggested next steps and prerequisites +- **Alternative Approaches**: Different ways to accomplish the same task +- **External Resources**: Links to official documentation and resources + +### Topic-Based Navigation +Content is organized by topics: +- **Getting Started**: Installation, setup, authentication +- **Core Operations**: Repositories, branches, objects, transactions +- **Integration**: Data science libraries, S3 compatibility, API access +- **Advanced Topics**: Performance, production deployment, troubleshooting + +This categorization system enables users to: +1. **Find relevant content quickly** based on their role and use case +2. **Follow structured learning paths** from beginner to advanced +3. **Discover related content** through comprehensive cross-references +4. 
**Filter content** by difficulty, SDK type, and use case +5. **Navigate efficiently** through topic-based organization \ No newline at end of file diff --git a/docs/src/integrations/python/_templates/cross-reference-conventions.md b/docs/src/integrations/python/_templates/cross-reference-conventions.md new file mode 100644 index 00000000000..2467c4141fd --- /dev/null +++ b/docs/src/integrations/python/_templates/cross-reference-conventions.md @@ -0,0 +1,260 @@ +# Cross-Reference Conventions + +This document defines the conventions for linking and cross-referencing content within the Python documentation. + +## Link Types and Conventions + +### Internal Links + +#### Relative Path Links +Use relative paths for all internal links: + +```markdown + +[Getting Started](../getting-started.md) +[High-Level SDK](../high-level-sdk/index.md) +[API Reference](../reference/api-comparison.md) + + +[Getting Started](/docs/src/integrations/python/getting-started.md) +``` + +#### Section Links +Link to specific sections within pages: + +```markdown +[Repository Operations](repositories.md#creating-repositories) +[Error Handling](../reference/troubleshooting.md#common-errors) +``` + +#### Cross-SDK References +When referencing other SDK options: + +```markdown + +For direct API access, see the [Generated SDK](../generated-sdk/index.md). + + +For transaction support, consider the [High-Level SDK](../high-level-sdk/transactions.md). +``` + +### External Links + +#### Official Documentation +```markdown +[High-Level SDK Documentation](https://pydocs-lakefs.lakefs.io){:target="_blank"} +[Generated SDK Documentation](https://pydocs-sdk.lakefs.io){:target="_blank"} +[lakefs-spec Documentation](https://lakefs-spec.org/){:target="_blank"} +``` + +#### Third-Party Resources +```markdown +[pandas Documentation](https://pandas.pydata.org/docs/){:target="_blank"} +[Boto3 Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html){:target="_blank"} +``` + +## "See Also" Sections + +### Standard Format +Include "See Also" sections at the end of major topics: + +```markdown +## See Also + +- [Related Topic 1](../path/to/topic1.md) - Brief description +- [Related Topic 2](../path/to/topic2.md) - Brief description +- [External Resource](https://example.com){:target="_blank"} - Brief description +``` + +### Context-Specific Examples + +#### From Code Examples +```markdown +**See Also:** +- [Advanced Usage](../advanced/patterns.md) - More complex examples +- [Error Handling](../reference/troubleshooting.md) - Common issues and solutions +- [Best Practices](../reference/best-practices.md) - Production recommendations +``` + +#### From API Documentation +```markdown +**See Also:** +- [Related Methods](#related-method) - Similar functionality +- [Usage Examples](../examples/usage.md) - Practical implementations +- [Generated SDK Equivalent](../generated-sdk/api-reference.md#equivalent-method) - Direct API access +``` + +## Navigation Patterns + +### Breadcrumb References +Include contextual navigation hints: + +```markdown + +← Back to [High-Level SDK Overview](index.md) + + +**Next:** [Branches and Commits](branches-and-commits.md) → +``` + +### Progressive Learning Paths +Guide users through logical learning sequences: + +```markdown +## Learning Path + +1. **Start Here:** [Getting Started](../getting-started.md) +2. **Basic Operations:** [Repository Management](repositories.md) +3. **Advanced Features:** [Transactions](transactions.md) +4. 
**Best Practices:** [Production Guide](../reference/best-practices.md) +``` + +## SDK Comparison References + +### Feature Comparison Links +When discussing features, link to comparisons: + +```markdown +The High-Level SDK provides simplified transaction support. For a complete comparison of transaction features across all SDKs, see the [API Comparison](../reference/api-comparison.md#transactions). +``` + +### Alternative Approaches +Show users alternative ways to accomplish tasks: + +```markdown +### Alternative Approaches + +- **High-Level SDK:** [Simple upload method](../high-level-sdk/objects-and-io.md#simple-upload) +- **Generated SDK:** [Direct API upload](../generated-sdk/examples.md#upload-operations) +- **lakefs-spec:** [Filesystem write](../lakefs-spec/filesystem-api.md#writing-files) +- **Boto3:** [S3-compatible upload](../boto3/s3-operations.md#upload-objects) +``` + +## Contextual Cross-References + +### In Code Examples +Reference related concepts within code examples: + +```python +# Create a repository (see Repository Management guide) +repo = lakefs.repository("my-repo").create( + storage_namespace="s3://bucket/path" +) + +# Create a branch (see Branches and Commits guide) +branch = repo.branch("feature").create(source_reference="main") + +# Use transactions for atomic operations (see Transactions guide) +with branch.transact(commit_message="Atomic update") as tx: + # Operations here... + pass +``` + +### In Error Messages +Link to troubleshooting information: + +```markdown +If you encounter authentication errors, see the [Authentication Troubleshooting](../reference/troubleshooting.md#authentication-issues) section. +``` + +## Topic Clustering + +### Related Content Groups +Group related topics together: + +```markdown +## Repository Operations +- [Creating Repositories](repositories.md#creating-repositories) +- [Listing Repositories](repositories.md#listing-repositories) +- [Repository Configuration](repositories.md#configuration) + +## Branch Operations +- [Creating Branches](branches-and-commits.md#creating-branches) +- [Merging Branches](branches-and-commits.md#merging-branches) +- [Branch Protection](branches-and-commits.md#branch-protection) +``` + +### Workflow-Based References +Link content based on common workflows: + +```markdown +## Data Science Workflow +1. [Setup Environment](../getting-started.md#installation) +2. [Load Data](../lakefs-spec/integrations.md#pandas-integration) +3. [Process Data](../tutorials/data-science-workflow.md#data-processing) +4. [Version Results](../high-level-sdk/transactions.md#data-versioning) +``` + +## Link Maintenance + +### Link Validation +- Use automated tools to check for broken links +- Regularly review and update external links +- Test internal links after restructuring content + +### Consistency Checks +- Ensure consistent link text for the same destinations +- Use the same relative paths throughout +- Maintain consistent "See Also" formatting + +### Update Procedures +When moving or renaming files: +1. Update all internal references +2. Add redirects if necessary +3. Update navigation menus +4. 
Test all affected links + +## Best Practices + +### Link Text Guidelines +```markdown + +[Learn about repository management](repositories.md) +[See the complete API reference](../reference/api-comparison.md) + + +[Click here](repositories.md) +[Read more](../reference/api-comparison.md) +``` + +### Context-Aware Linking +```markdown + +For production deployments, review the [security best practices](../reference/best-practices.md#security) before configuring authentication. + + +See [best practices](../reference/best-practices.md). +``` + +### Balanced Cross-Referencing +- Don't over-link common terms +- Focus on genuinely helpful references +- Avoid circular references +- Prioritize the most relevant links + +## Templates for Common Patterns + +### Tutorial Cross-References +```markdown +## Prerequisites +- Complete the [Getting Started guide](../getting-started.md) +- Understand [basic repository operations](../high-level-sdk/repositories.md) + +## Next Steps +- Try the [ETL Pipeline tutorial](etl-pipeline.md) +- Learn about [production deployment](../reference/best-practices.md#deployment) +``` + +### API Method Cross-References +```markdown +**Related Methods:** +- [`create_branch()`](branches-and-commits.md#create-branch) - Create new branches +- [`merge_into()`](branches-and-commits.md#merge-into) - Merge branches +- [`diff()`](branches-and-commits.md#diff) - Compare branches + +**See Also:** +- [Branch Management Tutorial](../tutorials/branch-management.md) +- [Version Control Best Practices](../reference/best-practices.md#version-control) +``` + +This cross-reference system ensures users can easily navigate between related concepts and find the information they need efficiently. \ No newline at end of file diff --git a/docs/src/integrations/python/_templates/documentation-standards.md b/docs/src/integrations/python/_templates/documentation-standards.md new file mode 100644 index 00000000000..ccfd2942de7 --- /dev/null +++ b/docs/src/integrations/python/_templates/documentation-standards.md @@ -0,0 +1,254 @@ +# Documentation Standards + +This document defines the standards and conventions for Python documentation in the lakeFS project. 
+ +## Content Organization + +### File Structure +``` +docs/src/integrations/python/ +├── index.md # Main overview page +├── getting-started.md # Installation and setup +├── high-level-sdk/ # High-Level SDK documentation +├── generated-sdk/ # Generated SDK documentation +├── lakefs-spec/ # lakefs-spec documentation +├── boto3/ # Boto3 integration documentation +├── tutorials/ # Real-world examples +├── reference/ # Reference materials +└── _templates/ # Documentation templates +``` + +### Page Naming Conventions +- Use lowercase with hyphens: `getting-started.md` +- Be descriptive: `branches-and-commits.md` not `branches.md` +- Group related content in subdirectories +- Use `index.md` for section overview pages + +## Markdown Standards + +### Front Matter +Every page must include front matter: + +```yaml +--- +title: Page Title +description: Brief description for SEO and navigation (max 160 characters) +--- +``` + +Optional front matter fields: +```yaml +--- +title: Page Title +description: Brief description +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "beginner|intermediate|advanced" +last_updated: "2024-01-15" +--- +``` + +### Heading Structure +- Use only one H1 (`#`) per page (the title) +- Use H2 (`##`) for major sections +- Use H3 (`###`) for subsections +- Use H4 (`####`) for API methods and detailed items +- Don't skip heading levels + +### Code Blocks + +#### Python Code +Always specify the language for syntax highlighting: + +```python +import lakefs + +# Example code with comments +repo = lakefs.repository("my-repo") +``` + +#### Shell Commands +```bash +pip install lakefs +``` + +#### Configuration Files +```yaml +# YAML configuration +key: value +``` + +#### Output Examples +``` +Expected output here +``` + +### Links and Cross-References + +#### Internal Links +- Use relative paths: `[link text](../path/to/page.md)` +- Link to specific sections: `[section](page.md#section-heading)` +- Use descriptive link text, not "click here" + +#### External Links +- Open in new tab for external sites: `[text](https://example.com){:target="_blank"}` +- Include protocol: `https://` not `example.com` + +### Lists and Formatting + +#### Unordered Lists +- Use `-` for bullet points +- Be consistent with indentation (2 spaces) +- Use parallel structure in list items + +#### Ordered Lists +1. Use numbers for sequential steps +2. Start each item with a capital letter +3. 
End with periods if items are complete sentences + +#### Emphasis +- Use **bold** for UI elements and important terms +- Use *italics* for emphasis and variable names +- Use `code` for inline code, filenames, and commands + +## Code Standards + +### Code Quality +- All code examples must be syntactically correct +- Test code examples before publishing +- Use realistic, meaningful variable names +- Include necessary imports +- Show complete, runnable examples when possible + +### Code Style +- Follow PEP 8 Python style guidelines +- Use 4 spaces for indentation +- Keep lines under 88 characters when possible +- Use descriptive variable names + +### Error Handling +Include error handling in examples: + +```python +try: + result = operation() + print(f"Success: {result}") +except SpecificException as e: + print(f"Specific error: {e}") +except Exception as e: + print(f"General error: {e}") +``` + +### Comments +- Explain non-obvious operations +- Don't comment obvious code +- Use comments to explain the "why" not the "what" + +## Content Guidelines + +### Writing Style +- Use active voice: "Create a repository" not "A repository is created" +- Use second person: "you" not "we" or "one" +- Be concise but complete +- Use present tense for instructions +- Use parallel structure in lists and headings + +### Technical Accuracy +- Verify all technical information +- Keep content up-to-date with latest SDK versions +- Test all code examples +- Review for accuracy before publishing + +### Accessibility +- Use descriptive alt text for images +- Ensure good color contrast +- Use semantic HTML elements +- Provide text alternatives for visual content + +## SDK-Specific Standards + +### High-Level SDK +- Focus on the simplified, Pythonic interface +- Show transaction patterns where applicable +- Emphasize ease of use and best practices +- Include streaming I/O examples + +### Generated SDK +- Show direct API access patterns +- Include complete parameter documentation +- Demonstrate error handling +- Show how to access from High-Level SDK + +### lakefs-spec +- Emphasize filesystem semantics +- Show data science library integrations +- Include transaction examples +- Demonstrate fsspec compatibility + +### Boto3 +- Show S3-compatible operations +- Include migration examples from pure S3 +- Demonstrate configuration options +- Show hybrid S3/lakeFS patterns + +## Templates and Consistency + +### Use Provided Templates +- [Code Example Template](_templates/code-example-template.md) +- [API Reference Template](_templates/api-reference-template.md) +- [Tutorial Template](_templates/tutorial-template.md) + +### Consistent Patterns +- Use the same parameter names across similar operations +- Follow the same structure for similar content types +- Use consistent terminology throughout +- Maintain the same level of detail across sections + +## Review Process + +### Content Review Checklist +- [ ] Technical accuracy verified +- [ ] Code examples tested +- [ ] Links work correctly +- [ ] Spelling and grammar checked +- [ ] Follows style guidelines +- [ ] Includes proper cross-references +- [ ] Accessible to target audience + +### Code Review Checklist +- [ ] Syntactically correct +- [ ] Follows Python best practices +- [ ] Includes proper error handling +- [ ] Uses realistic examples +- [ ] Properly commented +- [ ] Tested and working + +## Maintenance + +### Regular Updates +- Review content quarterly for accuracy +- Update code examples with new SDK versions +- Fix broken links +- Update screenshots and diagrams +- 
Refresh external references + +### Version Management +- Tag documentation versions with SDK releases +- Maintain compatibility matrices +- Document breaking changes +- Provide migration guides + +## Tools and Automation + +### Recommended Tools +- **Linting**: Use markdownlint for consistency +- **Link Checking**: Automated link validation +- **Code Testing**: Automated testing of code examples +- **Spell Check**: Automated spell checking + +### Automation +- Set up CI/CD for documentation testing +- Automate link checking +- Test code examples in CI +- Generate API documentation from code + +This standards document ensures consistency, quality, and maintainability across all Python documentation in the lakeFS project. \ No newline at end of file diff --git a/docs/src/integrations/python/_templates/tutorial-template.md b/docs/src/integrations/python/_templates/tutorial-template.md new file mode 100644 index 00000000000..35231105341 --- /dev/null +++ b/docs/src/integrations/python/_templates/tutorial-template.md @@ -0,0 +1,226 @@ +# Tutorial Template + +Use this template for creating comprehensive tutorials in the Python documentation. + +## Tutorial Structure + +```markdown +--- +title: Tutorial Title +description: Brief description of what the tutorial covers +--- + +# Tutorial Title + +Brief introduction explaining what readers will learn and build in this tutorial. + +## What You'll Learn + +- Key concept 1 +- Key concept 2 +- Key concept 3 +- Practical skill or outcome + +## Prerequisites + +### Required Knowledge +- Python programming basics +- Familiarity with [specific concepts] +- Understanding of [domain knowledge] + +### Required Setup +- Python 3.8+ installed +- lakeFS server running (see [setup guide](../getting-started.md)) +- Required Python packages: + ```bash + pip install lakefs pandas numpy + ``` + +### Sample Data +Download or create the sample data used in this tutorial: +```python +# Code to generate or download sample data +``` + +## Overview + +High-level overview of what we'll build, including: +- Architecture diagram (if applicable) +- Data flow description +- Key components + +## Step-by-Step Implementation + +### Step 1: Setup and Configuration + +Description of the first step. + +```python +# Complete code for step 1 +import lakefs + +# Setup code with explanations +client = lakefs.Client(...) +``` + +**What's happening:** +- Explanation of the code +- Why this step is necessary +- Key concepts introduced + +### Step 2: [Next Major Step] + +Description of the second step. 
+ +```python +# Complete code for step 2 +# Build on previous steps +``` + +**Key Points:** +- Important concept from this step +- How it relates to previous steps +- Common pitfalls to avoid + +### Step 3: [Continue with remaining steps] + +[Continue pattern for all major steps] + +## Complete Code + +Provide the complete, working code for the entire tutorial: + +```python +#!/usr/bin/env python3 +""" +Complete tutorial implementation +""" + +import lakefs +import pandas as pd +# Other imports + +def main(): + """Main tutorial function""" + # Complete implementation + pass + +if __name__ == "__main__": + main() +``` + +## Testing Your Implementation + +How to verify the tutorial works correctly: + +```python +# Test code or verification steps +def test_implementation(): + # Verification logic + pass +``` + +## Troubleshooting + +### Common Issues + +#### Issue 1: [Common Problem] +**Symptom:** Description of what users might see +**Cause:** Why this happens +**Solution:** How to fix it + +```python +# Code example showing the fix +``` + +#### Issue 2: [Another Common Problem] +[Same format as above] + +## Next Steps + +What readers can do after completing this tutorial: + +- **Extend the Example:** Suggestions for modifications +- **Related Tutorials:** Links to follow-up tutorials +- **Advanced Topics:** Links to more advanced documentation +- **Production Considerations:** What to think about for real-world use + +## Additional Resources + +- [Related Documentation](../path/to/docs.md) +- [API Reference](../reference/api.md) +- [Best Practices](../reference/best-practices.md) +- External resources (GitHub repos, blog posts, etc.) +``` + +## Tutorial Guidelines + +### Content Structure +1. **Clear Learning Objectives**: State what readers will accomplish +2. **Prerequisite Check**: Ensure readers have necessary background +3. **Progressive Complexity**: Start simple, build complexity gradually +4. **Complete Examples**: Every code snippet should be runnable +5. **Explanation**: Don't just show code, explain the reasoning +6. **Testing**: Include ways to verify the implementation works +7. **Troubleshooting**: Address common issues proactively + +### Code Quality Standards +- All code must be tested and working +- Use realistic, meaningful examples +- Include proper error handling +- Follow Python best practices +- Use consistent variable naming +- Add comments for complex logic + +### Writing Style +- Use second person ("you will", "your code") +- Be encouraging and supportive +- Explain concepts before showing code +- Use active voice +- Keep paragraphs concise +- Use bullet points for lists + +### Visual Elements +- Include diagrams for complex concepts +- Use code blocks with syntax highlighting +- Add screenshots for UI elements (if applicable) +- Use callout boxes for important notes + +## Tutorial Types + +### Beginner Tutorials +- Focus on fundamental concepts +- Provide extensive explanation +- Include more basic examples +- Cover common pitfalls +- Link to prerequisite learning + +### Intermediate Tutorials +- Assume basic knowledge +- Focus on practical applications +- Show real-world scenarios +- Include performance considerations +- Demonstrate best practices + +### Advanced Tutorials +- Assume strong foundation +- Cover complex use cases +- Show optimization techniques +- Include architectural decisions +- Discuss trade-offs and alternatives + +## Example Tutorial Outline + +### Data Science Workflow Tutorial +1. **Introduction** - What we're building +2. 
**Setup** - Environment and data preparation +3. **Data Ingestion** - Loading data into lakeFS +4. **Exploration** - Using pandas with lakeFS +5. **Processing** - Data transformation pipeline +6. **Versioning** - Creating branches for experiments +7. **Analysis** - Statistical analysis and visualization +8. **Results** - Saving and sharing results +9. **Collaboration** - Working with team members +10. **Production** - Deploying the workflow + +Each section follows the step-by-step format with complete code examples. \ No newline at end of file diff --git a/docs/src/integrations/python/boto3/configuration.md b/docs/src/integrations/python/boto3/configuration.md new file mode 100644 index 00000000000..0b2e3889a8f --- /dev/null +++ b/docs/src/integrations/python/boto3/configuration.md @@ -0,0 +1,497 @@ +--- +title: Boto3 Configuration +description: Comprehensive setup and configuration guide for using Boto3 with lakeFS +sdk_types: ["boto3"] +difficulty: "beginner" +use_cases: ["s3-migration", "legacy-integration"] +--- + +# Boto3 Configuration + +This guide covers comprehensive setup and configuration options for using Boto3 with lakeFS, including SSL settings, proxy configuration, and checksum handling for newer Boto3 versions. + +## Prerequisites + +Before configuring Boto3 with lakeFS, ensure you have: + +- **lakeFS Server** - Running lakeFS instance (local or remote) +- **Access Credentials** - lakeFS access key ID and secret access key +- **Boto3 Installed** - `pip install boto3` +- **Network Access** - Connectivity to lakeFS endpoint + +## Basic Configuration + +### Minimal Setup + +The simplest way to configure Boto3 with lakeFS: + +```python +import boto3 + +# Basic lakeFS configuration +s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', # lakeFS endpoint + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' +) + +# Test the connection +try: + response = s3_client.list_buckets() + print(f"Connected! Found {len(response['Buckets'])} repositories") +except Exception as e: + print(f"Connection failed: {e}") +``` + +**Expected Output:** +``` +Connected! 
Found 3 repositories +``` + +### Environment-Based Configuration + +For production environments, use environment variables: + +```python +import boto3 +import os + +# Set environment variables first +os.environ['LAKEFS_ENDPOINT'] = 'https://lakefs.example.com' +os.environ['LAKEFS_ACCESS_KEY_ID'] = 'AKIAIOSFODNN7EXAMPLE' +os.environ['LAKEFS_SECRET_ACCESS_KEY'] = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + +# Production configuration with environment variables +s3_client = boto3.client('s3', + endpoint_url=os.getenv('LAKEFS_ENDPOINT'), + aws_access_key_id=os.getenv('LAKEFS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('LAKEFS_SECRET_ACCESS_KEY') +) +``` + +### Configuration Validation + +Always validate your configuration: + +```python +def validate_lakefs_config(s3_client): + """Validate lakeFS Boto3 configuration""" + try: + # Test basic connectivity + response = s3_client.list_buckets() + print("✓ Connection successful") + + # Test repository access + repos = response.get('Buckets', []) + print(f"✓ Found {len(repos)} repositories") + + if repos: + # Test object operations on first repository + test_repo = repos[0]['Name'] + try: + s3_client.list_objects_v2(Bucket=test_repo, MaxKeys=1) + print(f"✓ Repository access confirmed: {test_repo}") + except Exception as e: + print(f"⚠ Repository access limited: {e}") + + return True + + except Exception as e: + print(f"✗ Configuration invalid: {e}") + return False + +# Usage +if validate_lakefs_config(s3_client): + print("Configuration is ready for use!") +``` + +## Advanced Configuration Options + +### SSL and Security Settings +```python +import boto3 + +# HTTPS configuration +s3_client = boto3.client('s3', + endpoint_url='https://lakefs.example.com', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + verify=True, # Verify SSL certificates + use_ssl=True +) + +# Custom CA certificate +s3_client = boto3.client('s3', + endpoint_url='https://lakefs.example.com', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + verify='/path/to/ca-bundle.pem' +) + +# Disable SSL verification (development only) +s3_client = boto3.client('s3', + endpoint_url='https://lakefs.example.com', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + verify=False +) +``` + +### Checksum Configuration + +For newer versions of Boto3, configure checksum settings to avoid issues: + +```python +import boto3 +from botocore.config import Config + +# Configure checksum settings for newer Boto3 versions +config = Config( + request_checksum_calculation='when_required', + response_checksum_validation='when_required' +) + +s3_client = boto3.client('s3', + endpoint_url='https://lakefs.example.com', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + config=config +) +``` + +### Proxy Configuration +```python +import boto3 +from botocore.config import Config + +# Configure proxy settings +config = Config( + proxies={ + 'http': 'http://proxy.example.com:8080', + 'https': 'https://proxy.example.com:8080' + } +) + +s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + config=config +) +``` + +## Connection and Performance Settings + +### Timeout Configuration +```python +import boto3 +from botocore.config import 
Config + +# Configure timeouts +config = Config( + connect_timeout=10, # Connection timeout in seconds + read_timeout=30, # Read timeout in seconds + retries={ + 'max_attempts': 3, + 'mode': 'adaptive' + } +) + +s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + config=config +) +``` + +### Connection Pooling +```python +import boto3 +from botocore.config import Config + +# Configure connection pooling +config = Config( + max_pool_connections=50, # Maximum connections in pool + retries={ + 'max_attempts': 3, + 'mode': 'adaptive' + } +) + +s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + config=config +) +``` + +## Configuration Patterns + +### Environment-Based Configuration +```python +import boto3 +import os +from botocore.config import Config + +def create_lakefs_client(): + """Create lakeFS S3 client with environment-based configuration""" + + # Required environment variables + endpoint_url = os.getenv('LAKEFS_ENDPOINT') + access_key = os.getenv('LAKEFS_ACCESS_KEY_ID') + secret_key = os.getenv('LAKEFS_SECRET_ACCESS_KEY') + + if not all([endpoint_url, access_key, secret_key]): + raise ValueError("Missing required lakeFS environment variables") + + # Optional configuration + config_params = {} + + # SSL configuration + if os.getenv('LAKEFS_VERIFY_SSL', 'true').lower() == 'false': + config_params['verify'] = False + + ca_cert_path = os.getenv('LAKEFS_CA_CERT_PATH') + if ca_cert_path: + config_params['verify'] = ca_cert_path + + # Proxy configuration + http_proxy = os.getenv('HTTP_PROXY') + https_proxy = os.getenv('HTTPS_PROXY') + if http_proxy or https_proxy: + proxies = {} + if http_proxy: + proxies['http'] = http_proxy + if https_proxy: + proxies['https'] = https_proxy + + config_params['config'] = Config(proxies=proxies) + + # Create client + return boto3.client('s3', + endpoint_url=endpoint_url, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + **config_params + ) + +# Usage +s3_client = create_lakefs_client() +``` + +### Configuration Class +```python +import boto3 +from botocore.config import Config +from dataclasses import dataclass +from typing import Optional, Dict + +@dataclass +class LakeFSConfig: + """Configuration class for lakeFS Boto3 client""" + endpoint_url: str + access_key_id: str + secret_access_key: str + verify_ssl: bool = True + ca_cert_path: Optional[str] = None + proxies: Optional[Dict[str, str]] = None + connect_timeout: int = 10 + read_timeout: int = 30 + max_retries: int = 3 + max_pool_connections: int = 50 + + def create_client(self): + """Create Boto3 S3 client with this configuration""" + + # Build config + config_params = { + 'connect_timeout': self.connect_timeout, + 'read_timeout': self.read_timeout, + 'max_pool_connections': self.max_pool_connections, + 'retries': { + 'max_attempts': self.max_retries, + 'mode': 'adaptive' + } + } + + if self.proxies: + config_params['proxies'] = self.proxies + + # Handle SSL verification + verify = self.verify_ssl + if self.ca_cert_path: + verify = self.ca_cert_path + + return boto3.client('s3', + endpoint_url=self.endpoint_url, + aws_access_key_id=self.access_key_id, + aws_secret_access_key=self.secret_access_key, + verify=verify, + config=Config(**config_params) + ) + +# Usage +config = LakeFSConfig( + 
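    # The values below are illustrative; create_client() maps them onto a
    # botocore Config (timeouts, retries, pool size) plus the SSL 'verify' flag.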
    endpoint_url='http://localhost:8000',
    access_key_id='AKIAIOSFODNN7EXAMPLE',
    secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY',
    verify_ssl=True,
    max_retries=5
)

s3_client = config.create_client()
```

## Session and Credential Management

### Using Boto3 Sessions
```python
import boto3

# Create session with lakeFS credentials
session = boto3.Session(
    aws_access_key_id='AKIAIOSFODNN7EXAMPLE',
    aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'
)

# Create S3 client from session
s3_client = session.client('s3',
    endpoint_url='http://localhost:8000'
)

# Create S3 resource from session
s3_resource = session.resource('s3',
    endpoint_url='http://localhost:8000'
)
```

### Credential Providers

Boto3's standard credential resolution chain (environment variables, shared credentials file, named profiles) also works with lakeFS keys and keeps secrets out of application code:

```python
import boto3

# Assumes a [lakefs] profile in ~/.aws/credentials that contains the
# lakeFS access key ID and secret access key
session = boto3.Session(profile_name='lakefs')

# Create client with credentials resolved from the profile
s3_client = session.client('s3',
    endpoint_url='http://localhost:8000'
)
```

## Testing Configuration

### Configuration Validation
```python
def validate_lakefs_connection(s3_client):
    """Validate lakeFS connection and configuration"""
    try:
        # Test basic connectivity
        response = s3_client.list_buckets()
        print("✓ Connection successful")
        print(f"✓ Found {len(response.get('Buckets', []))} repositories")
        return True

    except Exception as e:
        print(f"✗ Connection failed: {e}")
        return False

def test_lakefs_operations(s3_client, test_repo='test-repo'):
    """Test basic lakeFS operations"""
    try:
        # Test object operations
        test_key = 'main/test-file.txt'
        test_content = b'Test content'

        # Upload test object
        s3_client.put_object(
            Bucket=test_repo,
            Key=test_key,
            Body=test_content
        )
        print("✓ Upload successful")

        # Download test object
        response = s3_client.get_object(Bucket=test_repo, Key=test_key)
        downloaded_content = response['Body'].read()

        if downloaded_content == test_content:
            print("✓ Download successful")
        else:
            print("✗ Download content mismatch")

        # Clean up
        s3_client.delete_object(Bucket=test_repo, Key=test_key)
        print("✓ Cleanup successful")

        return True

    except Exception as e:
        print(f"✗ Operation test failed: {e}")
        return False

# Usage
s3_client = create_lakefs_client()
if validate_lakefs_connection(s3_client):
    test_lakefs_operations(s3_client)
```

## Common Configuration Issues

### Troubleshooting SSL Issues
```python
import boto3
import ssl
import urllib3

# Disable SSL warnings for development
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# For SSL certificate issues
try:
    s3_client = boto3.client('s3',
        endpoint_url='https://lakefs.example.com',
        aws_access_key_id='AKIAIOSFODNN7EXAMPLE',
        aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY',
        verify=True
    )
    s3_client.list_buckets()
except ssl.SSLError as e:
    print(f"SSL Error: {e}")
    print("Consider using verify=False for development or providing a CA certificate")
```

### Handling Checksum Errors
```python
import boto3
from botocore.config import Config

# Configuration to handle checksum issues with newer Boto3
def create_compatible_client():
    """Create client compatible with newer Boto3 versions"""
    config = Config(
        # Only calculate/validate checksums when the operation requires them
        request_checksum_calculation='when_required',
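        # Both options were introduced with the Boto3/botocore 1.36 default-checksum
        # change; 'when_required' skips the newer CRC checksums that the lakeFS S3
        # gateway may not accept.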
response_checksum_validation='when_required', + # Alternative: disable checksums entirely + # disable_request_compression=True + ) + + return boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + config=config + ) +``` + +## Next Steps + +- Learn about [S3 operations](s3-operations.md) with lakeFS +- Explore [S3 Router](s3-router.md) for hybrid workflows +- Review [troubleshooting guide](../reference/troubleshooting.md) for common issues \ No newline at end of file diff --git a/docs/src/integrations/python/boto3/index.md b/docs/src/integrations/python/boto3/index.md new file mode 100644 index 00000000000..099b1ca791c --- /dev/null +++ b/docs/src/integrations/python/boto3/index.md @@ -0,0 +1,154 @@ +--- +title: Boto3 Integration +description: Comprehensive guide for using Boto3 with lakeFS for S3-compatible operations +sdk_types: ["boto3"] +difficulty: "beginner" +use_cases: ["s3-migration", "legacy-integration", "hybrid-workflows"] +--- + +# Boto3 Integration + +Boto3 provides S3-compatible access to lakeFS, enabling seamless migration of existing S3-based applications with minimal code changes. This integration allows you to leverage lakeFS's versioning and branching capabilities while maintaining familiar S3 operations. + +## Overview + +lakeFS provides full S3 API compatibility through its S3 Gateway, allowing you to use Boto3 with minimal configuration changes. This approach is ideal for: + +- **Existing S3 Applications**: Migrate applications with minimal code changes +- **Legacy Systems**: Integrate lakeFS into established workflows +- **Team Familiarity**: Leverage existing S3/Boto3 expertise +- **Gradual Migration**: Incrementally adopt lakeFS features + +### How It Works + +lakeFS repositories appear as S3 buckets, and branches/commits are represented in the object key path: + +``` +s3://my-repo/main/path/to/file.txt # main branch +s3://my-repo/feature-branch/path/to/file.txt # feature branch +s3://my-repo/c1a2b3c4d5e6f7g8/path/to/file.txt # specific commit +``` + +## Key Features + +### S3 API Compatibility +- **Complete S3 Operations** - PUT, GET, DELETE, LIST, HEAD operations +- **Multipart Uploads** - Support for large file uploads +- **Presigned URLs** - Generate temporary access URLs +- **Object Metadata** - Custom metadata and tagging support +- **Bucket Operations** - List repositories as buckets + +### lakeFS Integration Benefits +- **Version Control** - Every change is versioned and tracked +- **Branching** - Create isolated development environments +- **Atomic Operations** - Commit multiple changes atomically +- **Rollback Capability** - Easily revert to previous states +- **Audit Trail** - Complete history of all changes + +### Migration Advantages +- **Minimal Code Changes** - Usually just endpoint URL modification +- **Gradual Adoption** - Migrate services one at a time +- **Risk Reduction** - Test changes in isolated branches +- **Backward Compatibility** - Existing S3 tools continue to work + +## When to Use Boto3 with lakeFS + +### Ideal Use Cases +- **S3 Migration** - Moving existing S3-based applications to lakeFS +- **Legacy Integration** - Adding version control to existing systems +- **Data Pipeline Migration** - Converting ETL workflows to use lakeFS +- **Multi-Cloud Strategy** - Standardizing on S3 API across providers + +### Consider Alternatives When +- **New Development** - [High-Level SDK](../high-level-sdk/) offers more features +- 
**Advanced Features** - Need transactions, streaming, or advanced operations +- **Performance Critical** - Direct API access may be more efficient +- **Complex Workflows** - [lakefs-spec](../lakefs-spec/) better for data science + +## Documentation Sections + +- **[Configuration](configuration.md)** - Setup and configuration options +- **[S3 Operations](s3-operations.md)** - S3-compatible operations with lakeFS +- **[S3 Router](s3-router.md)** - Hybrid S3/lakeFS routing + +## Quick Example + +```python +import boto3 + +# Configure Boto3 client for lakeFS +s3 = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' +) + +# Use standard S3 operations +s3.put_object( + Bucket='my-repo', + Key='main/data/file.txt', + Body=b'Hello, lakeFS!' +) + +# List objects +response = s3.list_objects_v2(Bucket='my-repo', Prefix='main/') +for obj in response.get('Contents', []): + print(obj['Key']) +``` + +## Installation + +```bash +pip install boto3 +``` + +## Next Steps + +- Start with [configuration setup](configuration.md) +- Learn about [S3 operations](s3-operations.md) +- Explore [S3 Router](s3-router.md) for hybrid workflows + +## See Also + +**SDK Selection and Comparison:** +- [Python SDK Overview](../index.md) - Compare all Python SDK options +- [SDK Decision Matrix](../index.md#sdk-selection-decision-matrix) - Choose the right SDK for your use case +- [API Feature Comparison](../reference/api-comparison.md) - Detailed feature comparison across SDKs + +**Boto3 Integration Documentation:** +- [Configuration Guide](configuration.md) - Complete setup and authentication options +- [S3 Operations](s3-operations.md) - S3-compatible operations with lakeFS +- [S3 Router](s3-router.md) - Hybrid S3/lakeFS routing for gradual migration +- [Troubleshooting](troubleshooting.md) - Common issues and solutions + +**Migration and Integration:** +- [S3 Migration Patterns](s3-operations.md#migration-patterns) - Convert existing S3 code +- [Hybrid Workflows](s3-router.md#hybrid-configurations) - Combine S3 and lakeFS +- [Legacy System Integration](../reference/best-practices.md#legacy-integration) - Integration strategies + +**Alternative SDK Options:** +- [High-Level SDK](../high-level-sdk/index.md) - More features for new development +- [High-Level SDK Quickstart](../high-level-sdk/quickstart.md) - Object-oriented interface +- [Generated SDK](../generated-sdk/index.md) - Direct API access for custom operations +- [lakefs-spec](../lakefs-spec/index.md) - Filesystem interface for data science + +**Setup and Configuration:** +- [Installation Guide](../getting-started.md) - Complete setup instructions for all SDKs +- [Authentication Methods](../getting-started.md#authentication-and-configuration) - All credential configuration options +- [Best Practices](../reference/best-practices.md#boto3-configuration) - Production configuration guidance + +**Learning Resources:** +- [ETL Pipeline Tutorial](../tutorials/etl-pipeline.md) - Building data pipelines with S3 operations +- [Data Migration Examples](s3-operations.md#data-migration) - Real-world migration scenarios +- [Batch Processing Patterns](../tutorials/etl-pipeline.md#batch-processing) - Large-scale data operations + +**Reference Materials:** +- [S3 API Compatibility](s3-operations.md#api-compatibility) - Supported S3 operations +- [Error Handling](../reference/troubleshooting.md#boto3-issues) - Common issues and solutions +- [Performance 
Optimization](../reference/best-practices.md#boto3-performance) - Optimize S3 operations + +**External Resources:** +- [Boto3 Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html){:target="_blank"} - Official Boto3 documentation +- [AWS S3 API Reference](https://docs.aws.amazon.com/s3/latest/API/){:target="_blank"} - S3 API specification +- [lakeFS S3 Gateway](https://docs.lakefs.io/integrations/aws_cli.html){:target="_blank"} - lakeFS S3 compatibility documentation +- [S3 Migration Best Practices](https://aws.amazon.com/s3/migration/){:target="_blank"} - AWS migration guidance \ No newline at end of file diff --git a/docs/src/integrations/python/boto3/s3-operations.md b/docs/src/integrations/python/boto3/s3-operations.md new file mode 100644 index 00000000000..49b60aee972 --- /dev/null +++ b/docs/src/integrations/python/boto3/s3-operations.md @@ -0,0 +1,775 @@ +--- +title: S3 Operations with lakeFS +description: Comprehensive guide to S3-compatible operations using Boto3 with lakeFS +sdk_types: ["boto3"] +difficulty: "beginner" +use_cases: ["s3-migration", "object-storage"] +--- + +# S3 Operations with lakeFS + +This guide covers all S3-compatible operations available when using Boto3 with lakeFS, including object management, multipart uploads, and presigned URLs. + +## Understanding lakeFS Object Paths + +In lakeFS, object keys include the branch or commit reference: + +``` +Format: {branch-or-commit}/{path/to/object} + +Examples: +main/data/users.csv # File on main branch +feature-branch/data/users.csv # File on feature branch +c1a2b3c4d5e6f7g8/data/users.csv # File at specific commit +``` + +## Basic Object Operations + +### Upload Objects (PUT) + +Upload objects to lakeFS repositories using standard S3 operations: + +```python +import boto3 + +s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' +) + +# Upload string content +s3_client.put_object( + Bucket='my-repo', + Key='main/data/hello.txt', + Body='Hello, lakeFS!', + ContentType='text/plain' +) + +# Upload binary content +with open('local-file.pdf', 'rb') as f: + s3_client.put_object( + Bucket='my-repo', + Key='main/documents/file.pdf', + Body=f, + ContentType='application/pdf' + ) + +# Upload with metadata +s3_client.put_object( + Bucket='my-repo', + Key='main/data/processed.json', + Body='{"status": "processed"}', + ContentType='application/json', + Metadata={ + 'processed-by': 'data-pipeline', + 'version': '1.0', + 'timestamp': '2024-01-15T10:30:00Z' + } +) + +print("Objects uploaded successfully!") +``` + +### Download Objects (GET) + +Retrieve objects from lakeFS: + +```python +# Download object content +response = s3_client.get_object( + Bucket='my-repo', + Key='main/data/hello.txt' +) + +content = response['Body'].read().decode('utf-8') +print(f"Content: {content}") + +# Download with metadata +response = s3_client.get_object( + Bucket='my-repo', + Key='main/data/processed.json' +) + +# Access content +content = response['Body'].read().decode('utf-8') +print(f"Content: {content}") + +# Access metadata +metadata = response.get('Metadata', {}) +print(f"Metadata: {metadata}") + +# Access standard attributes +print(f"Content Type: {response['ContentType']}") +print(f"Content Length: {response['ContentLength']}") +print(f"Last Modified: {response['LastModified']}") +print(f"ETag: {response['ETag']}") +``` + +**Expected Output:** +``` +Content: Hello, lakeFS! 
+Content: {"status": "processed"} +Metadata: {'processed-by': 'data-pipeline', 'version': '1.0', 'timestamp': '2024-01-15T10:30:00Z'} +Content Type: application/json +Content Length: 23 +Last Modified: 2024-01-15 10:30:00+00:00 +ETag: "d41d8cd98f00b204e9800998ecf8427e" +``` + +### Download to File + +```python +# Download directly to file +s3_client.download_file( + Bucket='my-repo', + Key='main/documents/file.pdf', + Filename='downloaded-file.pdf' +) + +# Download with error handling +try: + s3_client.download_file( + Bucket='my-repo', + Key='main/data/large-dataset.csv', + Filename='local-dataset.csv' + ) + print("File downloaded successfully!") +except Exception as e: + print(f"Download failed: {e}") +``` + +### Object Information (HEAD) + +Get object metadata without downloading content: + +```python +# Get object metadata +response = s3_client.head_object( + Bucket='my-repo', + Key='main/data/processed.json' +) + +print(f"Content Type: {response['ContentType']}") +print(f"Content Length: {response['ContentLength']}") +print(f"Last Modified: {response['LastModified']}") +print(f"Custom Metadata: {response.get('Metadata', {})}") + +# Check if object exists +def object_exists(bucket, key): + try: + s3_client.head_object(Bucket=bucket, Key=key) + return True + except s3_client.exceptions.NoSuchKey: + return False + except Exception as e: + print(f"Error checking object: {e}") + return False + +if object_exists('my-repo', 'main/data/hello.txt'): + print("Object exists!") +``` + +### Delete Objects + +Remove objects from lakeFS: + +```python +# Delete single object +s3_client.delete_object( + Bucket='my-repo', + Key='main/temp/temporary-file.txt' +) + +# Delete multiple objects +objects_to_delete = [ + {'Key': 'main/temp/file1.txt'}, + {'Key': 'main/temp/file2.txt'}, + {'Key': 'main/temp/file3.txt'} +] + +response = s3_client.delete_objects( + Bucket='my-repo', + Delete={ + 'Objects': objects_to_delete, + 'Quiet': False # Set to True to suppress response details + } +) + +# Check deletion results +for deleted in response.get('Deleted', []): + print(f"Deleted: {deleted['Key']}") + +for error in response.get('Errors', []): + print(f"Error deleting {error['Key']}: {error['Message']}") +``` + +## List Operations + +### List Objects + +List objects in repositories and branches: + +```python +# List all objects in main branch +response = s3_client.list_objects_v2( + Bucket='my-repo', + Prefix='main/' +) + +print(f"Found {response.get('KeyCount', 0)} objects:") +for obj in response.get('Contents', []): + print(f" {obj['Key']} ({obj['Size']} bytes)") + +# List with pagination +paginator = s3_client.get_paginator('list_objects_v2') +page_iterator = paginator.paginate( + Bucket='my-repo', + Prefix='main/data/', + PaginationConfig={'PageSize': 100} +) + +for page in page_iterator: + for obj in page.get('Contents', []): + print(f"{obj['Key']} - {obj['LastModified']}") +``` + +### List with Filtering + +```python +# List objects with specific extension +def list_objects_by_extension(bucket, prefix, extension): + """List objects with specific file extension""" + response = s3_client.list_objects_v2( + Bucket=bucket, + Prefix=prefix + ) + + filtered_objects = [] + for obj in response.get('Contents', []): + if obj['Key'].endswith(extension): + filtered_objects.append(obj) + + return filtered_objects + +# Find all CSV files in data directory +csv_files = list_objects_by_extension('my-repo', 'main/data/', '.csv') +print(f"Found {len(csv_files)} CSV files:") +for obj in csv_files: + print(f" {obj['Key']} 
({obj['Size']} bytes)") +``` + +### List Repositories (Buckets) + +```python +# List all repositories +response = s3_client.list_buckets() + +print(f"Found {len(response['Buckets'])} repositories:") +for bucket in response['Buckets']: + print(f" {bucket['Name']} (created: {bucket['CreationDate']})") +``` + +## Multipart Upload Operations + +For large files, use multipart uploads: + +```python +import os +from concurrent.futures import ThreadPoolExecutor + +def multipart_upload(bucket, key, file_path, part_size=5*1024*1024): + """Upload large file using multipart upload""" + + # Initialize multipart upload + response = s3_client.create_multipart_upload( + Bucket=bucket, + Key=key, + ContentType='application/octet-stream' + ) + upload_id = response['UploadId'] + + try: + # Calculate parts + file_size = os.path.getsize(file_path) + parts = [] + part_number = 1 + + with open(file_path, 'rb') as f: + while True: + data = f.read(part_size) + if not data: + break + + # Upload part + part_response = s3_client.upload_part( + Bucket=bucket, + Key=key, + PartNumber=part_number, + UploadId=upload_id, + Body=data + ) + + parts.append({ + 'ETag': part_response['ETag'], + 'PartNumber': part_number + }) + + print(f"Uploaded part {part_number}") + part_number += 1 + + # Complete multipart upload + s3_client.complete_multipart_upload( + Bucket=bucket, + Key=key, + UploadId=upload_id, + MultipartUpload={'Parts': parts} + ) + + print(f"Multipart upload completed: {key}") + return True + + except Exception as e: + # Abort upload on error + s3_client.abort_multipart_upload( + Bucket=bucket, + Key=key, + UploadId=upload_id + ) + print(f"Multipart upload aborted: {e}") + return False + +# Usage +success = multipart_upload( + bucket='my-repo', + key='main/large-files/dataset.zip', + file_path='local-large-file.zip' +) +``` + +### Parallel Multipart Upload + +```python +def parallel_multipart_upload(bucket, key, file_path, part_size=5*1024*1024, max_workers=4): + """Upload large file using parallel multipart upload""" + + # Initialize multipart upload + response = s3_client.create_multipart_upload( + Bucket=bucket, + Key=key + ) + upload_id = response['UploadId'] + + def upload_part(part_info): + part_number, start, size = part_info + with open(file_path, 'rb') as f: + f.seek(start) + data = f.read(size) + + response = s3_client.upload_part( + Bucket=bucket, + Key=key, + PartNumber=part_number, + UploadId=upload_id, + Body=data + ) + + return { + 'ETag': response['ETag'], + 'PartNumber': part_number + } + + try: + # Calculate parts + file_size = os.path.getsize(file_path) + parts_info = [] + part_number = 1 + start = 0 + + while start < file_size: + size = min(part_size, file_size - start) + parts_info.append((part_number, start, size)) + start += size + part_number += 1 + + # Upload parts in parallel + with ThreadPoolExecutor(max_workers=max_workers) as executor: + parts = list(executor.map(upload_part, parts_info)) + + # Sort parts by part number + parts.sort(key=lambda x: x['PartNumber']) + + # Complete multipart upload + s3_client.complete_multipart_upload( + Bucket=bucket, + Key=key, + UploadId=upload_id, + MultipartUpload={'Parts': parts} + ) + + print(f"Parallel multipart upload completed: {key}") + return True + + except Exception as e: + # Abort upload on error + s3_client.abort_multipart_upload( + Bucket=bucket, + Key=key, + UploadId=upload_id + ) + print(f"Parallel multipart upload failed: {e}") + return False + +# Usage +success = parallel_multipart_upload( + bucket='my-repo', + 
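    # Each worker seeks to its own byte range of the local file, so raising
    # max_workers mainly helps when upload bandwidth (not disk I/O) is the bottleneck.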
key='main/large-files/big-dataset.zip', + file_path='very-large-file.zip', + max_workers=8 +) +``` + +## Presigned URLs + +Generate temporary URLs for secure access: + +```python +from botocore.exceptions import ClientError + +# Generate presigned URL for download +def generate_download_url(bucket, key, expiration=3600): + """Generate presigned URL for downloading object""" + try: + url = s3_client.generate_presigned_url( + 'get_object', + Params={'Bucket': bucket, 'Key': key}, + ExpiresIn=expiration + ) + return url + except ClientError as e: + print(f"Error generating presigned URL: {e}") + return None + +# Generate presigned URL for upload +def generate_upload_url(bucket, key, expiration=3600): + """Generate presigned URL for uploading object""" + try: + url = s3_client.generate_presigned_url( + 'put_object', + Params={'Bucket': bucket, 'Key': key}, + ExpiresIn=expiration + ) + return url + except ClientError as e: + print(f"Error generating presigned URL: {e}") + return None + +# Usage +download_url = generate_download_url('my-repo', 'main/data/report.pdf', expiration=7200) +if download_url: + print(f"Download URL (valid for 2 hours): {download_url}") + +upload_url = generate_upload_url('my-repo', 'main/uploads/new-file.txt', expiration=1800) +if upload_url: + print(f"Upload URL (valid for 30 minutes): {upload_url}") +``` + +### Using Presigned URLs + +```python +import requests + +# Upload using presigned URL +def upload_with_presigned_url(presigned_url, file_path): + """Upload file using presigned URL""" + try: + with open(file_path, 'rb') as f: + response = requests.put(presigned_url, data=f) + response.raise_for_status() + print("Upload successful!") + return True + except Exception as e: + print(f"Upload failed: {e}") + return False + +# Download using presigned URL +def download_with_presigned_url(presigned_url, save_path): + """Download file using presigned URL""" + try: + response = requests.get(presigned_url) + response.raise_for_status() + + with open(save_path, 'wb') as f: + f.write(response.content) + print("Download successful!") + return True + except Exception as e: + print(f"Download failed: {e}") + return False + +# Usage +upload_url = generate_upload_url('my-repo', 'main/uploads/document.pdf') +if upload_url: + upload_with_presigned_url(upload_url, 'local-document.pdf') + +download_url = generate_download_url('my-repo', 'main/data/report.pdf') +if download_url: + download_with_presigned_url(download_url, 'downloaded-report.pdf') +``` + +## Object Metadata Operations + +### Working with Custom Metadata + +```python +# Upload with extensive metadata +s3_client.put_object( + Bucket='my-repo', + Key='main/datasets/customer-data.csv', + Body=open('customer-data.csv', 'rb'), + ContentType='text/csv', + Metadata={ + 'source': 'customer-database', + 'extracted-date': '2024-01-15', + 'record-count': '10000', + 'schema-version': '2.1', + 'data-classification': 'sensitive' + }, + # Standard S3 metadata + ContentDisposition='attachment; filename="customer-data.csv"', + ContentLanguage='en', + CacheControl='max-age=3600' +) + +# Retrieve and display metadata +response = s3_client.head_object( + Bucket='my-repo', + Key='main/datasets/customer-data.csv' +) + +print("Standard Metadata:") +print(f" Content Type: {response.get('ContentType')}") +print(f" Content Length: {response.get('ContentLength')}") +print(f" Last Modified: {response.get('LastModified')}") +print(f" ETag: {response.get('ETag')}") + +print("\nCustom Metadata:") +for key, value in response.get('Metadata', 
{}).items(): + print(f" {key}: {value}") +``` + +### Copy Objects with Metadata + +```python +# Copy object with metadata preservation +s3_client.copy_object( + Bucket='my-repo', + Key='feature-branch/datasets/customer-data.csv', + CopySource={ + 'Bucket': 'my-repo', + 'Key': 'main/datasets/customer-data.csv' + }, + MetadataDirective='COPY' # Preserve original metadata +) + +# Copy object with new metadata +s3_client.copy_object( + Bucket='my-repo', + Key='feature-branch/datasets/customer-data-v2.csv', + CopySource={ + 'Bucket': 'my-repo', + 'Key': 'main/datasets/customer-data.csv' + }, + MetadataDirective='REPLACE', + Metadata={ + 'source': 'customer-database', + 'version': '2.0', + 'updated-date': '2024-01-16' + }, + ContentType='text/csv' +) +``` + +## Error Handling + +### Common Error Patterns + +```python +from botocore.exceptions import ClientError, NoCredentialsError + +def robust_s3_operation(operation_func, *args, **kwargs): + """Execute S3 operation with comprehensive error handling""" + try: + return operation_func(*args, **kwargs) + + except NoCredentialsError: + print("Error: AWS credentials not found") + print("Check your access key configuration") + + except ClientError as e: + error_code = e.response['Error']['Code'] + error_message = e.response['Error']['Message'] + + if error_code == 'NoSuchBucket': + print(f"Error: Repository not found - {error_message}") + elif error_code == 'NoSuchKey': + print(f"Error: Object not found - {error_message}") + elif error_code == 'AccessDenied': + print(f"Error: Access denied - {error_message}") + elif error_code == 'InvalidRequest': + print(f"Error: Invalid request - {error_message}") + else: + print(f"Error: {error_code} - {error_message}") + + except Exception as e: + print(f"Unexpected error: {e}") + +# Usage examples +robust_s3_operation( + s3_client.get_object, + Bucket='my-repo', + Key='main/data/nonexistent.txt' +) + +robust_s3_operation( + s3_client.put_object, + Bucket='nonexistent-repo', + Key='main/data/test.txt', + Body='test content' +) +``` + +### Retry Logic + +```python +import time +from botocore.exceptions import ClientError + +def s3_operation_with_retry(operation_func, max_retries=3, backoff_factor=1, *args, **kwargs): + """Execute S3 operation with exponential backoff retry""" + + for attempt in range(max_retries + 1): + try: + return operation_func(*args, **kwargs) + + except ClientError as e: + error_code = e.response['Error']['Code'] + + # Don't retry certain errors + if error_code in ['NoSuchBucket', 'NoSuchKey', 'AccessDenied']: + raise + + if attempt == max_retries: + print(f"Max retries ({max_retries}) exceeded") + raise + + wait_time = backoff_factor * (2 ** attempt) + print(f"Attempt {attempt + 1} failed, retrying in {wait_time}s...") + time.sleep(wait_time) + + except Exception as e: + if attempt == max_retries: + raise + + wait_time = backoff_factor * (2 ** attempt) + print(f"Attempt {attempt + 1} failed, retrying in {wait_time}s...") + time.sleep(wait_time) + +# Usage +try: + response = s3_operation_with_retry( + s3_client.put_object, + max_retries=3, + backoff_factor=1, + Bucket='my-repo', + Key='main/data/important-file.txt', + Body='Important content' + ) + print("Operation successful!") +except Exception as e: + print(f"Operation failed after retries: {e}") +``` + +## Performance Optimization + +### Batch Operations + +```python +def batch_upload_objects(bucket, objects_data, max_workers=5): + """Upload multiple objects in parallel""" + from concurrent.futures import ThreadPoolExecutor, as_completed 
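    # A single boto3 client is shared across the worker threads below; boto3
    # documents clients (unlike resources) as safe for concurrent use.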
+ + def upload_single_object(item): + key, body = item + try: + s3_client.put_object( + Bucket=bucket, + Key=key, + Body=body + ) + return f"✓ {key}" + except Exception as e: + return f"✗ {key}: {e}" + + results = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_key = { + executor.submit(upload_single_object, item): item[0] + for item in objects_data + } + + for future in as_completed(future_to_key): + result = future.result() + results.append(result) + print(result) + + return results + +# Usage +objects_to_upload = [ + ('main/data/file1.txt', 'Content 1'), + ('main/data/file2.txt', 'Content 2'), + ('main/data/file3.txt', 'Content 3'), + ('main/data/file4.txt', 'Content 4'), + ('main/data/file5.txt', 'Content 5') +] + +results = batch_upload_objects('my-repo', objects_to_upload) +print(f"Batch upload completed: {len(results)} operations") +``` + +### Connection Pooling + +```python +import boto3 +from botocore.config import Config + +# Configure connection pooling for better performance +config = Config( + max_pool_connections=50, # Increase connection pool size + retries={ + 'max_attempts': 3, + 'mode': 'adaptive' + } +) + +s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + config=config +) +``` + +## Next Steps + +- Learn about [S3 Router](s3-router.md) for hybrid S3/lakeFS workflows +- Explore [troubleshooting guide](../reference/troubleshooting.md) for common issues +- Review [best practices](../reference/best-practices.md) for production usage +- Check out [Boto3 configuration](configuration.md) for advanced setup options + +## See Also + +- [High-Level SDK](../high-level-sdk/) - For advanced lakeFS features +- [Generated SDK](../generated-sdk/) - For direct API access +- [lakefs-spec](../lakefs-spec/) - For filesystem-style operations \ No newline at end of file diff --git a/docs/src/integrations/python/boto3/s3-router.md b/docs/src/integrations/python/boto3/s3-router.md new file mode 100644 index 00000000000..d7acd72b4d8 --- /dev/null +++ b/docs/src/integrations/python/boto3/s3-router.md @@ -0,0 +1,934 @@ +--- +title: Boto S3 Router Integration +description: Comprehensive guide to using Boto S3 Router for hybrid S3/lakeFS workflows +sdk_types: ["boto3"] +difficulty: "intermediate" +use_cases: ["hybrid-workflows", "gradual-migration", "multi-storage"] +--- + +# Boto S3 Router Integration + +The Boto S3 Router enables hybrid workflows that seamlessly route operations between traditional S3 buckets and lakeFS repositories based on configurable rules. This allows for gradual migration and mixed storage strategies. 
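
At its core, the router applies an ordered list of bucket-name patterns and sends each request to the first backend whose pattern matches. The sketch below is only a conceptual illustration of that first-match-wins idea — it is not the router's implementation, and the patterns and targets are assumptions mirroring the example configuration later on this page:

```python
from fnmatch import fnmatch

# Illustrative routing table (assumed rules, mirroring the example YAML below)
ROUTING_RULES = [
    ("lakefs-*", "lakefs"),     # versioned repositories served by lakeFS
    ("versioned-*", "lakefs"),
    ("*", "s3"),                # everything else falls through to plain S3
]

def route_bucket(bucket_name: str) -> str:
    """Return the backend selected by the first matching pattern."""
    for pattern, target in ROUTING_RULES:
        if fnmatch(bucket_name, pattern):
            return target
    return "s3"  # defensive default; the catch-all rule above normally applies

print(route_bucket("lakefs-data-repo"))       # -> lakefs
print(route_bucket("traditional-s3-bucket"))  # -> s3
```

Because evaluation stops at the first match, more specific patterns should be listed before the catch-all `*` rule.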
+ +## Overview + +The S3 Router acts as an intelligent proxy that: + +- **Routes Operations** - Directs S3 operations to appropriate storage (S3 or lakeFS) +- **Enables Gradual Migration** - Migrate repositories one at a time +- **Supports Hybrid Workflows** - Use both S3 and lakeFS in the same application +- **Maintains Compatibility** - No changes to existing Boto3 code required + +### Architecture + +``` +Application (Boto3) → S3 Router → S3 Bucket + → lakeFS Repository +``` + +The router examines each request and routes it based on: +- Bucket/repository name patterns +- Request type (read/write operations) +- Custom routing rules +- Fallback strategies + +## Installation and Setup + +### Install S3 Router + +```bash +# Install the S3 Router package +pip install lakefs-s3-router + +# Or install from source +git clone https://github.com/treeverse/lakefs-s3-router.git +cd lakefs-s3-router +pip install -e . +``` + +### Basic Configuration + +Create a configuration file for the S3 Router: + +```yaml +# s3-router-config.yaml +router: + listen_address: "127.0.0.1:8080" + +# lakeFS configuration +lakefs: + endpoint: "http://localhost:8000" + access_key_id: "AKIAIOSFODNN7EXAMPLE" + secret_access_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + +# S3 configuration +s3: + region: "us-east-1" + # Uses default AWS credentials (environment, IAM role, etc.) + +# Routing rules +routing: + rules: + - pattern: "lakefs-*" + target: "lakefs" + - pattern: "versioned-*" + target: "lakefs" + - pattern: "*" + target: "s3" + + # Default fallback + default_target: "s3" +``` + +### Start the S3 Router + +```bash +# Start the router with configuration +lakefs-s3-router --config s3-router-config.yaml + +# Or with environment variables +export LAKEFS_ENDPOINT=http://localhost:8000 +export LAKEFS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE +export LAKEFS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +export AWS_REGION=us-east-1 + +lakefs-s3-router --listen 127.0.0.1:8080 +``` + +## Boto3 Client Configuration + +Configure Boto3 to use the S3 Router: + +```python +import boto3 + +# Configure Boto3 to use S3 Router +s3_client = boto3.client('s3', + endpoint_url='http://127.0.0.1:8080', # S3 Router endpoint + aws_access_key_id='your-aws-access-key', + aws_secret_access_key='your-aws-secret-key', + region_name='us-east-1' +) + +# Now all S3 operations will be routed automatically +# Operations on 'lakefs-*' buckets go to lakeFS +# Operations on other buckets go to S3 + +# This goes to lakeFS +s3_client.put_object( + Bucket='lakefs-data-repo', + Key='main/data/file.txt', + Body='Hello from lakeFS!' +) + +# This goes to S3 +s3_client.put_object( + Bucket='traditional-s3-bucket', + Key='data/file.txt', + Body='Hello from S3!' 
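    # Note: no branch prefix in the key — this bucket name only matches the
    # catch-all rule, so the router forwards the request to plain S3, not lakeFS.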
+) +``` + +## Routing Configuration + +### Pattern-Based Routing + +Configure routing rules based on bucket name patterns: + +```yaml +# Advanced routing configuration +routing: + rules: + # Route all repositories starting with 'data-' to lakeFS + - pattern: "data-*" + target: "lakefs" + description: "Data repositories" + + # Route ML experiment repositories to lakeFS + - pattern: "ml-experiments-*" + target: "lakefs" + description: "ML experiment tracking" + + # Route backup buckets to S3 + - pattern: "*-backup" + target: "s3" + description: "Backup storage" + + # Route logs to S3 (cheaper for append-only data) + - pattern: "logs-*" + target: "s3" + description: "Log storage" + + # Default: route everything else to S3 + - pattern: "*" + target: "s3" + description: "Default S3 storage" +``` + +### Operation-Based Routing + +Route based on operation types: + +```yaml +routing: + rules: + # Route write operations to lakeFS for versioning + - pattern: "critical-data-*" + target: "lakefs" + operations: ["PUT", "POST", "DELETE"] + description: "Version-controlled writes" + + # Route read operations to S3 for performance + - pattern: "critical-data-*" + target: "s3" + operations: ["GET", "HEAD"] + description: "Fast reads from S3" + + # Fallback for other operations + - pattern: "critical-data-*" + target: "lakefs" + description: "Default to lakeFS" +``` + +### Environment-Based Routing + +Different routing for different environments: + +```yaml +# Production configuration +routing: + environment: "production" + rules: + - pattern: "prod-*" + target: "lakefs" + - pattern: "staging-*" + target: "s3" + - pattern: "*" + target: "s3" + +--- +# Development configuration +routing: + environment: "development" + rules: + - pattern: "dev-*" + target: "lakefs" + - pattern: "*" + target: "lakefs" # Everything to lakeFS in dev +``` + +## Hybrid Workflow Examples + +### Gradual Migration Strategy + +Migrate repositories from S3 to lakeFS incrementally: + +```python +import boto3 +from datetime import datetime + +# S3 Router client +s3_client = boto3.client('s3', + endpoint_url='http://127.0.0.1:8080', + aws_access_key_id='your-aws-access-key', + aws_secret_access_key='your-aws-secret-key' +) + +class HybridDataManager: + """Manage data across S3 and lakeFS through S3 Router""" + + def __init__(self, s3_client): + self.s3_client = s3_client + + def migrate_bucket_to_lakefs(self, s3_bucket, lakefs_repo): + """Migrate data from S3 bucket to lakeFS repository""" + + print(f"Starting migration: {s3_bucket} → {lakefs_repo}") + + # List all objects in S3 bucket + paginator = self.s3_client.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=s3_bucket) + + migrated_count = 0 + for page in pages: + for obj in page.get('Contents', []): + try: + # Copy object from S3 to lakeFS + self.s3_client.copy_object( + Bucket=lakefs_repo, + Key=f"main/{obj['Key']}", # Add to main branch + CopySource={ + 'Bucket': s3_bucket, + 'Key': obj['Key'] + } + ) + migrated_count += 1 + + if migrated_count % 100 == 0: + print(f"Migrated {migrated_count} objects...") + + except Exception as e: + print(f"Failed to migrate {obj['Key']}: {e}") + + print(f"Migration completed: {migrated_count} objects migrated") + + def sync_repositories(self, source_repo, target_repo, branch='main'): + """Sync data between repositories""" + + # List objects in source + response = self.s3_client.list_objects_v2( + Bucket=source_repo, + Prefix=f"{branch}/" + ) + + for obj in response.get('Contents', []): + # Check if object exists in target + 
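            # Naive sync: only objects missing from the target are copied below;
            # comparing ETag or LastModified as well would also pick up updated objects.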
target_key = obj['Key'] + + try: + self.s3_client.head_object( + Bucket=target_repo, + Key=target_key + ) + # Object exists, check if newer + # Implementation depends on your sync strategy + + except self.s3_client.exceptions.NoSuchKey: + # Object doesn't exist, copy it + self.s3_client.copy_object( + Bucket=target_repo, + Key=target_key, + CopySource={ + 'Bucket': source_repo, + 'Key': obj['Key'] + } + ) + print(f"Synced: {target_key}") + +# Usage +manager = HybridDataManager(s3_client) + +# Migrate specific bucket to lakeFS +manager.migrate_bucket_to_lakefs('old-s3-bucket', 'lakefs-new-repo') + +# Sync between repositories +manager.sync_repositories('lakefs-source-repo', 'lakefs-target-repo') +``` + +### Multi-Environment Workflow + +Handle different environments with different storage strategies: + +```python +import os +import boto3 + +class MultiEnvironmentClient: + """S3 client that adapts to different environments""" + + def __init__(self): + self.environment = os.getenv('ENVIRONMENT', 'development') + self.s3_client = self._create_client() + + def _create_client(self): + """Create S3 client based on environment""" + + if self.environment == 'production': + # Production: Use S3 Router for hybrid workflows + return boto3.client('s3', + endpoint_url='http://s3-router.prod.example.com:8080', + aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY') + ) + + elif self.environment == 'staging': + # Staging: Direct lakeFS for testing + return boto3.client('s3', + endpoint_url='http://lakefs.staging.example.com:8000', + aws_access_key_id=os.getenv('LAKEFS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('LAKEFS_SECRET_ACCESS_KEY') + ) + + else: + # Development: Local lakeFS + return boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + ) + + def get_bucket_name(self, logical_name): + """Get environment-specific bucket name""" + + bucket_mapping = { + 'production': { + 'user-data': 'lakefs-prod-users', # → lakeFS + 'analytics': 'lakefs-prod-analytics', # → lakeFS + 'logs': 'prod-logs-s3', # → S3 + 'backups': 'prod-backups-s3' # → S3 + }, + 'staging': { + 'user-data': 'staging-users', + 'analytics': 'staging-analytics', + 'logs': 'staging-logs', + 'backups': 'staging-backups' + }, + 'development': { + 'user-data': 'dev-users', + 'analytics': 'dev-analytics', + 'logs': 'dev-logs', + 'backups': 'dev-backups' + } + } + + return bucket_mapping[self.environment][logical_name] + + def store_user_data(self, user_id, data): + """Store user data with environment-appropriate routing""" + + bucket = self.get_bucket_name('user-data') + key = f"main/users/{user_id}/profile.json" + + self.s3_client.put_object( + Bucket=bucket, + Key=key, + Body=data, + ContentType='application/json' + ) + + print(f"Stored user data in {bucket}/{key}") + + def store_analytics_data(self, dataset_name, data): + """Store analytics data""" + + bucket = self.get_bucket_name('analytics') + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + key = f"main/datasets/{dataset_name}/{timestamp}.parquet" + + self.s3_client.put_object( + Bucket=bucket, + Key=key, + Body=data, + ContentType='application/octet-stream' + ) + + print(f"Stored analytics data in {bucket}/{key}") + +# Usage +client = MultiEnvironmentClient() + +# These operations will be routed based on environment and configuration +client.store_user_data('user123', '{"name": "John", "email": 
"john@example.com"}') +client.store_analytics_data('daily-metrics', parquet_data) +``` + +## Advanced Configuration + +### Custom Routing Logic + +Implement custom routing logic with Python: + +```python +# custom_router.py +import re +from datetime import datetime + +class CustomRouter: + """Custom routing logic for S3 Router""" + + def __init__(self, config): + self.config = config + self.routing_rules = self._compile_rules() + + def _compile_rules(self): + """Compile routing rules for efficiency""" + rules = [] + for rule in self.config.get('routing', {}).get('rules', []): + pattern = rule['pattern'] + # Convert glob pattern to regex + regex_pattern = pattern.replace('*', '.*') + rules.append({ + 'regex': re.compile(regex_pattern), + 'target': rule['target'], + 'operations': rule.get('operations', []), + 'conditions': rule.get('conditions', {}) + }) + return rules + + def route_request(self, bucket_name, operation, metadata=None): + """Determine routing target for request""" + + for rule in self.routing_rules: + if rule['regex'].match(bucket_name): + # Check operation filter + if rule['operations'] and operation not in rule['operations']: + continue + + # Check custom conditions + if not self._check_conditions(rule['conditions'], metadata): + continue + + return rule['target'] + + # Default fallback + return self.config.get('routing', {}).get('default_target', 's3') + + def _check_conditions(self, conditions, metadata): + """Check custom routing conditions""" + + if not conditions: + return True + + # Time-based routing + if 'time_range' in conditions: + current_hour = datetime.now().hour + start, end = conditions['time_range'] + if not (start <= current_hour <= end): + return False + + # Size-based routing + if 'max_size' in conditions and metadata: + if metadata.get('content_length', 0) > conditions['max_size']: + return False + + # Content-type routing + if 'content_types' in conditions and metadata: + content_type = metadata.get('content_type', '') + if content_type not in conditions['content_types']: + return False + + return True + +# Configuration with custom conditions +config = { + 'routing': { + 'rules': [ + { + 'pattern': 'large-files-*', + 'target': 's3', + 'conditions': { + 'max_size': 100 * 1024 * 1024 # Files > 100MB go to S3 + } + }, + { + 'pattern': 'images-*', + 'target': 'lakefs', + 'conditions': { + 'content_types': ['image/jpeg', 'image/png', 'image/gif'] + } + }, + { + 'pattern': 'batch-*', + 'target': 'lakefs', + 'operations': ['PUT', 'POST'], + 'conditions': { + 'time_range': [2, 6] # Batch operations 2-6 AM + } + } + ], + 'default_target': 'lakefs' + } +} + +router = CustomRouter(config) +``` + +### Health Monitoring + +Monitor S3 Router health and routing decisions: + +```python +import boto3 +import time +import logging +from datetime import datetime + +class RouterMonitor: + """Monitor S3 Router health and performance""" + + def __init__(self, router_endpoint): + self.router_endpoint = router_endpoint + self.s3_client = boto3.client('s3', endpoint_url=router_endpoint) + self.logger = logging.getLogger(__name__) + + def health_check(self): + """Perform health check on S3 Router""" + + checks = { + 'router_connectivity': False, + 'lakefs_connectivity': False, + 's3_connectivity': False, + 'routing_accuracy': False + } + + try: + # Test router connectivity + response = self.s3_client.list_buckets() + checks['router_connectivity'] = True + + # Test lakeFS routing (assuming lakefs-* pattern) + test_bucket = 'lakefs-health-check' + try: + 
self.s3_client.head_bucket(Bucket=test_bucket) + checks['lakefs_connectivity'] = True + except: + pass + + # Test S3 routing + test_bucket = 's3-health-check' + try: + self.s3_client.head_bucket(Bucket=test_bucket) + checks['s3_connectivity'] = True + except: + pass + + # Test routing accuracy + checks['routing_accuracy'] = self._test_routing_accuracy() + + except Exception as e: + self.logger.error(f"Health check failed: {e}") + + return checks + + def _test_routing_accuracy(self): + """Test that routing works as expected""" + + test_cases = [ + ('lakefs-test', 'lakefs'), + ('s3-test', 's3'), + ('data-repo', 'lakefs'), # Based on your routing rules + ] + + for bucket, expected_target in test_cases: + try: + # This would require router API to report routing decisions + # Implementation depends on router capabilities + pass + except: + return False + + return True + + def monitor_performance(self, duration_minutes=5): + """Monitor router performance""" + + start_time = time.time() + end_time = start_time + (duration_minutes * 60) + + metrics = { + 'total_requests': 0, + 'successful_requests': 0, + 'failed_requests': 0, + 'average_response_time': 0, + 'routing_distribution': {} + } + + response_times = [] + + while time.time() < end_time: + try: + request_start = time.time() + + # Test request + self.s3_client.list_buckets() + + request_time = time.time() - request_start + response_times.append(request_time) + + metrics['total_requests'] += 1 + metrics['successful_requests'] += 1 + + except Exception as e: + metrics['total_requests'] += 1 + metrics['failed_requests'] += 1 + self.logger.error(f"Request failed: {e}") + + time.sleep(1) # Wait between requests + + # Calculate averages + if response_times: + metrics['average_response_time'] = sum(response_times) / len(response_times) + + return metrics + +# Usage +monitor = RouterMonitor('http://127.0.0.1:8080') + +# Perform health check +health = monitor.health_check() +print("Health Check Results:") +for check, status in health.items(): + print(f" {check}: {'✓' if status else '✗'}") + +# Monitor performance +print("\nMonitoring performance for 2 minutes...") +performance = monitor.monitor_performance(duration_minutes=2) +print("Performance Results:") +print(f" Total Requests: {performance['total_requests']}") +print(f" Success Rate: {performance['successful_requests']}/{performance['total_requests']}") +print(f" Average Response Time: {performance['average_response_time']:.3f}s") +``` + +## Migration Examples + +### From Pure S3 to Hybrid + +Migrate an existing S3-only application to use hybrid routing: + +```python +# Before: Direct S3 usage +import boto3 + +# Old configuration +s3_client = boto3.client('s3', + region_name='us-east-1' +) + +# Application code (unchanged) +def process_data(): + # Read from S3 + response = s3_client.get_object( + Bucket='data-bucket', + Key='input/data.csv' + ) + data = response['Body'].read() + + # Process data + processed_data = process_csv(data) + + # Write back to S3 + s3_client.put_object( + Bucket='results-bucket', + Key='output/processed.csv', + Body=processed_data + ) + +# After: Hybrid routing with S3 Router +import boto3 + +# New configuration - only endpoint changes! 
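# The bucket names below are assumed to match a 'lakefs-*' routing rule (as in the
# example router config), so these calls are forwarded to lakeFS and therefore use
# the '{branch}/{path}' key layout.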
+s3_client = boto3.client('s3', + endpoint_url='http://127.0.0.1:8080', # S3 Router + region_name='us-east-1' +) + +# Application code remains exactly the same +def process_data(): + # Now routes to lakeFS if bucket matches pattern + response = s3_client.get_object( + Bucket='lakefs-data-bucket', # Routes to lakeFS + Key='main/input/data.csv' # lakeFS path format + ) + data = response['Body'].read() + + # Process data (unchanged) + processed_data = process_csv(data) + + # Routes to lakeFS for versioning + s3_client.put_object( + Bucket='lakefs-results-bucket', # Routes to lakeFS + Key='main/output/processed.csv', + Body=processed_data + ) +``` + +### Gradual Repository Migration + +Migrate repositories one by one: + +```python +class GradualMigrationManager: + """Manage gradual migration from S3 to lakeFS""" + + def __init__(self, s3_router_client): + self.s3_client = s3_router_client + self.migration_status = {} + + def plan_migration(self, s3_buckets, lakefs_repos): + """Plan migration phases""" + + migration_plan = { + 'phase_1': { + 'buckets': s3_buckets[:2], # Start with 2 buckets + 'repos': lakefs_repos[:2], + 'priority': 'high' + }, + 'phase_2': { + 'buckets': s3_buckets[2:5], + 'repos': lakefs_repos[2:5], + 'priority': 'medium' + }, + 'phase_3': { + 'buckets': s3_buckets[5:], + 'repos': lakefs_repos[5:], + 'priority': 'low' + } + } + + return migration_plan + + def execute_phase(self, phase_config): + """Execute a migration phase""" + + for s3_bucket, lakefs_repo in zip(phase_config['buckets'], phase_config['repos']): + print(f"Migrating {s3_bucket} → {lakefs_repo}") + + # Update routing configuration to include new repo + self._update_routing_config(lakefs_repo) + + # Migrate data + self._migrate_bucket_data(s3_bucket, lakefs_repo) + + # Update application configuration + self._update_app_config(s3_bucket, lakefs_repo) + + # Validate migration + if self._validate_migration(s3_bucket, lakefs_repo): + self.migration_status[s3_bucket] = 'completed' + print(f"✓ Migration completed: {s3_bucket}") + else: + self.migration_status[s3_bucket] = 'failed' + print(f"✗ Migration failed: {s3_bucket}") + + def _update_routing_config(self, lakefs_repo): + """Update S3 Router configuration""" + # Implementation depends on your router configuration method + pass + + def _migrate_bucket_data(self, s3_bucket, lakefs_repo): + """Migrate data from S3 bucket to lakeFS repo""" + # Implementation similar to previous examples + pass + + def _update_app_config(self, old_bucket, new_repo): + """Update application configuration""" + # Update configuration files, environment variables, etc. 
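        # For example: repoint a bucket-name setting or environment variable from
        # old_bucket to new_repo and roll the change out to dependent services.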
+ pass + + def _validate_migration(self, s3_bucket, lakefs_repo): + """Validate successful migration""" + try: + # Compare object counts + s3_count = self._count_objects(s3_bucket) + lakefs_count = self._count_objects(lakefs_repo, prefix='main/') + + return s3_count == lakefs_count + except: + return False + + def _count_objects(self, bucket, prefix=''): + """Count objects in bucket""" + paginator = self.s3_client.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=bucket, Prefix=prefix) + + count = 0 + for page in pages: + count += len(page.get('Contents', [])) + + return count + +# Usage +migration_manager = GradualMigrationManager(s3_client) + +# Plan migration +s3_buckets = ['old-bucket-1', 'old-bucket-2', 'old-bucket-3'] +lakefs_repos = ['lakefs-repo-1', 'lakefs-repo-2', 'lakefs-repo-3'] + +plan = migration_manager.plan_migration(s3_buckets, lakefs_repos) + +# Execute phases +for phase_name, phase_config in plan.items(): + print(f"\nExecuting {phase_name}...") + migration_manager.execute_phase(phase_config) + + # Wait for validation before next phase + input(f"Phase {phase_name} completed. Press Enter to continue to next phase...") +``` + +## Troubleshooting + +### Common Issues + +```python +def diagnose_router_issues(s3_client): + """Diagnose common S3 Router issues""" + + issues = [] + + try: + # Test basic connectivity + s3_client.list_buckets() + print("✓ Router connectivity OK") + except Exception as e: + issues.append(f"Router connectivity failed: {e}") + + # Test routing accuracy + test_cases = [ + ('lakefs-test', 'Should route to lakeFS'), + ('s3-test', 'Should route to S3') + ] + + for bucket, description in test_cases: + try: + # Attempt operation + s3_client.head_bucket(Bucket=bucket) + print(f"✓ {description}") + except Exception as e: + issues.append(f"{description}: {e}") + + # Check configuration + if not os.getenv('LAKEFS_ENDPOINT'): + issues.append("LAKEFS_ENDPOINT environment variable not set") + + if not os.getenv('AWS_REGION'): + issues.append("AWS_REGION environment variable not set") + + return issues + +# Usage +issues = diagnose_router_issues(s3_client) +if issues: + print("Issues found:") + for issue in issues: + print(f" ✗ {issue}") +else: + print("✓ All checks passed") +``` + +### Performance Optimization + +```python +# Optimize S3 Router performance +import boto3 +from botocore.config import Config + +# Configure for high-throughput scenarios +config = Config( + max_pool_connections=100, # Increase connection pool + retries={ + 'max_attempts': 3, + 'mode': 'adaptive' + }, + # Increase timeouts for router processing + connect_timeout=30, + read_timeout=60 +) + +s3_client = boto3.client('s3', + endpoint_url='http://127.0.0.1:8080', + config=config +) +``` + +## Next Steps + +- Review [S3 operations](s3-operations.md) for detailed operation examples +- Check [troubleshooting guide](../reference/troubleshooting.md) for common issues +- Explore [configuration options](configuration.md) for advanced setup +- Learn about [best practices](../reference/best-practices.md) for production usage + +## See Also + +- [High-Level SDK](../high-level-sdk/) - For advanced lakeFS features +- [Generated SDK](../generated-sdk/) - For direct API access +- [Boto3 Configuration](configuration.md) - Detailed setup guide \ No newline at end of file diff --git a/docs/src/integrations/python/boto3/troubleshooting.md b/docs/src/integrations/python/boto3/troubleshooting.md new file mode 100644 index 00000000000..3f0fc5c8482 --- /dev/null +++ 
b/docs/src/integrations/python/boto3/troubleshooting.md @@ -0,0 +1,1178 @@ +--- +title: Boto3 Troubleshooting and Migration Guide +description: Comprehensive troubleshooting guide and migration patterns for Boto3 with lakeFS +sdk_types: ["boto3"] +difficulty: "intermediate" +use_cases: ["troubleshooting", "migration", "s3-compatibility"] +--- + +# Boto3 Troubleshooting and Migration Guide + +This guide covers common issues when using Boto3 with lakeFS and provides step-by-step migration patterns from pure S3 workflows. + +## Common Error Scenarios + +### Connection and Authentication Errors + +#### Error: "Could not connect to the endpoint URL" + +**Symptom:** +``` +botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://localhost:8000/" +``` + +**Causes and Solutions:** + +1. **lakeFS Server Not Running** + ```bash + # Check if lakeFS is running + curl http://localhost:8000/api/v1/healthcheck + + # Start lakeFS if needed + lakefs run + ``` + +2. **Wrong Endpoint URL** + ```python + # ✗ Wrong + s3_client = boto3.client('s3', + endpoint_url='http://localhost:8080' # Wrong port + ) + + # ✓ Correct + s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000' # Default lakeFS port + ) + ``` + +3. **Network/Firewall Issues** + ```python + # Test connectivity + import requests + try: + response = requests.get('http://localhost:8000/api/v1/healthcheck') + print(f"lakeFS is reachable: {response.status_code}") + except Exception as e: + print(f"Cannot reach lakeFS: {e}") + ``` + +#### Error: "The AWS Access Key Id you provided does not exist" + +**Symptom:** +``` +botocore.exceptions.ClientError: An error occurred (InvalidAccessKeyId) when calling the ListBuckets operation: The AWS Access Key Id you provided does not exist in our records. +``` + +**Solutions:** + +1. **Check Credentials** + ```python + # Verify credentials are set correctly + import os + + print("Endpoint:", os.getenv('LAKEFS_ENDPOINT')) + print("Access Key:", os.getenv('LAKEFS_ACCESS_KEY_ID')) + print("Secret Key:", "***" if os.getenv('LAKEFS_SECRET_ACCESS_KEY') else "NOT SET") + ``` + +2. **Create New Access Keys** + ```bash + # Using lakectl + lakectl auth users credentials create --user admin + ``` + +3. **Validate Credentials** + ```python + def validate_credentials(endpoint, access_key, secret_key): + """Validate lakeFS credentials""" + try: + client = boto3.client('s3', + endpoint_url=endpoint, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key + ) + + response = client.list_buckets() + print(f"✓ Credentials valid. Found {len(response['Buckets'])} repositories") + return True + + except Exception as e: + print(f"✗ Credentials invalid: {e}") + return False + + # Test + validate_credentials( + 'http://localhost:8000', + 'AKIAIOSFODNN7EXAMPLE', + 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + ) + ``` + +### SSL and Certificate Errors + +#### Error: "SSL: CERTIFICATE_VERIFY_FAILED" + +**Symptom:** +``` +ssl.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate +``` + +**Solutions:** + +1. **Disable SSL Verification (Development Only)** + ```python + import urllib3 + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + s3_client = boto3.client('s3', + endpoint_url='https://lakefs.example.com', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + verify=False # Disable SSL verification + ) + ``` + +2. 
**Provide Custom CA Certificate** + ```python + s3_client = boto3.client('s3', + endpoint_url='https://lakefs.example.com', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + verify='/path/to/ca-bundle.pem' # Custom CA certificate + ) + ``` + +3. **Use HTTP for Development** + ```python + # For local development, use HTTP instead of HTTPS + s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', # HTTP instead of HTTPS + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + ) + ``` + +### Checksum and Compatibility Errors + +#### Error: "An error occurred (InvalidRequest) when calling the PutObject operation" + +**Symptom:** +``` +botocore.exceptions.ClientError: An error occurred (InvalidRequest) when calling the PutObject operation: Content-MD5 header is required +``` + +**Cause:** Newer versions of Boto3 automatically calculate checksums that may not be compatible with lakeFS. + +**Solutions:** + +1. **Configure Checksum Settings** + ```python + from botocore.config import Config + + # Disable automatic checksums + config = Config( + request_checksum_calculation='when_required', + response_checksum_validation='when_required' + ) + + s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + config=config + ) + ``` + +2. **Downgrade Boto3 (Temporary Solution)** + ```bash + # Install compatible version + pip install boto3==1.26.137 botocore==1.29.137 + ``` + +3. **Use Alternative Upload Methods** + ```python + # Use upload_file instead of put_object for large files + s3_client.upload_file( + Filename='local-file.txt', + Bucket='my-repo', + Key='main/data/file.txt' + ) + ``` + +### Repository and Object Errors + +#### Error: "The specified bucket does not exist" + +**Symptom:** +``` +botocore.exceptions.ClientError: An error occurred (NoSuchBucket) when calling the GetObject operation: The specified bucket does not exist +``` + +**Solutions:** + +1. **Check Repository Exists** + ```python + def check_repository_exists(s3_client, repo_name): + """Check if repository exists""" + try: + s3_client.head_bucket(Bucket=repo_name) + print(f"✓ Repository '{repo_name}' exists") + return True + except s3_client.exceptions.NoSuchBucket: + print(f"✗ Repository '{repo_name}' does not exist") + return False + except Exception as e: + print(f"✗ Error checking repository: {e}") + return False + + # Usage + check_repository_exists(s3_client, 'my-repo') + ``` + +2. **List Available Repositories** + ```python + def list_repositories(s3_client): + """List all available repositories""" + try: + response = s3_client.list_buckets() + repos = [bucket['Name'] for bucket in response['Buckets']] + print(f"Available repositories: {repos}") + return repos + except Exception as e: + print(f"Error listing repositories: {e}") + return [] + + # Usage + available_repos = list_repositories(s3_client) + ``` + +3. **Create Repository (if needed)** + ```bash + # Create repository using lakectl + lakectl repo create lakefs://my-repo s3://my-storage-bucket/path/ + ``` + +#### Error: "The specified key does not exist" + +**Symptom:** +``` +botocore.exceptions.ClientError: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist +``` + +**Solutions:** + +1. 
**Check Object Path Format** + ```python + # ✗ Wrong - missing branch + key = 'data/file.txt' + + # ✓ Correct - includes branch + key = 'main/data/file.txt' + + # ✓ Correct - specific commit + key = 'c1a2b3c4d5e6f7g8/data/file.txt' + ``` + +2. **List Objects to Verify Path** + ```python + def find_object(s3_client, bucket, partial_key): + """Find objects matching partial key""" + try: + response = s3_client.list_objects_v2( + Bucket=bucket, + Prefix=partial_key + ) + + objects = response.get('Contents', []) + if objects: + print(f"Found {len(objects)} matching objects:") + for obj in objects[:10]: # Show first 10 + print(f" {obj['Key']}") + else: + print(f"No objects found with prefix: {partial_key}") + + return [obj['Key'] for obj in objects] + + except Exception as e: + print(f"Error searching objects: {e}") + return [] + + # Usage + find_object(s3_client, 'my-repo', 'main/data/') + ``` + +3. **Check Branch Exists** + ```bash + # List branches using lakectl + lakectl branch list lakefs://my-repo + ``` + +## Migration Patterns + +### S3 to lakeFS Migration + +#### Pattern 1: Direct Replacement + +**Before (Pure S3):** +```python +import boto3 + +# Original S3 configuration +s3_client = boto3.client('s3', region_name='us-east-1') + +def process_data(): + # Read from S3 + response = s3_client.get_object( + Bucket='data-bucket', + Key='input/data.csv' + ) + + # Process data + data = response['Body'].read() + processed = process_csv_data(data) + + # Write to S3 + s3_client.put_object( + Bucket='results-bucket', + Key='output/processed.csv', + Body=processed + ) +``` + +**After (lakeFS):** +```python +import boto3 + +# lakeFS configuration - only endpoint changes +s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' +) + +def process_data(): + # Read from lakeFS - add branch to path + response = s3_client.get_object( + Bucket='data-repo', # Repository name + Key='main/input/data.csv' # Branch + path + ) + + # Process data (unchanged) + data = response['Body'].read() + processed = process_csv_data(data) + + # Write to lakeFS - add branch to path + s3_client.put_object( + Bucket='results-repo', # Repository name + Key='main/output/processed.csv', # Branch + path + Body=processed + ) +``` + +#### Pattern 2: Gradual Migration with Environment Variables + +```python +import boto3 +import os + +class AdaptiveS3Client: + """S3 client that adapts based on configuration""" + + def __init__(self): + self.use_lakefs = os.getenv('USE_LAKEFS', 'false').lower() == 'true' + self.s3_client = self._create_client() + + def _create_client(self): + if self.use_lakefs: + return boto3.client('s3', + endpoint_url=os.getenv('LAKEFS_ENDPOINT', 'http://localhost:8000'), + aws_access_key_id=os.getenv('LAKEFS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('LAKEFS_SECRET_ACCESS_KEY') + ) + else: + return boto3.client('s3', + region_name=os.getenv('AWS_REGION', 'us-east-1') + ) + + def get_bucket_name(self, logical_name): + """Get actual bucket/repository name""" + if self.use_lakefs: + return f"lakefs-{logical_name}" + else: + return f"s3-{logical_name}" + + def get_object_key(self, logical_path): + """Get actual object key""" + if self.use_lakefs: + return f"main/{logical_path}" # Add branch prefix + else: + return logical_path + + def get_object(self, logical_bucket, logical_key): + """Get object with automatic path translation""" + return self.s3_client.get_object( + 
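+            # Same call shape for both backends: only the bucket name and key
+            # prefix are translated, so callers never need to know whether
+            # lakeFS or plain S3 is serving the request.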
Bucket=self.get_bucket_name(logical_bucket), + Key=self.get_object_key(logical_key) + ) + + def put_object(self, logical_bucket, logical_key, body, **kwargs): + """Put object with automatic path translation""" + return self.s3_client.put_object( + Bucket=self.get_bucket_name(logical_bucket), + Key=self.get_object_key(logical_key), + Body=body, + **kwargs + ) + +# Usage - same code works for both S3 and lakeFS +client = AdaptiveS3Client() + +# This works with both S3 and lakeFS +response = client.get_object('data', 'input/file.csv') +data = response['Body'].read() + +processed = process_data(data) + +client.put_object('results', 'output/processed.csv', processed) +``` + +#### Pattern 3: Side-by-Side Comparison + +```python +class MigrationValidator: + """Validate migration by comparing S3 and lakeFS results""" + + def __init__(self): + # S3 client + self.s3_client = boto3.client('s3', region_name='us-east-1') + + # lakeFS client + self.lakefs_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + ) + + def compare_operations(self, s3_bucket, lakefs_repo, key_mapping): + """Compare operations between S3 and lakeFS""" + + results = { + 'matching': [], + 'different': [], + 'errors': [] + } + + for s3_key, lakefs_key in key_mapping.items(): + try: + # Get from S3 + s3_response = self.s3_client.get_object( + Bucket=s3_bucket, + Key=s3_key + ) + s3_content = s3_response['Body'].read() + + # Get from lakeFS + lakefs_response = self.lakefs_client.get_object( + Bucket=lakefs_repo, + Key=lakefs_key + ) + lakefs_content = lakefs_response['Body'].read() + + # Compare content + if s3_content == lakefs_content: + results['matching'].append(s3_key) + else: + results['different'].append({ + 's3_key': s3_key, + 'lakefs_key': lakefs_key, + 's3_size': len(s3_content), + 'lakefs_size': len(lakefs_content) + }) + + except Exception as e: + results['errors'].append({ + 's3_key': s3_key, + 'lakefs_key': lakefs_key, + 'error': str(e) + }) + + return results + + def migration_report(self, results): + """Generate migration validation report""" + + total = len(results['matching']) + len(results['different']) + len(results['errors']) + + print(f"Migration Validation Report") + print(f"=" * 40) + print(f"Total objects checked: {total}") + print(f"Matching: {len(results['matching'])} ({len(results['matching'])/total*100:.1f}%)") + print(f"Different: {len(results['different'])} ({len(results['different'])/total*100:.1f}%)") + print(f"Errors: {len(results['errors'])} ({len(results['errors'])/total*100:.1f}%)") + + if results['different']: + print(f"\nDifferent objects:") + for diff in results['different']: + print(f" {diff['s3_key']} → {diff['lakefs_key']}") + print(f" S3 size: {diff['s3_size']}, lakeFS size: {diff['lakefs_size']}") + + if results['errors']: + print(f"\nErrors:") + for error in results['errors']: + print(f" {error['s3_key']} → {error['lakefs_key']}: {error['error']}") + +# Usage +validator = MigrationValidator() + +# Define key mappings (S3 key → lakeFS key) +key_mapping = { + 'data/file1.csv': 'main/data/file1.csv', + 'data/file2.json': 'main/data/file2.json', + 'results/output.txt': 'main/results/output.txt' +} + +results = validator.compare_operations('s3-bucket', 'lakefs-repo', key_mapping) +validator.migration_report(results) +``` + +### Common Migration Challenges + +#### Challenge 1: Path Structure Differences + +**Problem:** S3 uses flat key structure, lakeFS uses 
branch/path structure. + +**Solution:** +```python +def convert_s3_path_to_lakefs(s3_key, branch='main'): + """Convert S3 key to lakeFS key format""" + return f"{branch}/{s3_key}" + +def convert_lakefs_path_to_s3(lakefs_key): + """Convert lakeFS key to S3 key format""" + # Remove branch prefix + parts = lakefs_key.split('/', 1) + if len(parts) > 1: + return parts[1] # Return path without branch + return lakefs_key + +# Usage +s3_key = 'data/users/profile.json' +lakefs_key = convert_s3_path_to_lakefs(s3_key) # 'main/data/users/profile.json' + +original_key = convert_lakefs_path_to_s3(lakefs_key) # 'data/users/profile.json' +``` + +#### Challenge 2: Batch Operations + +**Problem:** Need to migrate large numbers of objects efficiently. + +**Solution:** +```python +from concurrent.futures import ThreadPoolExecutor, as_completed +import time + +class BatchMigrator: + """Efficiently migrate objects in batches""" + + def __init__(self, s3_client, lakefs_client, max_workers=10): + self.s3_client = s3_client + self.lakefs_client = lakefs_client + self.max_workers = max_workers + + def migrate_objects(self, s3_bucket, lakefs_repo, object_list, branch='main'): + """Migrate objects in parallel""" + + def migrate_single_object(s3_key): + try: + # Download from S3 + response = self.s3_client.get_object( + Bucket=s3_bucket, + Key=s3_key + ) + + # Upload to lakeFS + self.lakefs_client.put_object( + Bucket=lakefs_repo, + Key=f"{branch}/{s3_key}", + Body=response['Body'].read(), + ContentType=response.get('ContentType', 'binary/octet-stream'), + Metadata=response.get('Metadata', {}) + ) + + return {'status': 'success', 'key': s3_key} + + except Exception as e: + return {'status': 'error', 'key': s3_key, 'error': str(e)} + + # Execute migrations in parallel + results = {'success': 0, 'errors': 0, 'details': []} + + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + # Submit all tasks + future_to_key = { + executor.submit(migrate_single_object, key): key + for key in object_list + } + + # Process results + for future in as_completed(future_to_key): + result = future.result() + results['details'].append(result) + + if result['status'] == 'success': + results['success'] += 1 + if results['success'] % 100 == 0: + print(f"Migrated {results['success']} objects...") + else: + results['errors'] += 1 + print(f"Error migrating {result['key']}: {result['error']}") + + return results + + def get_all_s3_objects(self, bucket, prefix=''): + """Get list of all objects in S3 bucket""" + objects = [] + paginator = self.s3_client.get_paginator('list_objects_v2') + + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + for obj in page.get('Contents', []): + objects.append(obj['Key']) + + return objects + +# Usage +migrator = BatchMigrator(s3_client, lakefs_client, max_workers=20) + +# Get all objects to migrate +objects_to_migrate = migrator.get_all_s3_objects('source-s3-bucket') +print(f"Found {len(objects_to_migrate)} objects to migrate") + +# Migrate in batches +batch_size = 1000 +for i in range(0, len(objects_to_migrate), batch_size): + batch = objects_to_migrate[i:i+batch_size] + print(f"Migrating batch {i//batch_size + 1} ({len(batch)} objects)...") + + results = migrator.migrate_objects('source-s3-bucket', 'target-lakefs-repo', batch) + print(f"Batch completed: {results['success']} success, {results['errors']} errors") + + # Brief pause between batches + time.sleep(1) +``` + +#### Challenge 3: Metadata Preservation + +**Problem:** Preserving S3 object metadata during migration. 
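+Before copying anything, it can help to confirm which metadata fields an object actually carries. A minimal sketch using `head_object` (the client, bucket, and key names are placeholders reused from the examples above):
+
+```python
+# Inspect an object's metadata before migrating it (illustrative names)
+response = s3_client.head_object(Bucket='s3-bucket', Key='data/file.csv')
+
+print("User metadata:", response.get('Metadata', {}))       # x-amz-meta-* values
+print("Content type:", response.get('ContentType'))
+print("Cache control:", response.get('CacheControl', '<not set>'))
+```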
+ +**Solution:** +```python +def migrate_with_metadata(s3_client, lakefs_client, s3_bucket, lakefs_repo, s3_key, lakefs_key): + """Migrate object while preserving all metadata""" + + try: + # Get object with metadata + response = s3_client.get_object(Bucket=s3_bucket, Key=s3_key) + + # Extract all metadata + content = response['Body'].read() + content_type = response.get('ContentType', 'binary/octet-stream') + metadata = response.get('Metadata', {}) + + # Additional S3 metadata to preserve + additional_metadata = {} + if 'CacheControl' in response: + additional_metadata['cache-control'] = response['CacheControl'] + if 'ContentDisposition' in response: + additional_metadata['content-disposition'] = response['ContentDisposition'] + if 'ContentEncoding' in response: + additional_metadata['content-encoding'] = response['ContentEncoding'] + if 'ContentLanguage' in response: + additional_metadata['content-language'] = response['ContentLanguage'] + + # Combine metadata + all_metadata = {**metadata, **additional_metadata} + + # Upload to lakeFS with preserved metadata + put_args = { + 'Bucket': lakefs_repo, + 'Key': lakefs_key, + 'Body': content, + 'ContentType': content_type + } + + if all_metadata: + put_args['Metadata'] = all_metadata + + if 'CacheControl' in response: + put_args['CacheControl'] = response['CacheControl'] + if 'ContentDisposition' in response: + put_args['ContentDisposition'] = response['ContentDisposition'] + if 'ContentEncoding' in response: + put_args['ContentEncoding'] = response['ContentEncoding'] + if 'ContentLanguage' in response: + put_args['ContentLanguage'] = response['ContentLanguage'] + + lakefs_client.put_object(**put_args) + + print(f"✓ Migrated {s3_key} with metadata") + return True + + except Exception as e: + print(f"✗ Failed to migrate {s3_key}: {e}") + return False + +# Usage +success = migrate_with_metadata( + s3_client, lakefs_client, + 's3-bucket', 'lakefs-repo', + 'data/file.csv', 'main/data/file.csv' +) +``` + +## Performance Troubleshooting + +### Slow Operations + +**Symptoms:** +- Long response times for S3 operations +- Timeouts during large file uploads +- Poor performance compared to direct S3 + +**Solutions:** + +1. **Optimize Connection Settings** + ```python + from botocore.config import Config + + # Optimize for performance + config = Config( + max_pool_connections=50, + retries={'max_attempts': 3, 'mode': 'adaptive'}, + connect_timeout=10, + read_timeout=60 + ) + + s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + config=config + ) + ``` + +2. 
**Use Multipart Upload for Large Files** + ```python + def upload_large_file(s3_client, bucket, key, file_path, part_size=100*1024*1024): + """Upload large file using multipart upload""" + + file_size = os.path.getsize(file_path) + if file_size < part_size: + # Use regular upload for small files + s3_client.upload_file(file_path, bucket, key) + return + + # Use multipart upload + response = s3_client.create_multipart_upload(Bucket=bucket, Key=key) + upload_id = response['UploadId'] + + parts = [] + part_number = 1 + + try: + with open(file_path, 'rb') as f: + while True: + data = f.read(part_size) + if not data: + break + + part_response = s3_client.upload_part( + Bucket=bucket, + Key=key, + PartNumber=part_number, + UploadId=upload_id, + Body=data + ) + + parts.append({ + 'ETag': part_response['ETag'], + 'PartNumber': part_number + }) + + part_number += 1 + + # Complete upload + s3_client.complete_multipart_upload( + Bucket=bucket, + Key=key, + UploadId=upload_id, + MultipartUpload={'Parts': parts} + ) + + except Exception as e: + # Abort on error + s3_client.abort_multipart_upload( + Bucket=bucket, + Key=key, + UploadId=upload_id + ) + raise e + ``` + +3. **Monitor Performance** + ```python + import time + from contextlib import contextmanager + + @contextmanager + def measure_time(operation_name): + """Measure operation time""" + start = time.time() + try: + yield + finally: + duration = time.time() - start + print(f"{operation_name} took {duration:.2f} seconds") + + # Usage + with measure_time("List buckets"): + response = s3_client.list_buckets() + + with measure_time("Upload file"): + s3_client.upload_file('large-file.zip', 'my-repo', 'main/data/large-file.zip') + ``` + +### Memory Issues + +**Symptoms:** +- Out of memory errors during large file operations +- High memory usage during batch operations + +**Solutions:** + +1. **Stream Large Files** + ```python + def stream_large_file(s3_client, bucket, key, local_path): + """Stream large file without loading into memory""" + + with open(local_path, 'rb') as f: + s3_client.upload_fileobj(f, bucket, key) + + def download_large_file(s3_client, bucket, key, local_path): + """Download large file with streaming""" + + with open(local_path, 'wb') as f: + s3_client.download_fileobj(bucket, key, f) + ``` + +2. 
**Process in Chunks** + ```python + def process_large_object_in_chunks(s3_client, bucket, key, chunk_size=1024*1024): + """Process large object in chunks""" + + # Get object size + response = s3_client.head_object(Bucket=bucket, Key=key) + object_size = response['ContentLength'] + + # Process in chunks + for start in range(0, object_size, chunk_size): + end = min(start + chunk_size - 1, object_size - 1) + + # Download chunk + response = s3_client.get_object( + Bucket=bucket, + Key=key, + Range=f'bytes={start}-{end}' + ) + + chunk_data = response['Body'].read() + + # Process chunk + process_chunk(chunk_data) + + print(f"Processed bytes {start}-{end} of {object_size}") + ``` + +## Debugging Tools + +### Enable Debug Logging + +```python +import logging +import boto3 + +# Enable debug logging +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger('boto3') +logger.setLevel(logging.DEBUG) +logger = logging.getLogger('botocore') +logger.setLevel(logging.DEBUG) + +# Create client with debug logging +s3_client = boto3.client('s3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' +) + +# Operations will now show detailed debug information +s3_client.list_buckets() +``` + +### Request/Response Inspection + +```python +import boto3 +from botocore.awsrequest import AWSRequest + +class DebugS3Client: + """S3 client wrapper with request/response debugging""" + + def __init__(self, **kwargs): + self.client = boto3.client('s3', **kwargs) + + # Add event handlers for debugging + self.client.meta.events.register('before-call', self._log_request) + self.client.meta.events.register('after-call', self._log_response) + + def _log_request(self, event_name, **kwargs): + """Log request details""" + print(f"\n--- REQUEST ---") + print(f"Operation: {kwargs.get('operation_name')}") + print(f"Params: {kwargs.get('params', {})}") + + def _log_response(self, event_name, **kwargs): + """Log response details""" + print(f"\n--- RESPONSE ---") + if 'parsed' in kwargs: + response = kwargs['parsed'] + print(f"Status: {response.get('ResponseMetadata', {}).get('HTTPStatusCode')}") + print(f"Headers: {response.get('ResponseMetadata', {}).get('HTTPHeaders', {})}") + + def __getattr__(self, name): + """Delegate to underlying client""" + return getattr(self.client, name) + +# Usage +debug_client = DebugS3Client( + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' +) + +# This will show detailed request/response information +debug_client.list_buckets() +``` + +### Health Check Script + +```python +#!/usr/bin/env python3 +""" +lakeFS Boto3 Health Check Script +Run this script to diagnose common issues +""" + +import boto3 +import os +import sys +import requests +from botocore.exceptions import ClientError, NoCredentialsError + +def check_environment(): + """Check environment configuration""" + print("=== Environment Check ===") + + required_vars = ['LAKEFS_ENDPOINT', 'LAKEFS_ACCESS_KEY_ID', 'LAKEFS_SECRET_ACCESS_KEY'] + missing_vars = [] + + for var in required_vars: + value = os.getenv(var) + if value: + print(f"✓ {var}: {'*' * len(value)}") + else: + print(f"✗ {var}: NOT SET") + missing_vars.append(var) + + return len(missing_vars) == 0 + +def check_connectivity(): + """Check lakeFS connectivity""" + print("\n=== Connectivity Check ===") + + endpoint = os.getenv('LAKEFS_ENDPOINT', 'http://localhost:8000') + + try: + # 
Test HTTP connectivity + response = requests.get(f"{endpoint}/api/v1/healthcheck", timeout=10) + if response.status_code == 200: + print(f"✓ lakeFS server reachable at {endpoint}") + return True + else: + print(f"✗ lakeFS server returned status {response.status_code}") + return False + except Exception as e: + print(f"✗ Cannot reach lakeFS server: {e}") + return False + +def check_credentials(): + """Check credential validity""" + print("\n=== Credentials Check ===") + + try: + s3_client = boto3.client('s3', + endpoint_url=os.getenv('LAKEFS_ENDPOINT'), + aws_access_key_id=os.getenv('LAKEFS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('LAKEFS_SECRET_ACCESS_KEY') + ) + + response = s3_client.list_buckets() + repos = response.get('Buckets', []) + print(f"✓ Credentials valid. Found {len(repos)} repositories") + + if repos: + print(" Repositories:") + for repo in repos[:5]: # Show first 5 + print(f" - {repo['Name']}") + + return True + + except NoCredentialsError: + print("✗ No credentials found") + return False + except ClientError as e: + error_code = e.response['Error']['Code'] + if error_code == 'InvalidAccessKeyId': + print("✗ Invalid access key ID") + elif error_code == 'SignatureDoesNotMatch': + print("✗ Invalid secret access key") + else: + print(f"✗ Credential error: {error_code}") + return False + except Exception as e: + print(f"✗ Unexpected error: {e}") + return False + +def check_operations(): + """Check basic operations""" + print("\n=== Operations Check ===") + + try: + s3_client = boto3.client('s3', + endpoint_url=os.getenv('LAKEFS_ENDPOINT'), + aws_access_key_id=os.getenv('LAKEFS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('LAKEFS_SECRET_ACCESS_KEY') + ) + + # Get first repository + response = s3_client.list_buckets() + repos = response.get('Buckets', []) + + if not repos: + print("⚠ No repositories found for testing") + return True + + test_repo = repos[0]['Name'] + print(f"Testing operations on repository: {test_repo}") + + # Test list objects + try: + response = s3_client.list_objects_v2(Bucket=test_repo, MaxKeys=1) + print("✓ List objects successful") + except Exception as e: + print(f"✗ List objects failed: {e}") + return False + + # Test put/get/delete object + test_key = 'main/health-check-test.txt' + test_content = b'Health check test content' + + try: + # Put object + s3_client.put_object( + Bucket=test_repo, + Key=test_key, + Body=test_content + ) + print("✓ Put object successful") + + # Get object + response = s3_client.get_object(Bucket=test_repo, Key=test_key) + retrieved_content = response['Body'].read() + + if retrieved_content == test_content: + print("✓ Get object successful") + else: + print("✗ Get object content mismatch") + return False + + # Delete object + s3_client.delete_object(Bucket=test_repo, Key=test_key) + print("✓ Delete object successful") + + except Exception as e: + print(f"✗ Object operations failed: {e}") + return False + + return True + + except Exception as e: + print(f"✗ Operations check failed: {e}") + return False + +def main(): + """Run all health checks""" + print("lakeFS Boto3 Health Check") + print("=" * 40) + + checks = [ + ("Environment", check_environment), + ("Connectivity", check_connectivity), + ("Credentials", check_credentials), + ("Operations", check_operations) + ] + + results = {} + for name, check_func in checks: + results[name] = check_func() + + print("\n=== Summary ===") + all_passed = True + for name, passed in results.items(): + status = "✓ PASS" if passed else "✗ FAIL" + print(f"{name}: {status}") + if 
not passed: + all_passed = False + + if all_passed: + print("\n🎉 All checks passed! Your Boto3 + lakeFS setup is working correctly.") + sys.exit(0) + else: + print("\n❌ Some checks failed. Please review the errors above.") + sys.exit(1) + +if __name__ == "__main__": + main() +``` + +Save this as `lakefs_health_check.py` and run: + +```bash +# Set environment variables +export LAKEFS_ENDPOINT=http://localhost:8000 +export LAKEFS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE +export LAKEFS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + +# Run health check +python lakefs_health_check.py +``` + +## Next Steps + +- Review [S3 operations](s3-operations.md) for detailed operation examples +- Explore [S3 Router](s3-router.md) for hybrid workflows +- Check [configuration guide](configuration.md) for advanced setup +- Visit [best practices](../reference/best-practices.md) for production guidance + +## See Also + +- [High-Level SDK](../high-level-sdk/) - For advanced lakeFS features +- [Generated SDK](../generated-sdk/) - For direct API access +- [General troubleshooting](../reference/troubleshooting.md) - Cross-SDK issues \ No newline at end of file diff --git a/docs/src/integrations/python/generated-sdk/api-reference.md b/docs/src/integrations/python/generated-sdk/api-reference.md new file mode 100644 index 00000000000..7847a522f20 --- /dev/null +++ b/docs/src/integrations/python/generated-sdk/api-reference.md @@ -0,0 +1,1265 @@ +--- +title: Generated SDK API Reference +description: Complete API reference for the Generated Python SDK +sdk_types: ["generated"] +difficulty: "intermediate" +use_cases: ["api-reference", "direct-api", "custom-operations"] +topics: ["api", "reference", "methods", "classes"] +audience: ["developers", "advanced-users", "integrators"] +last_updated: "2024-01-15" +--- + +# Generated SDK API Reference + +Complete reference for the Generated Python SDK classes and methods. + +## Client Initialization + +### ApiClient Context Manager + +The recommended way to use the Generated SDK is with the `ApiClient` context manager: + +```python +import lakefs_sdk +from lakefs_sdk.rest import ApiException + +configuration = lakefs_sdk.Configuration( + host="http://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" +) + +with lakefs_sdk.ApiClient(configuration) as api_client: + # Create API instances + repositories_api = lakefs_sdk.RepositoriesApi(api_client) + # Use the API... +``` + +### Configuration Class + +The `Configuration` class manages all client settings: + +```python +import lakefs_sdk + +config = lakefs_sdk.Configuration( + # Required settings + host="http://localhost:8000", + username="access_key_id", + password="secret_access_key", + + # Optional authentication + access_token="jwt_token", # Alternative to username/password + + # SSL settings + verify_ssl=True, + ssl_ca_cert="/path/to/ca.pem", + cert_file="/path/to/client.pem", + key_file="/path/to/client.key", + + # Proxy settings + proxy="http://proxy:8080", + proxy_headers={"Proxy-Authorization": "Basic ..."}, + + # Connection settings + connection_pool_maxsize=10, + user_agent="MyApp/1.0" +) +``` + +## API Classes + +### RepositoriesApi + +Manage lakeFS repositories and their settings. + +#### `list_repositories(prefix=None, after=None, amount=None)` + +List repositories with optional filtering and pagination. 
+ +**Parameters:** +- `prefix` (Optional[str]): Filter repositories by name prefix +- `after` (Optional[str]): Return results after this repository name +- `amount` (Optional[int]): Maximum number of results to return (default: 100) + +**Returns:** +- `RepositoryList`: List of repositories with pagination info + +**Example:** +```python +repositories_api = lakefs_sdk.RepositoriesApi(api_client) + +# List all repositories +repos = repositories_api.list_repositories() +for repo in repos.results: + print(f"Repository: {repo.id}") + +# List with filtering +filtered_repos = repositories_api.list_repositories( + prefix="data-", + amount=50 +) +``` + +#### `create_repository(repository_creation)` + +Create a new repository. + +**Parameters:** +- `repository_creation` (RepositoryCreation): Repository configuration + +**Returns:** +- `Repository`: Created repository object + +**Raises:** +- `ConflictException`: Repository already exists +- `ValidationException`: Invalid repository configuration + +**Example:** +```python +from lakefs_sdk.models import RepositoryCreation + +repo_creation = RepositoryCreation( + name="my-data-lake", + storage_namespace="s3://my-bucket/repos/my-data-lake", + default_branch="main", + sample_data=False +) + +repo = repositories_api.create_repository(repository_creation=repo_creation) +print(f"Created repository: {repo.id}") +``` + +#### `get_repository(repository)` + +Get repository details. + +**Parameters:** +- `repository` (str): Repository name + +**Returns:** +- `Repository`: Repository object + +**Raises:** +- `NotFoundException`: Repository not found + +**Example:** +```python +repo = repositories_api.get_repository(repository="my-repo") +print(f"Default branch: {repo.default_branch}") +print(f"Storage namespace: {repo.storage_namespace}") +``` + +#### `delete_repository(repository)` + +Delete a repository permanently. + +**Parameters:** +- `repository` (str): Repository name + +**Raises:** +- `NotFoundException`: Repository not found +- `ValidationException`: Repository cannot be deleted + +**Example:** +```python +repositories_api.delete_repository(repository="old-repo") +print("Repository deleted successfully") +``` + +### BranchesApi + +Manage branches within repositories. + +#### `list_branches(repository, prefix=None, after=None, amount=None)` + +List branches in a repository. + +**Parameters:** +- `repository` (str): Repository name +- `prefix` (Optional[str]): Filter branches by name prefix +- `after` (Optional[str]): Return results after this branch name +- `amount` (Optional[int]): Maximum number of results to return + +**Returns:** +- `RefList`: List of branches with pagination info + +**Example:** +```python +branches_api = lakefs_sdk.BranchesApi(api_client) + +# List all branches +branches = branches_api.list_branches(repository="my-repo") +for branch in branches.results: + print(f"Branch: {branch.id} (commit: {branch.commit_id})") + +# List with filtering +feature_branches = branches_api.list_branches( + repository="my-repo", + prefix="feature-" +) +``` + +#### `create_branch(repository, branch_creation)` + +Create a new branch. 
+ +**Parameters:** +- `repository` (str): Repository name +- `branch_creation` (BranchCreation): Branch configuration + +**Returns:** +- `Ref`: Created branch object + +**Raises:** +- `ConflictException`: Branch already exists +- `NotFoundException`: Source reference not found + +**Example:** +```python +from lakefs_sdk.models import BranchCreation + +branch_creation = BranchCreation( + name="feature-new-analytics", + source="main" # Source branch or commit ID +) + +branch = branches_api.create_branch( + repository="my-repo", + branch_creation=branch_creation +) +print(f"Created branch: {branch.id}") +``` + +#### `get_branch(repository, branch)` + +Get branch details. + +**Parameters:** +- `repository` (str): Repository name +- `branch` (str): Branch name + +**Returns:** +- `Ref`: Branch object + +**Raises:** +- `NotFoundException`: Branch not found + +**Example:** +```python +branch = branches_api.get_branch( + repository="my-repo", + branch="feature-branch" +) +print(f"Branch commit: {branch.commit_id}") +``` + +#### `delete_branch(repository, branch)` + +Delete a branch. + +**Parameters:** +- `repository` (str): Repository name +- `branch` (str): Branch name + +**Raises:** +- `NotFoundException`: Branch not found +- `ValidationException`: Cannot delete protected branch + +**Example:** +```python +branches_api.delete_branch( + repository="my-repo", + branch="old-feature" +) +print("Branch deleted successfully") +``` + +#### `diff_branch(repository, branch, after=None, prefix=None, delimiter=None, amount=None)` + +Show changes between branch and its source. + +**Parameters:** +- `repository` (str): Repository name +- `branch` (str): Branch name +- `after` (Optional[str]): Return results after this path +- `prefix` (Optional[str]): Filter by path prefix +- `delimiter` (Optional[str]): Path delimiter for grouping +- `amount` (Optional[int]): Maximum number of results + +**Returns:** +- `DiffList`: List of changes + +**Example:** +```python +diff = branches_api.diff_branch( + repository="my-repo", + branch="feature-branch", + prefix="data/" +) + +for change in diff.results: + print(f"{change.type}: {change.path}") +``` + +### ObjectsApi + +Manage objects within repositories. + +#### `list_objects(repository, ref, prefix=None, after=None, delimiter=None, amount=None)` + +List objects in a repository reference. + +**Parameters:** +- `repository` (str): Repository name +- `ref` (str): Branch name, tag, or commit ID +- `prefix` (Optional[str]): Filter objects by path prefix +- `after` (Optional[str]): Return results after this path +- `delimiter` (Optional[str]): Path delimiter for grouping +- `amount` (Optional[int]): Maximum number of results to return + +**Returns:** +- `ObjectStatsList`: List of objects with pagination info + +**Example:** +```python +objects_api = lakefs_sdk.ObjectsApi(api_client) + +# List all objects +objects = objects_api.list_objects( + repository="my-repo", + ref="main" +) + +for obj in objects.results: + print(f"Object: {obj.path} ({obj.size_bytes} bytes)") + +# List objects with prefix +data_objects = objects_api.list_objects( + repository="my-repo", + ref="main", + prefix="data/", + amount=100 +) +``` + +#### `get_object(repository, ref, path, range=None, if_none_match=None, presign=None)` + +Download object content. 
+ +**Parameters:** +- `repository` (str): Repository name +- `ref` (str): Branch name, tag, or commit ID +- `path` (str): Object path +- `range` (Optional[str]): Byte range (e.g., "bytes=0-1023") +- `if_none_match` (Optional[str]): ETag for conditional requests +- `presign` (Optional[bool]): Return presigned URL instead of content + +**Returns:** +- `bytes`: Object content (or presigned URL if presign=True) + +**Example:** +```python +# Download object content +content = objects_api.get_object( + repository="my-repo", + ref="main", + path="data/file.txt" +) +print(content.decode('utf-8')) + +# Get presigned URL +presigned_url = objects_api.get_object( + repository="my-repo", + ref="main", + path="data/file.txt", + presign=True +) +``` + +#### `upload_object(repository, branch, path, content=None, if_none_match=None, storage_class=None)` + +Upload object to a branch. + +**Parameters:** +- `repository` (str): Repository name +- `branch` (str): Branch name +- `path` (str): Object path +- `content` (Optional[bytes]): Object content +- `if_none_match` (Optional[str]): ETag for conditional upload +- `storage_class` (Optional[str]): Storage class for the object + +**Returns:** +- `ObjectStats`: Uploaded object metadata + +**Example:** +```python +# Upload text content +content = "Hello, lakeFS!".encode('utf-8') +stats = objects_api.upload_object( + repository="my-repo", + branch="main", + path="data/greeting.txt", + content=content +) +print(f"Uploaded: {stats.path} ({stats.size_bytes} bytes)") + +# Upload with storage class +stats = objects_api.upload_object( + repository="my-repo", + branch="main", + path="archive/old-data.csv", + content=csv_data, + storage_class="GLACIER" +) +``` + +#### `stat_object(repository, ref, path, user_metadata=None, presign=None)` + +Get object metadata. + +**Parameters:** +- `repository` (str): Repository name +- `ref` (str): Branch name, tag, or commit ID +- `path` (str): Object path +- `user_metadata` (Optional[bool]): Include user metadata +- `presign` (Optional[bool]): Include presigned URL + +**Returns:** +- `ObjectStats`: Object metadata + +**Example:** +```python +stats = objects_api.stat_object( + repository="my-repo", + ref="main", + path="data/file.txt", + user_metadata=True +) + +print(f"Size: {stats.size_bytes}") +print(f"Modified: {stats.mtime}") +print(f"Checksum: {stats.checksum}") +print(f"Metadata: {stats.metadata}") +``` + +#### `delete_object(repository, branch, path)` + +Delete an object from a branch. + +**Parameters:** +- `repository` (str): Repository name +- `branch` (str): Branch name +- `path` (str): Object path + +**Example:** +```python +objects_api.delete_object( + repository="my-repo", + branch="main", + path="data/old-file.txt" +) +print("Object deleted successfully") +``` + +#### `copy_object(repository, branch, dest_path, object_copy_creation)` + +Copy an object within or between repositories. 
+ +**Parameters:** +- `repository` (str): Destination repository name +- `branch` (str): Destination branch name +- `dest_path` (str): Destination object path +- `object_copy_creation` (ObjectCopyCreation): Copy configuration + +**Returns:** +- `ObjectStats`: Copied object metadata + +**Example:** +```python +from lakefs_sdk.models import ObjectCopyCreation + +copy_config = ObjectCopyCreation( + src_path="data/source.txt", + src_ref="main" +) + +copied_stats = objects_api.copy_object( + repository="my-repo", + branch="feature-branch", + dest_path="data/copied.txt", + object_copy_creation=copy_config +) +``` + +### CommitsApi + +Manage commits and commit operations. + +#### `commit(repository, branch, commit_creation)` + +Create a commit on a branch. + +**Parameters:** +- `repository` (str): Repository name +- `branch` (str): Branch name +- `commit_creation` (CommitCreation): Commit configuration + +**Returns:** +- `Commit`: Created commit object + +**Raises:** +- `NotFoundException`: Repository or branch not found +- `ConflictException`: No changes to commit + +**Example:** +```python +from lakefs_sdk.models import CommitCreation + +commits_api = lakefs_sdk.CommitsApi(api_client) + +commit_creation = CommitCreation( + message="Add new analytics data", + metadata={ + "author": "data-team", + "source": "analytics-pipeline" + } +) + +commit = commits_api.commit( + repository="my-repo", + branch="main", + commit_creation=commit_creation +) +print(f"Created commit: {commit.id}") +``` + +#### `get_commit(repository, commit_id)` + +Get commit details. + +**Parameters:** +- `repository` (str): Repository name +- `commit_id` (str): Commit ID + +**Returns:** +- `Commit`: Commit object + +**Raises:** +- `NotFoundException`: Commit not found + +**Example:** +```python +commit = commits_api.get_commit( + repository="my-repo", + commit_id="c7a632d0a7c4c9b5e8f1a2b3c4d5e6f7g8h9i0j1" +) +print(f"Commit message: {commit.message}") +print(f"Author: {commit.committer}") +print(f"Date: {commit.creation_date}") +``` + +### RefsApi + +Manage references (branches, tags, commits) and operations between them. + +#### `diff_refs(repository, left_ref, right_ref, after=None, prefix=None, delimiter=None, amount=None)` + +Compare two references and show differences. + +**Parameters:** +- `repository` (str): Repository name +- `left_ref` (str): Left reference (branch, tag, or commit ID) +- `right_ref` (str): Right reference (branch, tag, or commit ID) +- `after` (Optional[str]): Return results after this path +- `prefix` (Optional[str]): Filter by path prefix +- `delimiter` (Optional[str]): Path delimiter for grouping +- `amount` (Optional[int]): Maximum number of results + +**Returns:** +- `DiffList`: List of differences between references + +**Example:** +```python +refs_api = lakefs_sdk.RefsApi(api_client) + +# Compare main branch with feature branch +diff = refs_api.diff_refs( + repository="my-repo", + left_ref="main", + right_ref="feature-branch" +) + +for change in diff.results: + print(f"{change.type}: {change.path}") + if change.type == "changed": + print(f" Size changed: {change.size_bytes}") +``` + +#### `merge_into_branch(repository, source_ref, destination_branch, merge=None)` + +Merge one reference into a branch. 
+ +**Parameters:** +- `repository` (str): Repository name +- `source_ref` (str): Source reference to merge from +- `destination_branch` (str): Destination branch to merge into +- `merge` (Optional[Merge]): Merge configuration + +**Returns:** +- `MergeResult`: Result of the merge operation + +**Raises:** +- `ConflictException`: Merge conflicts detected +- `NotFoundException`: Reference not found + +**Example:** +```python +from lakefs_sdk.models import Merge + +merge_config = Merge( + message="Merge feature-analytics into main", + metadata={ + "merger": "data-team", + "review_id": "PR-123" + } +) + +result = refs_api.merge_into_branch( + repository="my-repo", + source_ref="feature-analytics", + destination_branch="main", + merge=merge_config +) + +print(f"Merge result: {result.reference}") +print(f"Summary: {result.summary}") +``` + +#### `log_commits(repository, ref, after=None, amount=None, objects=None, prefixes=None, limit=None, first_parent=None, since=None, until=None)` + +Get commit history for a reference. + +**Parameters:** +- `repository` (str): Repository name +- `ref` (str): Reference (branch, tag, or commit ID) +- `after` (Optional[str]): Return commits after this commit ID +- `amount` (Optional[int]): Maximum number of commits to return +- `objects` (Optional[List[str]]): Filter by specific object paths +- `prefixes` (Optional[List[str]]): Filter by path prefixes +- `limit` (Optional[bool]): Limit to commits affecting specified paths +- `first_parent` (Optional[bool]): Follow only first parent in merge commits +- `since` (Optional[datetime]): Show commits since this date +- `until` (Optional[datetime]): Show commits until this date + +**Returns:** +- `CommitList`: List of commits with pagination info + +**Example:** +```python +# Get recent commits +commits = refs_api.log_commits( + repository="my-repo", + ref="main", + amount=10 +) + +for commit in commits.results: + print(f"{commit.id[:8]}: {commit.message}") + print(f" Author: {commit.committer}") + +# Get commits affecting specific paths +data_commits = refs_api.log_commits( + repository="my-repo", + ref="main", + prefixes=["data/"], + limit=True +) +``` + +### TagsApi + +Manage tags for specific commits. + +#### `list_tags(repository, prefix=None, after=None, amount=None)` + +List tags in a repository. + +**Parameters:** +- `repository` (str): Repository name +- `prefix` (Optional[str]): Filter tags by name prefix +- `after` (Optional[str]): Return results after this tag name +- `amount` (Optional[int]): Maximum number of results to return + +**Returns:** +- `TagList`: List of tags with pagination info + +**Example:** +```python +tags_api = lakefs_sdk.TagsApi(api_client) + +# List all tags +tags = tags_api.list_tags(repository="my-repo") +for tag in tags.results: + print(f"Tag: {tag.id} -> {tag.commit_id}") + +# List version tags +version_tags = tags_api.list_tags( + repository="my-repo", + prefix="v" +) +``` + +#### `create_tag(repository, tag_creation)` + +Create a new tag. 
+ +**Parameters:** +- `repository` (str): Repository name +- `tag_creation` (TagCreation): Tag configuration + +**Returns:** +- `Ref`: Created tag object + +**Raises:** +- `ConflictException`: Tag already exists +- `NotFoundException`: Target reference not found + +**Example:** +```python +from lakefs_sdk.models import TagCreation + +tag_creation = TagCreation( + id="v1.2.0", + ref="main" # Branch, tag, or commit ID +) + +tag = tags_api.create_tag( + repository="my-repo", + tag_creation=tag_creation +) +print(f"Created tag: {tag.id} -> {tag.commit_id}") +``` + +#### `get_tag(repository, tag)` + +Get tag details. + +**Parameters:** +- `repository` (str): Repository name +- `tag` (str): Tag name + +**Returns:** +- `Ref`: Tag object + +**Raises:** +- `NotFoundException`: Tag not found + +**Example:** +```python +tag = tags_api.get_tag( + repository="my-repo", + tag="v1.0.0" +) +print(f"Tag {tag.id} points to commit: {tag.commit_id}") +``` + +#### `delete_tag(repository, tag)` + +Delete a tag. + +**Parameters:** +- `repository` (str): Repository name +- `tag` (str): Tag name + +**Raises:** +- `NotFoundException`: Tag not found + +**Example:** +```python +tags_api.delete_tag( + repository="my-repo", + tag="v0.9.0-beta" +) +print("Tag deleted successfully") +``` + +### AuthApi + +Manage authentication and authorization. + +#### `get_current_user()` + +Get information about the current authenticated user. + +**Returns:** +- `CurrentUser`: Current user information + +**Example:** +```python +auth_api = lakefs_sdk.AuthApi(api_client) + +user = auth_api.get_current_user() +print(f"Current user: {user.user.id}") +print(f"Email: {user.user.email}") +``` + +#### `list_users(prefix=None, after=None, amount=None)` + +List users (requires admin privileges). + +**Parameters:** +- `prefix` (Optional[str]): Filter users by ID prefix +- `after` (Optional[str]): Return results after this user ID +- `amount` (Optional[int]): Maximum number of results to return + +**Returns:** +- `UserList`: List of users with pagination info + +**Example:** +```python +users = auth_api.list_users() +for user in users.results: + print(f"User: {user.id} ({user.email})") +``` + +## Data Models + +The Generated SDK includes comprehensive data models for all API objects. Here are the most commonly used models: + +### Repository + +Represents a lakeFS repository. + +```python +from lakefs_sdk.models import Repository + +# Repository properties +repo = Repository( + id="my-data-lake", + creation_date=1640995200, # Unix timestamp + default_branch="main", + storage_namespace="s3://my-bucket/repos/my-data-lake", + read_only=False +) + +# Access properties +print(f"Repository ID: {repo.id}") +print(f"Default branch: {repo.default_branch}") +print(f"Storage: {repo.storage_namespace}") +``` + +### RepositoryCreation + +Configuration for creating new repositories. + +```python +from lakefs_sdk.models import RepositoryCreation + +repo_creation = RepositoryCreation( + name="new-repo", + storage_namespace="s3://bucket/path", + default_branch="main", # Optional, defaults to "main" + sample_data=False # Optional, include sample data +) +``` + +### Ref (Branch/Tag) + +Represents a branch or tag reference. + +```python +from lakefs_sdk.models import Ref + +# Branch or tag reference +ref = Ref( + id="feature-branch", + commit_id="c7a632d0a7c4c9b5e8f1a2b3c4d5e6f7g8h9i0j1" +) + +print(f"Reference: {ref.id}") +print(f"Points to commit: {ref.commit_id}") +``` + +### BranchCreation + +Configuration for creating new branches. 
+ +```python +from lakefs_sdk.models import BranchCreation + +branch_creation = BranchCreation( + name="feature-analytics", + source="main" # Source branch, tag, or commit ID +) +``` + +### ObjectStats + +Object metadata and statistics. + +```python +from lakefs_sdk.models import ObjectStats + +# Object metadata +stats = ObjectStats( + path="data/users.csv", + physical_address="s3://bucket/data/abc123.csv", + checksum="d41d8cd98f00b204e9800998ecf8427e", + size_bytes=2048, + mtime=1640995200, + metadata={"content-type": "text/csv", "author": "data-team"}, + content_type="text/csv" +) + +# Access properties +print(f"Path: {stats.path}") +print(f"Size: {stats.size_bytes} bytes") +print(f"Checksum: {stats.checksum}") +print(f"Metadata: {stats.metadata}") +``` + +### Commit + +Represents a commit in the repository history. + +```python +from lakefs_sdk.models import Commit + +commit = Commit( + id="c7a632d0a7c4c9b5e8f1a2b3c4d5e6f7g8h9i0j1", + parents=["b6a531c0a6c3c8a4d7e0a1b2c3d4e5f6g7h8i9j0"], + committer="user@example.com", + message="Add new analytics data", + creation_date=1640995200, + metadata={"author": "data-team", "pipeline": "analytics-v2"} +) + +print(f"Commit ID: {commit.id}") +print(f"Message: {commit.message}") +print(f"Author: {commit.committer}") +``` + +### CommitCreation + +Configuration for creating new commits. + +```python +from lakefs_sdk.models import CommitCreation + +commit_creation = CommitCreation( + message="Update user analytics data", + metadata={ + "author": "analytics-team", + "source": "daily-pipeline", + "version": "1.2.0" + } +) +``` + +### Diff + +Represents a difference between two references. + +```python +from lakefs_sdk.models import Diff + +# Diff entry +diff_entry = Diff( + type="changed", # "added", "removed", "changed", "conflict" + path="data/users.csv", + path_type="object", # "object" or "common_prefix" + size_bytes=2048 +) + +print(f"Change type: {diff_entry.type}") +print(f"Path: {diff_entry.path}") +``` + +### Merge + +Configuration for merge operations. + +```python +from lakefs_sdk.models import Merge + +merge_config = Merge( + message="Merge feature-analytics into main", + metadata={ + "merger": "data-team", + "review_id": "PR-123", + "approved_by": "team-lead" + }, + strategy="dest-wins", # Optional: merge strategy + force=False # Optional: force merge +) +``` + +### TagCreation + +Configuration for creating new tags. + +```python +from lakefs_sdk.models import TagCreation + +tag_creation = TagCreation( + id="v1.2.0", + ref="main" # Branch, tag, or commit ID to tag +) +``` + +## Error Handling + +The Generated SDK uses exceptions to handle API errors. All API exceptions inherit from `ApiException`. 
+ +### Exception Hierarchy + +```python +from lakefs_sdk.rest import ApiException + +# Base exception for all API errors +try: + result = api_call() +except ApiException as e: + print(f"API Error: {e.status} - {e.reason}") + print(f"Response body: {e.body}") +``` + +### Common Exception Types + +#### HTTP Status Code Based Handling + +```python +from lakefs_sdk.rest import ApiException + +try: + repo = repositories_api.get_repository("nonexistent-repo") +except ApiException as e: + if e.status == 404: + print("Repository not found") + elif e.status == 401: + print("Authentication failed - check credentials") + elif e.status == 403: + print("Access denied - insufficient permissions") + elif e.status == 409: + print("Conflict - resource already exists or is in use") + elif e.status == 422: + print("Validation error - invalid input") + elif e.status >= 500: + print("Server error - try again later") + else: + print(f"Unexpected error: {e.status} - {e.reason}") +``` + +#### Detailed Error Information + +```python +import json +from lakefs_sdk.rest import ApiException + +try: + # API operation that might fail + result = repositories_api.create_repository(invalid_config) +except ApiException as e: + print(f"HTTP Status: {e.status}") + print(f"Reason: {e.reason}") + + # Parse error details from response body + try: + error_details = json.loads(e.body) + print(f"Error message: {error_details.get('message', 'Unknown error')}") + if 'error_code' in error_details: + print(f"Error code: {error_details['error_code']}") + except (json.JSONDecodeError, AttributeError): + print(f"Raw error body: {e.body}") +``` + +#### Retry Logic with Exponential Backoff + +```python +import time +import random +from lakefs_sdk.rest import ApiException + +def api_call_with_retry(api_func, max_retries=3, base_delay=1): + """Execute API call with exponential backoff retry logic.""" + for attempt in range(max_retries + 1): + try: + return api_func() + except ApiException as e: + if e.status >= 500 and attempt < max_retries: + # Server error - retry with exponential backoff + delay = base_delay * (2 ** attempt) + random.uniform(0, 1) + print(f"Server error, retrying in {delay:.2f}s (attempt {attempt + 1})") + time.sleep(delay) + else: + # Client error or max retries reached + raise + +# Usage example +try: + result = api_call_with_retry( + lambda: repositories_api.list_repositories() + ) +except ApiException as e: + print(f"Failed after retries: {e.status} - {e.reason}") +``` + +## Pagination + +Many API endpoints return paginated results. The Generated SDK provides consistent pagination patterns across all list operations. 
+ +### Pagination Response Structure + +All paginated responses include a `pagination` object: + +```python +response = repositories_api.list_repositories(amount=10) + +# Access results +for repo in response.results: + print(f"Repository: {repo.id}") + +# Check pagination info +pagination = response.pagination +print(f"Has more results: {pagination.has_more}") +print(f"Next offset: {pagination.next_offset}") +print(f"Results count: {pagination.results}") +print(f"Max per page: {pagination.max_per_page}") +``` + +### Manual Pagination + +```python +def list_repositories_page_by_page(): + """List repositories one page at a time.""" + after = "" + page_num = 1 + + while True: + print(f"Fetching page {page_num}...") + + response = repositories_api.list_repositories( + after=after, + amount=50 # Page size + ) + + # Process current page + for repo in response.results: + print(f" Repository: {repo.id}") + + # Check if more pages exist + if not response.pagination.has_more: + break + + after = response.pagination.next_offset + page_num += 1 + + print(f"Processed {page_num} pages total") +``` + +### Automatic Pagination Helper + +```python +def list_all_items(list_func, **kwargs): + """Generic helper to fetch all items from a paginated API.""" + all_items = [] + after = "" + + while True: + # Call the list function with pagination parameters + response = list_func(after=after, amount=1000, **kwargs) + + # Add items from current page + all_items.extend(response.results) + + # Check if more pages exist + if not response.pagination.has_more: + break + + after = response.pagination.next_offset + + return all_items + +# Usage examples +all_repos = list_all_items(repositories_api.list_repositories) +all_branches = list_all_items( + branches_api.list_branches, + repository="my-repo" +) +all_objects = list_all_items( + objects_api.list_objects, + repository="my-repo", + ref="main", + prefix="data/" +) +``` + +### Filtering with Pagination + +```python +def find_repositories_by_pattern(pattern): + """Find repositories matching a pattern across all pages.""" + matching_repos = [] + after = "" + + while True: + response = repositories_api.list_repositories( + after=after, + amount=100 + ) + + # Filter results on current page + for repo in response.results: + if pattern in repo.id: + matching_repos.append(repo) + + if not response.pagination.has_more: + break + + after = response.pagination.next_offset + + return matching_repos + +# Find all repositories containing "data" +data_repos = find_repositories_by_pattern("data") +``` + +### Performance Considerations + +```python +# Optimize page size based on use case +def list_with_optimal_pagination(list_func, **kwargs): + """Use larger page sizes for better performance.""" + all_items = [] + after = "" + + # Use maximum page size for fewer API calls + page_size = 1000 # Adjust based on API limits + + while True: + response = list_func( + after=after, + amount=page_size, + **kwargs + ) + + all_items.extend(response.results) + + if not response.pagination.has_more: + break + + after = response.pagination.next_offset + + # Optional: Add progress indication for large datasets + if len(all_items) % 10000 == 0: + print(f"Fetched {len(all_items)} items so far...") + + return all_items +``` + +## Next Steps + +- See [usage examples](examples.md) for practical implementations +- Learn about [direct access](direct-access.md) from High-Level SDK +- Review the [complete API documentation](https://pydocs-sdk.lakefs.io) \ No newline at end of file diff --git 
a/docs/src/integrations/python/generated-sdk/direct-access.md b/docs/src/integrations/python/generated-sdk/direct-access.md new file mode 100644 index 00000000000..639a744247f --- /dev/null +++ b/docs/src/integrations/python/generated-sdk/direct-access.md @@ -0,0 +1,378 @@ +--- +title: Direct Access from High-Level SDK +description: Accessing the Generated SDK from within the High-Level SDK +sdk_types: ["high-level", "generated"] +difficulty: "intermediate" +use_cases: ["hybrid-usage", "advanced-operations", "direct-api"] +topics: ["integration", "direct-access", "hybrid"] +audience: ["developers", "advanced-users", "python-developers"] +last_updated: "2024-01-15" +--- + +# Direct Access from High-Level SDK + +Learn how to access the Generated SDK client from within the High-Level SDK for operations not covered by the high-level interface. + +## Accessing the Generated Client + +The High-Level SDK is built on top of the Generated SDK, and you can access the underlying client when needed. + +### Basic Access Pattern +```python +import lakefs + +# Create High-Level SDK objects +repo = lakefs.repository("my-repo") +branch = repo.branch("main") + +# Access the underlying Generated SDK client +generated_client = repo.client.sdk + +# Now you can use Generated SDK APIs directly +from lakefs_sdk import ObjectsApi +objects_api = ObjectsApi(generated_client) +``` + +## When to Use Direct Access + +### Operations Not in High-Level SDK +```python +import lakefs +from lakefs_sdk import RefsApi, Merge + +# High-Level SDK setup +repo = lakefs.repository("my-repo") +generated_client = repo.client.sdk + +# Use Generated SDK for advanced merge options +refs_api = RefsApi(generated_client) + +# Advanced merge with specific strategy +merge_result = refs_api.merge_into_branch( + repository="my-repo", + source_ref="feature-branch", + destination_branch="main", + merge=Merge( + message="Advanced merge", + strategy="source-wins", # Specific merge strategy + allow_empty=True, # Allow empty commits + force=False # Force merge option + ) +) +``` + +### Advanced API Parameters +```python +import lakefs +from lakefs_sdk import ObjectsApi + +repo = lakefs.repository("my-repo") +objects_api = ObjectsApi(repo.client.sdk) + +# List objects with advanced pagination options +objects_response = objects_api.list_objects( + repository="my-repo", + ref="main", + prefix="data/", + after="data/file100.txt", # Start after specific object + amount=500, # Custom page size + delimiter="/", # Directory-style listing + user_metadata=True # Include user metadata +) + +# Access pagination info +pagination = objects_response.pagination +print(f"Has more: {pagination.has_more}") +print(f"Next offset: {pagination.next_offset}") +print(f"Max per page: {pagination.max_per_page}") +``` + +## Combining High-Level and Generated SDK + +### Mixed Operations Workflow +```python +import lakefs +from lakefs_sdk import CommitsApi, TagsApi + +# Use High-Level SDK for common operations +repo = lakefs.repository("my-repo") +branch = repo.branch("feature-branch") + +# Upload files using High-Level SDK +branch.object("data/file1.txt").upload(data="Content 1") +branch.object("data/file2.txt").upload(data="Content 2") + +# Commit using High-Level SDK +commit_ref = branch.commit(message="Add data files") + +# Use Generated SDK for advanced tagging +tags_api = TagsApi(repo.client.sdk) +tag = tags_api.create_tag( + repository="my-repo", + tag_creation=lakefs_sdk.TagCreation( + id="v1.0.0-beta", + ref=commit_ref.get_commit().id, + force=True # Force tag creation 
+ ) +) + +print(f"Created tag: {tag.id} at commit {tag.commit_id}") +``` + +### Advanced Object Operations +```python +import lakefs +from lakefs_sdk import ObjectsApi + +repo = lakefs.repository("my-repo") +branch = repo.branch("main") +objects_api = ObjectsApi(repo.client.sdk) + +# Use High-Level SDK for simple upload +obj = branch.object("data/simple.txt").upload(data="Simple content") + +# Use Generated SDK for advanced object operations +# Get object with presigned URL +presigned_response = objects_api.get_object( + repository="my-repo", + ref="main", + path="data/simple.txt", + presign=True, # Generate presigned URL + presign_expires=3600 # URL expires in 1 hour +) + +print(f"Presigned URL: {presigned_response.url}") + +# Get object statistics with additional metadata +stats = objects_api.stat_object( + repository="my-repo", + ref="main", + path="data/simple.txt", + user_metadata=True, # Include user metadata + presign=True # Include presigned URL in stats +) + +print(f"Physical address: {stats.physical_address}") +print(f"Presigned URL: {stats.physical_address_expiry}") +``` + +## Error Handling with Mixed Approach + +### Consistent Error Handling +```python +import lakefs +from lakefs_sdk import ObjectsApi +from lakefs_sdk.exceptions import ApiException +from lakefs.exceptions import LakeFSException + +def mixed_operation_with_error_handling(repo_name, branch_name, file_path): + try: + # High-Level SDK operations + repo = lakefs.repository(repo_name) + branch = repo.branch(branch_name) + + # Generated SDK operations + objects_api = ObjectsApi(repo.client.sdk) + + # Try High-Level SDK first + try: + obj = branch.object(file_path) + content = obj.reader().read() + return content + except LakeFSException as e: + print(f"High-Level SDK error: {e}") + + # Fallback to Generated SDK + try: + response = objects_api.get_object( + repository=repo_name, + ref=branch_name, + path=file_path + ) + return response.read() + except ApiException as e: + print(f"Generated SDK error: {e}") + + except Exception as e: + print(f"Unexpected error: {e}") + return None +``` + +## Advanced Integration Patterns + +### Custom Repository Manager +```python +import lakefs +from lakefs_sdk import RepositoriesApi, BranchesApi + +class AdvancedRepositoryManager: + def __init__(self, repo_name): + self.repo_name = repo_name + self.high_level_repo = lakefs.repository(repo_name) + self.generated_client = self.high_level_repo.client.sdk + + # Initialize Generated SDK APIs + self.repos_api = RepositoriesApi(self.generated_client) + self.branches_api = BranchesApi(self.generated_client) + + def create_with_advanced_options(self, storage_namespace, **kwargs): + """Create repository with advanced options""" + try: + repo = self.repos_api.create_repository( + repository_creation=lakefs_sdk.RepositoryCreation( + name=self.repo_name, + storage_namespace=storage_namespace, + default_branch=kwargs.get('default_branch', 'main'), + sample_data=kwargs.get('sample_data', False) + ) + ) + return repo + except Exception as e: + print(f"Advanced repository creation failed: {e}") + return None + + def bulk_branch_operations(self, branch_configs): + """Create multiple branches with different configurations""" + created_branches = [] + + for config in branch_configs: + try: + branch = self.branches_api.create_branch( + repository=self.repo_name, + branch_creation=lakefs_sdk.BranchCreation( + name=config['name'], + source=config.get('source', 'main') + ) + ) + created_branches.append(branch) + except Exception as e: + print(f"Failed to 
create branch {config['name']}: {e}") + + return created_branches + +# Usage +manager = AdvancedRepositoryManager("advanced-repo") + +# Create repository with advanced options +repo = manager.create_with_advanced_options( + storage_namespace="s3://bucket/advanced-repo", + default_branch="develop", + sample_data=True +) + +# Create multiple branches +branch_configs = [ + {"name": "feature-1", "source": "develop"}, + {"name": "feature-2", "source": "develop"}, + {"name": "hotfix", "source": "main"} +] + +branches = manager.bulk_branch_operations(branch_configs) +``` + +### Performance Optimization +```python +import lakefs +from lakefs_sdk import ObjectsApi +import concurrent.futures + +def optimized_batch_operations(repo_name, branch_name, operations): + """Perform batch operations with optimized Generated SDK calls""" + repo = lakefs.repository(repo_name) + objects_api = ObjectsApi(repo.client.sdk) + + def execute_operation(operation): + op_type, path, data = operation + + if op_type == "upload": + return objects_api.upload_object( + repository=repo_name, + branch=branch_name, + path=path, + content=data + ) + elif op_type == "delete": + return objects_api.delete_object( + repository=repo_name, + branch=branch_name, + path=path + ) + + # Execute operations in parallel using Generated SDK + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + futures = [executor.submit(execute_operation, op) for op in operations] + results = [future.result() for future in concurrent.futures.as_completed(futures)] + + return results + +# Usage +operations = [ + ("upload", "data/file1.txt", b"Content 1"), + ("upload", "data/file2.txt", b"Content 2"), + ("delete", "old/file.txt", None) +] + +results = optimized_batch_operations("my-repo", "main", operations) +``` + +## Best Practices + +### When to Use Each Approach + +**Use High-Level SDK for:** +- Common operations (upload, download, commit) +- Transaction-based workflows +- Streaming I/O operations +- Simplified error handling + +**Use Generated SDK for:** +- Advanced API parameters +- Operations not available in High-Level SDK +- Performance-critical batch operations +- Custom integrations + +### Consistent Client Management +```python +import lakefs + +class UnifiedLakeFSClient: + def __init__(self, repo_name): + self.repo_name = repo_name + self.high_level_repo = lakefs.repository(repo_name) + self.generated_client = self.high_level_repo.client.sdk + + def get_high_level_branch(self, branch_name): + """Get High-Level SDK branch object""" + return self.high_level_repo.branch(branch_name) + + def get_generated_api(self, api_class): + """Get Generated SDK API instance""" + return api_class(self.generated_client) + + def mixed_operation(self, branch_name, file_path, content): + """Example of mixed High-Level and Generated SDK usage""" + # Use High-Level SDK for upload + branch = self.get_high_level_branch(branch_name) + obj = branch.object(file_path).upload(data=content) + + # Use Generated SDK for advanced stats + objects_api = self.get_generated_api(lakefs_sdk.ObjectsApi) + stats = objects_api.stat_object( + repository=self.repo_name, + ref=branch_name, + path=file_path, + presign=True + ) + + return obj, stats + +# Usage +client = UnifiedLakeFSClient("my-repo") +obj, stats = client.mixed_operation("main", "data/file.txt", "Hello World") +``` + +## Next Steps + +- Review [High-Level SDK documentation](../high-level-sdk/) for comparison +- Check [Generated SDK API reference](api-reference.md) for complete API coverage +- Explore [best 
practices](../reference/best-practices.md) for optimal usage patterns \ No newline at end of file diff --git a/docs/src/integrations/python/generated-sdk/examples.md b/docs/src/integrations/python/generated-sdk/examples.md new file mode 100644 index 00000000000..410d0e9eef4 --- /dev/null +++ b/docs/src/integrations/python/generated-sdk/examples.md @@ -0,0 +1,867 @@ +--- +title: Generated SDK Usage Examples +description: Common usage patterns and examples for the Generated Python SDK +sdk_types: ["generated"] +difficulty: "intermediate" +use_cases: ["examples", "patterns", "direct-api", "custom-operations"] +topics: ["examples", "patterns", "usage", "api-calls"] +audience: ["developers", "advanced-users", "integrators"] +last_updated: "2024-01-15" +--- + +# Generated SDK Usage Examples + +Comprehensive examples and patterns for using the Generated Python SDK effectively in various scenarios. + +## Basic Setup and Configuration + +### Client Initialization +```python +import lakefs_sdk +from lakefs_sdk.client import LakeFSClient + +# Basic client setup +config = lakefs_sdk.Configuration( + host="http://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" +) + +client = LakeFSClient(configuration=config) +``` + +### Production Configuration +```python +import os + +# Production-ready configuration +config = lakefs_sdk.Configuration( + host=os.getenv('LAKEFS_ENDPOINT'), + username=os.getenv('LAKEFS_ACCESS_KEY_ID'), + password=os.getenv('LAKEFS_SECRET_ACCESS_KEY'), + verify_ssl=True, + ssl_ca_cert=os.getenv('LAKEFS_CA_CERT'), + timeout=30 +) + +client = LakeFSClient(configuration=config) +``` + +## Repository Operations + +### Repository Management +```python +repositories_api = lakefs_sdk.RepositoriesApi(client) + +# Create repository +def create_repository(name, storage_namespace): + try: + repo = repositories_api.create_repository( + repository_creation=lakefs_sdk.RepositoryCreation( + name=name, + storage_namespace=storage_namespace, + default_branch="main" + ) + ) + print(f"Created repository: {repo.id}") + return repo + except lakefs_sdk.ConflictException: + print(f"Repository {name} already exists") + return repositories_api.get_repository(repository=name) + +# List all repositories +def list_repositories(): + repos = repositories_api.list_repositories() + for repo in repos.results: + print(f"Repository: {repo.id}") + print(f" Storage: {repo.storage_namespace}") + print(f" Default branch: {repo.default_branch}") + print(f" Created: {repo.creation_date}") +``` + +## Branch Operations + +### Branch Management +```python +branches_api = lakefs_sdk.BranchesApi(client) + +# Create branch from main +def create_feature_branch(repository, branch_name, source_ref="main"): + try: + branch = branches_api.create_branch( + repository=repository, + branch_creation=lakefs_sdk.BranchCreation( + name=branch_name, + source=source_ref + ) + ) + print(f"Created branch: {branch.id}") + return branch + except lakefs_sdk.ConflictException: + print(f"Branch {branch_name} already exists") + return branches_api.get_branch(repository=repository, branch=branch_name) + +# List branches with details +def list_branches_detailed(repository): + branches = branches_api.list_branches(repository=repository) + for branch in branches.results: + print(f"Branch: {branch.id}") + print(f" Commit: {branch.commit_id}") +``` + +## Object Operations + +### Object Upload and Download +```python +objects_api = lakefs_sdk.ObjectsApi(client) + +# Upload text content +def 
upload_text_object(repository, branch, path, content): + try: + objects_api.upload_object( + repository=repository, + branch=branch, + path=path, + content=content.encode('utf-8') + ) + print(f"Uploaded: {path}") + except Exception as e: + print(f"Upload failed: {e}") + +# Upload binary file +def upload_file(repository, branch, remote_path, local_path): + with open(local_path, 'rb') as f: + content = f.read() + objects_api.upload_object( + repository=repository, + branch=branch, + path=remote_path, + content=content + ) + +# Download object +def download_object(repository, ref, path): + try: + response = objects_api.get_object( + repository=repository, + ref=ref, + path=path + ) + return response.read() + except lakefs_sdk.NotFoundException: + print(f"Object not found: {path}") + return None + +# Get object metadata +def get_object_info(repository, ref, path): + try: + stats = objects_api.stat_object( + repository=repository, + ref=ref, + path=path + ) + print(f"Object: {stats.path}") + print(f" Size: {stats.size_bytes} bytes") + print(f" Content-Type: {stats.content_type}") + print(f" Checksum: {stats.checksum}") + print(f" Modified: {stats.mtime}") + return stats + except lakefs_sdk.NotFoundException: + print(f"Object not found: {path}") + return None +``` + +### Batch Object Operations +```python +def batch_upload(repository, branch, files_data): + """Upload multiple files efficiently""" + uploaded = [] + failed = [] + + for path, content in files_data.items(): + try: + objects_api.upload_object( + repository=repository, + branch=branch, + path=path, + content=content if isinstance(content, bytes) else content.encode('utf-8') + ) + uploaded.append(path) + except Exception as e: + failed.append((path, str(e))) + + print(f"Uploaded: {len(uploaded)} files") + if failed: + print(f"Failed: {len(failed)} files") + for path, error in failed: + print(f" {path}: {error}") + + return uploaded, failed + +# Usage +files = { + "data/file1.txt": "Content 1", + "data/file2.txt": "Content 2", + "data/file3.json": '{"key": "value"}' +} + +uploaded, failed = batch_upload("my-repo", "main", files) +``` + +## Commit Operations + +### Creating Commits +```python +commits_api = lakefs_sdk.CommitsApi(client) + +# Simple commit +def create_commit(repository, branch, message): + try: + commit = commits_api.commit( + repository=repository, + branch=branch, + commit_creation=lakefs_sdk.CommitCreation( + message=message + ) + ) + print(f"Created commit: {commit.id}") + return commit + except Exception as e: + print(f"Commit failed: {e}") + return None + +# Commit with metadata +def create_commit_with_metadata(repository, branch, message, metadata): + commit = commits_api.commit( + repository=repository, + branch=branch, + commit_creation=lakefs_sdk.CommitCreation( + message=message, + metadata=metadata + ) + ) + return commit + +# Get commit history +def get_commit_history(repository, ref, limit=10): + commits = commits_api.log_commits( + repository=repository, + ref=ref, + amount=limit + ) + + for commit in commits.results: + print(f"Commit: {commit.id}") + print(f" Message: {commit.message}") + print(f" Author: {commit.committer}") + print(f" Date: {commit.creation_date}") + if commit.metadata: + print(f" Metadata: {commit.metadata}") + print() +``` + +## Diff and Merge Operations + +### Comparing References +```python +refs_api = lakefs_sdk.RefsApi(client) + +# Get diff between branches +def diff_branches(repository, left_ref, right_ref): + try: + diff = refs_api.diff_refs( + repository=repository, + 
left_ref=left_ref, + right_ref=right_ref + ) + + print(f"Diff between {left_ref} and {right_ref}:") + for change in diff.results: + print(f" {change.type}: {change.path}") + if change.size_bytes: + print(f" Size: {change.size_bytes} bytes") + + return diff + except Exception as e: + print(f"Diff failed: {e}") + return None + +# Merge branches +def merge_branches(repository, source_ref, destination_branch, message): + try: + result = refs_api.merge_into_branch( + repository=repository, + source_ref=source_ref, + destination_branch=destination_branch, + merge=lakefs_sdk.Merge( + message=message, + strategy="dest-wins" # or "source-wins" + ) + ) + print(f"Merge successful: {result.reference}") + return result + except lakefs_sdk.ConflictException as e: + print(f"Merge conflict: {e}") + return None + except Exception as e: + print(f"Merge failed: {e}") + return None +``` + +## Tag Management + +### Working with Tags +```python +tags_api = lakefs_sdk.TagsApi(api_client) + +# Create semantic version tags +def create_version_tag(repository, version, commit_ref): + """Create a semantic version tag""" + try: + tag = tags_api.create_tag( + repository=repository, + tag_creation=lakefs_sdk.models.TagCreation( + id=f"v{version}", + ref=commit_ref + ) + ) + print(f"Created tag: {tag.id} -> {tag.commit_id}") + return tag + except lakefs_sdk.rest.ApiException as e: + if e.status == 409: + print(f"Tag v{version} already exists") + else: + print(f"Failed to create tag: {e}") + return None + +# List and filter tags +def list_version_tags(repository): + """List all version tags""" + try: + tags = tags_api.list_tags( + repository=repository, + prefix="v", # Only version tags + amount=100 + ) + + version_tags = [] + for tag in tags.results: + if tag.id.startswith("v"): + version_tags.append({ + 'version': tag.id, + 'commit': tag.commit_id, + 'tag_object': tag + }) + + # Sort by version (simple string sort) + version_tags.sort(key=lambda x: x['version']) + return version_tags + + except Exception as e: + print(f"Failed to list tags: {e}") + return [] + +# Usage example +version_tags = list_version_tags("my-repo") +for tag_info in version_tags: + print(f"Version: {tag_info['version']} (commit: {tag_info['commit'][:8]})") +``` + +## Advanced Object Operations + +### Presigned URLs and Metadata +```python +# Generate presigned URLs for external access +def generate_presigned_urls(repository, ref, paths, expires_in=3600): + """Generate presigned URLs for multiple objects""" + presigned_urls = {} + + for path in paths: + try: + # Get presigned URL for download + response = objects_api.get_object( + repository=repository, + ref=ref, + path=path, + presign=True, + presign_expires=expires_in + ) + presigned_urls[path] = response.url + + except Exception as e: + print(f"Failed to generate presigned URL for {path}: {e}") + presigned_urls[path] = None + + return presigned_urls + +# Object metadata operations +def update_object_metadata(repository, branch, path, metadata): + """Update object user metadata""" + try: + # Note: This requires the experimental API + experimental_api = lakefs_sdk.ExperimentalApi(api_client) + + result = experimental_api.update_object_user_metadata( + repository=repository, + branch=branch, + path=path, + object_user_metadata=lakefs_sdk.models.ObjectUserMetadata( + metadata=metadata + ) + ) + print(f"Updated metadata for {path}") + return result + + except Exception as e: + print(f"Failed to update metadata: {e}") + return None + +# Copy objects with metadata preservation +def 
copy_object_with_metadata(repository, source_branch, dest_branch, source_path, dest_path): + """Copy object preserving metadata""" + try: + # First get source object metadata + source_stats = objects_api.stat_object( + repository=repository, + ref=source_branch, + path=source_path, + user_metadata=True + ) + + # Copy the object + copy_result = objects_api.copy_object( + repository=repository, + branch=dest_branch, + dest_path=dest_path, + object_copy_creation=lakefs_sdk.models.ObjectCopyCreation( + src_path=source_path, + src_ref=source_branch + ) + ) + + print(f"Copied {source_path} to {dest_path}") + print(f"Preserved metadata: {source_stats.metadata}") + return copy_result + + except Exception as e: + print(f"Copy failed: {e}") + return None +``` + +## Advanced Pagination Patterns + +### Efficient Pagination with Filtering +```python +def list_all_objects_paginated(repository, ref, prefix=""): + """List all objects handling pagination efficiently""" + all_objects = [] + after = "" + + while True: + try: + response = objects_api.list_objects( + repository=repository, + ref=ref, + prefix=prefix, + after=after, + amount=1000 # Page size + ) + + all_objects.extend(response.results) + + if not response.pagination.has_more: + break + + after = response.pagination.next_offset + + except Exception as e: + print(f"Error listing objects: {e}") + break + + return all_objects + +def find_objects_by_pattern(repository, ref, pattern, prefix=""): + """Find objects matching a pattern across all pages""" + import re + matching_objects = [] + after = "" + + compiled_pattern = re.compile(pattern) + + while True: + try: + response = objects_api.list_objects( + repository=repository, + ref=ref, + prefix=prefix, + after=after, + amount=1000 + ) + + # Filter objects matching pattern + for obj in response.results: + if compiled_pattern.search(obj.path): + matching_objects.append(obj) + + if not response.pagination.has_more: + break + + after = response.pagination.next_offset + + except Exception as e: + print(f"Error searching objects: {e}") + break + + return matching_objects + +# Usage examples +all_csv_files = find_objects_by_pattern("my-repo", "main", r"\.csv$", "data/") +print(f"Found {len(all_csv_files)} CSV files") + +large_files = [] +for obj in list_all_objects_paginated("my-repo", "main"): + if obj.size_bytes > 1024 * 1024: # Files larger than 1MB + large_files.append(obj) +``` + +## Concurrent Operations + +### Parallel Processing with Threading +```python +import concurrent.futures +import threading +from typing import List, Tuple + +def parallel_object_operations(repository: str, branch: str, operations: List[Tuple[str, str, bytes]]): + """Perform multiple object operations in parallel""" + + def upload_single_object(operation): + op_type, path, content = operation + try: + if op_type == "upload": + result = objects_api.upload_object( + repository=repository, + branch=branch, + path=path, + content=content + ) + return ("success", path, result) + elif op_type == "delete": + objects_api.delete_object( + repository=repository, + branch=branch, + path=path + ) + return ("success", path, None) + except Exception as e: + return ("error", path, str(e)) + + # Execute operations in parallel + results = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + future_to_operation = { + executor.submit(upload_single_object, op): op + for op in operations + } + + for future in concurrent.futures.as_completed(future_to_operation): + result = future.result() + results.append(result) + + # 
Summarize results + successful = [r for r in results if r[0] == "success"] + failed = [r for r in results if r[0] == "error"] + + print(f"Completed {len(successful)} operations successfully") + if failed: + print(f"Failed {len(failed)} operations:") + for status, path, error in failed: + print(f" {path}: {error}") + + return results + +# Usage +operations = [ + ("upload", "data/file1.txt", b"Content 1"), + ("upload", "data/file2.txt", b"Content 2"), + ("upload", "data/file3.txt", b"Content 3"), + ("delete", "old/deprecated.txt", None) +] + +results = parallel_object_operations("my-repo", "main", operations) +``` + +## Advanced Error Handling + +### Comprehensive Error Handling +```python +import time +import random +from lakefs_sdk.rest import ApiException + +class LakeFSRetryHandler: + """Advanced retry handler for lakeFS operations""" + + def __init__(self, max_retries=3, base_delay=1, max_delay=60): + self.max_retries = max_retries + self.base_delay = base_delay + self.max_delay = max_delay + + def execute_with_retry(self, operation, *args, **kwargs): + """Execute operation with exponential backoff retry""" + last_exception = None + + for attempt in range(self.max_retries + 1): + try: + return operation(*args, **kwargs) + + except ApiException as e: + last_exception = e + + # Don't retry client errors (4xx) except rate limiting + if 400 <= e.status < 500 and e.status != 429: + raise + + # Don't retry on last attempt + if attempt == self.max_retries: + raise + + # Calculate delay with jitter + delay = min( + self.base_delay * (2 ** attempt) + random.uniform(0, 1), + self.max_delay + ) + + print(f"Attempt {attempt + 1} failed (status: {e.status}), " + f"retrying in {delay:.2f}s...") + time.sleep(delay) + + except Exception as e: + last_exception = e + + # Don't retry non-API exceptions on last attempt + if attempt == self.max_retries: + raise + + delay = min(self.base_delay * (2 ** attempt), self.max_delay) + print(f"Attempt {attempt + 1} failed, retrying in {delay:.2f}s...") + time.sleep(delay) + + # This should never be reached, but just in case + raise last_exception + +# Usage +retry_handler = LakeFSRetryHandler(max_retries=3) + +try: + repo = retry_handler.execute_with_retry( + repositories_api.get_repository, + repository="my-repo" + ) + print(f"Successfully retrieved repository: {repo.id}") + +except ApiException as e: + print(f"Failed after retries: HTTP {e.status} - {e.reason}") + +except Exception as e: + print(f"Unexpected error: {e}") + +# Context manager version +class RetryContext: + """Context manager for retry operations""" + + def __init__(self, max_retries=3): + self.retry_handler = LakeFSRetryHandler(max_retries=max_retries) + + def __enter__(self): + return self.retry_handler + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + +# Usage with context manager +with RetryContext(max_retries=5) as retry: + branches = retry.execute_with_retry( + branches_api.list_branches, + repository="my-repo" + ) +``` + +## Working with Large Files and Streaming + +### Efficient Large File Handling +```python +import os +from pathlib import Path + +def upload_large_file_chunked(repository, branch, remote_path, local_path, chunk_size=8*1024*1024): + """Upload large files efficiently""" + file_size = os.path.getsize(local_path) + print(f"Uploading {local_path} ({file_size:,} bytes) to {remote_path}") + + try: + with open(local_path, 'rb') as f: + content = f.read() + + # For very large files, consider using multipart upload + # (requires experimental API) + result = 
objects_api.upload_object( + repository=repository, + branch=branch, + path=remote_path, + content=content + ) + + print(f"Upload completed: {result.path}") + return result + + except Exception as e: + print(f"Upload failed: {e}") + return None + +def download_with_progress(repository, ref, remote_path, local_path): + """Download file with progress indication""" + try: + # Get object stats first to show progress + stats = objects_api.stat_object( + repository=repository, + ref=ref, + path=remote_path + ) + + print(f"Downloading {remote_path} ({stats.size_bytes:,} bytes)...") + + # Download the object + response = objects_api.get_object( + repository=repository, + ref=ref, + path=remote_path + ) + + # Write to local file + with open(local_path, 'wb') as f: + content = response.read() + f.write(content) + + print(f"Download completed: {local_path}") + return True + + except Exception as e: + print(f"Download failed: {e}") + return False + +def batch_download_directory(repository, ref, remote_prefix, local_dir): + """Download all objects with a prefix to local directory""" + local_path = Path(local_dir) + local_path.mkdir(parents=True, exist_ok=True) + + # List all objects with the prefix + objects = list_all_objects_paginated(repository, ref, remote_prefix) + + downloaded = [] + failed = [] + + for obj in objects: + # Calculate local file path + relative_path = obj.path[len(remote_prefix):].lstrip('/') + local_file_path = local_path / relative_path + + # Create parent directories + local_file_path.parent.mkdir(parents=True, exist_ok=True) + + # Download the file + if download_with_progress(repository, ref, obj.path, str(local_file_path)): + downloaded.append(obj.path) + else: + failed.append(obj.path) + + print(f"Downloaded {len(downloaded)} files, {len(failed)} failed") + return downloaded, failed + +# Usage examples +upload_large_file_chunked("my-repo", "main", "data/large-dataset.csv", "/path/to/large-file.csv") + +downloaded, failed = batch_download_directory( + "my-repo", "main", "data/exports/", "/local/download/dir" +) +``` + +### Error Handling Patterns +```python +def robust_api_call(api_func, *args, **kwargs): + """Wrapper for robust API calls with retry logic""" + max_retries = 3 + + for attempt in range(max_retries): + try: + return api_func(*args, **kwargs) + except lakefs_sdk.ApiException as e: + if e.status == 429: # Rate limited + time.sleep(2 ** attempt) # Exponential backoff + continue + elif e.status >= 500: # Server error + if attempt < max_retries - 1: + time.sleep(1) + continue + raise + except Exception as e: + if attempt < max_retries - 1: + time.sleep(1) + continue + raise + + raise Exception(f"Failed after {max_retries} attempts") + +# Usage +try: + repo = robust_api_call( + repositories_api.get_repository, + repository="my-repo" + ) +except Exception as e: + print(f"Failed to get repository: {e}") +``` + +### Working with Large Files +```python +def upload_large_file(repository, branch, remote_path, local_path, chunk_size=8192): + """Upload large files in chunks""" + try: + with open(local_path, 'rb') as f: + content = f.read() + + objects_api.upload_object( + repository=repository, + branch=branch, + path=remote_path, + content=content + ) + + print(f"Uploaded large file: {remote_path}") + + except Exception as e: + print(f"Large file upload failed: {e}") + +def download_large_file(repository, ref, remote_path, local_path): + """Download large files efficiently""" + try: + response = objects_api.get_object( + repository=repository, + ref=ref, + path=remote_path + 
) + + with open(local_path, 'wb') as f: + f.write(response.read()) + + print(f"Downloaded large file: {local_path}") + + except Exception as e: + print(f"Large file download failed: {e}") +``` + +## Next Steps + +- Learn about [direct access](direct-access.md) from High-Level SDK +- Review the [complete API reference](api-reference.md) +- Check the [official Generated SDK documentation](https://pydocs-sdk.lakefs.io) \ No newline at end of file diff --git a/docs/src/integrations/python/generated-sdk/index.md b/docs/src/integrations/python/generated-sdk/index.md new file mode 100644 index 00000000000..dfdaab6a161 --- /dev/null +++ b/docs/src/integrations/python/generated-sdk/index.md @@ -0,0 +1,383 @@ +--- +title: Generated Python SDK +description: Direct API access using the Generated Python SDK +sdk_types: ["generated"] +difficulty: "intermediate" +use_cases: ["direct-api", "custom-operations", "advanced-integration"] +topics: ["api-access", "openapi", "low-level"] +audience: ["developers", "advanced-users", "integrators"] +last_updated: "2024-01-15" +--- + +# Generated Python SDK + +The Generated Python SDK provides direct access to the lakeFS API based on the OpenAPI specification. It offers complete API coverage and is the foundation for the High-Level SDK. + +## Overview + +The Generated SDK is automatically generated from lakeFS's OpenAPI specification, ensuring it stays up-to-date with all API features. It provides low-level access to every lakeFS API endpoint with full type safety and validation. + +### Key Features + +- **Complete API Coverage** - Access to all lakeFS API endpoints +- **OpenAPI-Based** - Generated from the official lakeFS OpenAPI specification +- **Type Safety** - Full type hints and model validation +- **Direct Control** - Low-level access for custom operations +- **Synchronous Operations** - Straightforward request/response patterns + +### Relationship to High-Level SDK + +The High-Level SDK is built on top of the Generated SDK, providing: +- Simplified interfaces for common operations +- Advanced features like transactions and streaming I/O +- Pythonic abstractions over raw API calls + +You can access the Generated SDK client directly from High-Level SDK objects when needed. 
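+
+As a quick illustration, here is a minimal sketch of that hybrid pattern. It follows the access path used in the [Direct Access](direct-access.md) guide (`repo.client.sdk`); the repository name is a placeholder, and the snippet is indicative rather than the only way to combine the two SDKs:
+
+```python
+import lakefs
+import lakefs_sdk
+
+# High-Level SDK object for everyday operations
+repo = lakefs.repository("example-repo")
+
+# Drop down to the underlying Generated SDK client for endpoints
+# the High-Level SDK does not wrap (access path as shown in the
+# Direct Access guide)
+generated_client = repo.client.sdk
+tags_api = lakefs_sdk.TagsApi(generated_client)
+
+for tag in tags_api.list_tags(repository="example-repo").results:
+    print(tag.id)
+```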
+ +## When to Use Generated SDK + +Choose the Generated SDK when you need: + +### Custom Operations +- Operations not available in High-Level SDK +- Access to experimental or internal APIs +- Fine-grained control over API parameters + +### Performance Critical Applications +- Direct API access for optimal performance +- Minimal overhead for high-throughput scenarios +- Custom retry and error handling logic + +### Advanced Integration Development +- Building custom integrations or tools +- Implementing complex workflows +- Need for all API parameters and options + +### Migration from REST API +- Converting existing REST API calls to Python +- Maintaining compatibility with existing code patterns +- Direct mapping from API documentation + +## Installation and Setup + +### Installation + +Install the Generated SDK using pip: + +```bash +pip install lakefs-sdk +``` + +### Basic Configuration + +The Generated SDK uses a configuration object to manage connection settings: + +```python +import lakefs_sdk + +# Create configuration +configuration = lakefs_sdk.Configuration( + host="http://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" +) + +# Create API client +with lakefs_sdk.ApiClient(configuration) as api_client: + # Use the client with API classes + repositories_api = lakefs_sdk.RepositoriesApi(api_client) + repos = repositories_api.list_repositories() +``` + +### Authentication Methods + +#### Basic Authentication (Username/Password) + +```python +import lakefs_sdk + +configuration = lakefs_sdk.Configuration( + host="http://localhost:8000", + username="your-access-key-id", + password="your-secret-access-key" +) +``` + +#### Bearer Token Authentication + +```python +import lakefs_sdk + +configuration = lakefs_sdk.Configuration( + host="http://localhost:8000", + access_token="your-jwt-token" +) +``` + +#### API Key Authentication + +```python +import lakefs_sdk + +configuration = lakefs_sdk.Configuration( + host="http://localhost:8000" +) + +# Set API key for specific auth method +configuration.api_key['cookie_auth'] = 'your-api-key' +configuration.api_key_prefix['cookie_auth'] = 'Bearer' # Optional prefix +``` + +#### Environment Variables + +You can also configure authentication using environment variables: + +```python +import os +import lakefs_sdk + +configuration = lakefs_sdk.Configuration( + host=os.environ.get("LAKEFS_HOST", "http://localhost:8000"), + username=os.environ.get("LAKEFS_ACCESS_KEY_ID"), + password=os.environ.get("LAKEFS_SECRET_ACCESS_KEY") +) +``` + +### Advanced Configuration Options + +#### SSL and TLS Settings + +```python +import lakefs_sdk + +configuration = lakefs_sdk.Configuration( + host="https://your-lakefs-instance.com", + username="your-access-key-id", + password="your-secret-access-key" +) + +# SSL verification (default: True) +configuration.verify_ssl = True + +# Custom SSL certificate path +configuration.ssl_ca_cert = "/path/to/ca-cert.pem" + +# Client certificate authentication +configuration.cert_file = "/path/to/client-cert.pem" +configuration.key_file = "/path/to/client-key.pem" +``` + +#### Proxy Configuration + +```python +import lakefs_sdk + +configuration = lakefs_sdk.Configuration( + host="http://localhost:8000", + username="your-access-key-id", + password="your-secret-access-key" +) + +# HTTP proxy +configuration.proxy = "http://proxy.example.com:8080" + +# HTTPS proxy +configuration.proxy_headers = { + 'Proxy-Authorization': 'Basic dXNlcjpwYXNz' +} +``` + +#### Timeout and Retry Settings + 
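+Request timeouts in the Generated SDK are usually supplied per call rather than on the configuration object. The sketch below uses the `_request_timeout` keyword that OpenAPI-generated Python clients accept on every API method; the `configuration.retries` attribute is an assumption based on recent generator templates, so verify it against your installed lakefs-sdk version:
+
+```python
+import lakefs_sdk
+
+configuration = lakefs_sdk.Configuration(
+    host="http://localhost:8000",
+    username="your-access-key-id",
+    password="your-secret-access-key"
+)
+
+# Assumption: recent OpenAPI generator templates expose a `retries`
+# attribute that is passed down to urllib3; confirm for your SDK version.
+configuration.retries = 3
+
+with lakefs_sdk.ApiClient(configuration) as api_client:
+    repositories_api = lakefs_sdk.RepositoriesApi(api_client)
+    # Per-request timeout in seconds (standard generated-client kwarg)
+    repos = repositories_api.list_repositories(_request_timeout=30)
+```
+
+The configuration object itself also carries connection-pool and client-identification settings:
+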
+```python
+import lakefs_sdk
+
+configuration = lakefs_sdk.Configuration(
+    host="http://localhost:8000",
+    username="your-access-key-id",
+    password="your-secret-access-key"
+)
+
+# Maximum number of pooled connections kept open to the server
+configuration.connection_pool_maxsize = 10
+
+# Custom user agent
+configuration.user_agent = "MyApp/1.0 lakefs-sdk"
+```
+
+## Basic Usage Pattern
+
+### Standard API Client Usage
+
+```python
+import lakefs_sdk
+from lakefs_sdk.rest import ApiException
+
+# Configure client
+configuration = lakefs_sdk.Configuration(
+    host="http://localhost:8000",
+    username="AKIAIOSFODNN7EXAMPLE",
+    password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
+)
+
+# Use context manager for proper resource cleanup
+with lakefs_sdk.ApiClient(configuration) as api_client:
+    try:
+        # Create API instance
+        repositories_api = lakefs_sdk.RepositoriesApi(api_client)
+
+        # Make API call
+        repositories = repositories_api.list_repositories()
+
+        # Process results
+        for repo in repositories.results:
+            print(f"Repository: {repo.id}")
+
+    except ApiException as e:
+        print(f"API Error: {e.status} - {e.reason}")
+        print(f"Response body: {e.body}")
+```
+
+### Error Handling
+
+The Generated SDK raises `ApiException` for API errors:
+
+```python
+import lakefs_sdk
+from lakefs_sdk.rest import ApiException
+
+with lakefs_sdk.ApiClient(configuration) as api_client:
+    repositories_api = lakefs_sdk.RepositoriesApi(api_client)
+
+    try:
+        repo = repositories_api.get_repository("nonexistent-repo")
+    except ApiException as e:
+        if e.status == 404:
+            print("Repository not found")
+        elif e.status == 401:
+            print("Authentication failed")
+        elif e.status == 403:
+            print("Access denied")
+        else:
+            print(f"API error: {e.status} - {e.reason}")
+```
+
+## Quick Start Example
+
+Here's a complete example showing common operations:
+
+```python
+import lakefs_sdk
+from lakefs_sdk.rest import ApiException
+
+# Configuration
+configuration = lakefs_sdk.Configuration(
+    host="http://localhost:8000",
+    username="AKIAIOSFODNN7EXAMPLE",
+    password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
+)
+
+with lakefs_sdk.ApiClient(configuration) as api_client:
+    try:
+        # Initialize API classes
+        repositories_api = lakefs_sdk.RepositoriesApi(api_client)
+        branches_api = lakefs_sdk.BranchesApi(api_client)
+        objects_api = lakefs_sdk.ObjectsApi(api_client)
+
+        # List repositories
+        print("Repositories:")
+        repos = repositories_api.list_repositories()
+        for repo in repos.results:
+            print(f"  - {repo.id}")
+
+        # Get repository details
+        if repos.results:
+            repo_id = repos.results[0].id
+            repo = repositories_api.get_repository(repo_id)
+            print(f"\nRepository '{repo_id}' default branch: {repo.default_branch}")
+
+            # List branches
+            branches = branches_api.list_branches(repo_id)
+            print(f"Branches in {repo_id}:")
+            for branch in branches.results:
+                print(f"  - {branch.id} (commit: {branch.commit_id})")
+
+            # List objects in main branch
+            if branches.results:
+                branch_id = branches.results[0].id
+                objects = objects_api.list_objects(repo_id, branch_id)
+                print(f"\nObjects in {repo_id}/{branch_id}:")
+                for obj in objects.results:
+                    print(f"  - {obj.path} ({obj.size_bytes} bytes)")
+
+    except ApiException as e:
+        print(f"Error: {e.status} - {e.reason}")
+```
+
+**Expected Output:**
+```
+Repositories:
+  - my-repo
+  - data-lake
+
+Repository 'my-repo' default branch: main
+
+Branches in my-repo:
+  - main (commit: c7a632d0a7c4c9b5e8f1a2b3c4d5e6f7a8b9c0d1)
+  - feature-branch (commit: d8b743e1b8d5d0c6f9a2b4c5d6e7f8a9b0c1d2e3)
+
+Objects in my-repo/main:
+  - 
data/users.csv (1024 bytes) + - models/model.pkl (2048 bytes) +``` + +## Documentation Sections + +- **[API Reference](api-reference.md)** - Complete API classes and methods +- **[Usage Examples](examples.md)** - Common usage patterns and operations +- **[Direct Access](direct-access.md)** - Accessing Generated SDK from High-Level SDK + +## Next Steps + +- Review the [API reference](api-reference.md) for complete documentation +- See [usage examples](examples.md) for common patterns +- Learn about [direct access](direct-access.md) from High-Level SDK +- Check the [troubleshooting guide](../reference/troubleshooting.md) for common issues + +## See Also + +**SDK Selection and Comparison:** +- [Python SDK Overview](../index.md) - Compare all Python SDK options +- [SDK Decision Matrix](../index.md#sdk-selection-decision-matrix) - Choose the right SDK +- [API Feature Comparison](../reference/api-comparison.md) - Detailed feature comparison + +**Generated SDK Documentation:** +- [API Reference](api-reference.md) - Complete API classes and methods documentation +- [Usage Examples](examples.md) - Common patterns and operations +- [Direct Access from High-Level SDK](direct-access.md) - Using Generated SDK with High-Level SDK + +**Alternative SDK Options:** +- [High-Level SDK](../high-level-sdk/index.md) - Simplified Python interface built on Generated SDK +- [High-Level SDK Quickstart](../high-level-sdk/quickstart.md) - Easy-to-use interface for common operations +- [lakefs-spec](../lakefs-spec/index.md) - Filesystem interface for data science workflows +- [Boto3 Integration](../boto3/index.md) - S3-compatible operations + +**Setup and Configuration:** +- [Installation Guide](../getting-started.md) - Complete setup instructions for all SDKs +- [Authentication Methods](../getting-started.md#authentication-and-configuration) - All credential options +- [Best Practices](../reference/best-practices.md#configuration) - Production configuration guidance + +**Learning Resources:** +- [Data Science Tutorial](../tutorials/data-science-workflow.md) - End-to-end workflow examples +- [ETL Pipeline Tutorial](../tutorials/etl-pipeline.md) - Building data pipelines +- [Advanced Integration Patterns](../reference/best-practices.md#integration-patterns) - Custom tooling development + +**Reference Materials:** +- [Error Handling Guide](../reference/troubleshooting.md#error-handling) - Exception handling strategies +- [Performance Optimization](../reference/best-practices.md#performance) - Optimize API usage +- [Security Best Practices](../reference/best-practices.md#security) - Secure API access patterns + +**External Resources:** +- [Generated SDK API Documentation](https://pydocs-sdk.lakefs.io){:target="_blank"} - Auto-generated API reference +- [lakeFS OpenAPI Specification](https://docs.lakefs.io/reference/api.html){:target="_blank"} - Complete API specification +- [lakeFS REST API Documentation](https://docs.lakefs.io/reference/api.html){:target="_blank"} - REST API reference \ No newline at end of file diff --git a/docs/src/integrations/python/getting-started.md b/docs/src/integrations/python/getting-started.md new file mode 100644 index 00000000000..c5402bf1d1f --- /dev/null +++ b/docs/src/integrations/python/getting-started.md @@ -0,0 +1,511 @@ +--- +title: Getting Started with Python and lakeFS +description: Complete installation and setup guide for all Python SDK options +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "beginner" +use_cases: ["getting-started", "installation", 
"authentication"] +topics: ["setup", "configuration", "credentials"] +audience: ["data-engineers", "data-scientists", "developers"] +last_updated: "2024-01-15" +--- + +# Getting Started with Python and lakeFS + +This comprehensive guide walks you through installing and configuring Python SDKs for lakeFS. Follow the steps for your chosen SDK to get up and running quickly. + +## Prerequisites + +Before you begin, ensure you have: + +- **Python 3.8 or higher** (check with `python --version`) +- **pip** package manager +- **Access to a lakeFS instance** (local or remote) +- **lakeFS credentials** (access key ID and secret access key) + +## Quick SDK Selection + +Not sure which SDK to choose? See our [SDK comparison](index.md#comprehensive-sdk-comparison) or use the [decision matrix](index.md#sdk-selection-decision-matrix). + +| SDK | Installation | Best For | +|-----|-------------|----------| +| [High-Level SDK](#high-level-sdk-installation) | `pip install lakefs` | Most users, data pipelines | +| [Generated SDK](#generated-sdk-installation) | `pip install lakefs-sdk` | Direct API access | +| [lakefs-spec](#lakefs-spec-installation) | `pip install lakefs-spec` | Data science workflows | +| [Boto3](#boto3-installation) | `pip install boto3` | S3 migration | + +## Installation Guide + +### High-Level SDK Installation + +The High-Level SDK provides the most user-friendly interface for lakeFS operations. + +#### Basic Installation +```bash +pip install lakefs +``` + +#### Development Installation +For the latest features and bug fixes: +```bash +pip install --upgrade lakefs +``` + +#### Virtual Environment (Recommended) +```bash +# Create virtual environment +python -m venv lakefs-env +source lakefs-env/bin/activate # On Windows: lakefs-env\Scripts\activate + +# Install SDK +pip install lakefs +``` + +#### Verify Installation +```python +import lakefs +print(lakefs.__version__) +``` + +### Generated SDK Installation + +The Generated SDK provides direct access to all lakeFS API endpoints. + +#### Basic Installation +```bash +pip install lakefs-sdk +``` + +#### With Optional Dependencies +```bash +# For async support (if available) +pip install lakefs-sdk[async] +``` + +#### Verify Installation +```python +import lakefs_sdk +print(lakefs_sdk.__version__) +``` + +### lakefs-spec Installation + +lakefs-spec provides filesystem-like operations and integrates with the fsspec ecosystem. + +#### Basic Installation +```bash +pip install lakefs-spec +``` + +#### With Data Science Dependencies +```bash +# For pandas integration +pip install lakefs-spec[pandas] + +# For complete data science stack +pip install lakefs-spec[all] +``` + +#### Verify Installation +```python +import lakefs_spec +print(lakefs_spec.__version__) +``` + +### Boto3 Installation + +Use Boto3 for S3-compatible operations with lakeFS. 
+ +#### Basic Installation +```bash +pip install boto3 +``` + +#### With Additional AWS Tools +```bash +# For AWS CLI compatibility +pip install boto3 awscli + +# For async operations +pip install aioboto3 +``` + +#### Verify Installation +```python +import boto3 +print(boto3.__version__) +``` + +### Installation Troubleshooting + +#### Common Issues + +**Permission Errors:** +```bash +# Use --user flag to install for current user only +pip install --user lakefs + +# Or use virtual environment (recommended) +python -m venv venv && source venv/bin/activate +``` + +**Version Conflicts:** +```bash +# Check for conflicts +pip check + +# Upgrade pip first +pip install --upgrade pip + +# Force reinstall +pip install --force-reinstall lakefs +``` + +**Network Issues:** +```bash +# Use different index +pip install -i https://pypi.org/simple/ lakefs + +# Install from wheel +pip install --only-binary=all lakefs +``` + +## Authentication and Configuration + +All Python SDKs support multiple authentication methods. Choose the method that best fits your deployment and security requirements. + +### Method 1: Environment Variables (Recommended for Development) + +Set environment variables in your shell or deployment environment: + +#### Linux/macOS +```bash +export LAKEFS_ENDPOINT=http://localhost:8000 +export LAKEFS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE +export LAKEFS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +``` + +#### Windows Command Prompt +```cmd +set LAKEFS_ENDPOINT=http://localhost:8000 +set LAKEFS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE +set LAKEFS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +``` + +#### Windows PowerShell +```powershell +$env:LAKEFS_ENDPOINT="http://localhost:8000" +$env:LAKEFS_ACCESS_KEY_ID="AKIAIOSFODNN7EXAMPLE" +$env:LAKEFS_SECRET_ACCESS_KEY="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" +``` + +#### Using .env Files +Create a `.env` file in your project directory: +```bash +LAKEFS_ENDPOINT=http://localhost:8000 +LAKEFS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE +LAKEFS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +``` + +Load with python-dotenv: +```python +from dotenv import load_dotenv +load_dotenv() + +import lakefs +# SDK will automatically use environment variables +``` + +### Method 2: Configuration File (Recommended for Production) + +#### lakectl Configuration File +Create `~/.lakectl.yaml` (compatible with lakectl CLI): +```yaml +credentials: + access_key_id: AKIAIOSFODNN7EXAMPLE + secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +server: + endpoint_url: http://localhost:8000 +``` + +#### Custom Configuration File +Create a custom YAML configuration file: +```yaml +# config/lakefs.yaml +lakefs: + endpoint: http://localhost:8000 + access_key_id: AKIAIOSFODNN7EXAMPLE + secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + verify_ssl: true +``` + +Load in Python: +```python +import yaml +import lakefs + +with open('config/lakefs.yaml', 'r') as f: + config = yaml.safe_load(f)['lakefs'] + +client = lakefs.Client( + host=config['endpoint'], + username=config['access_key_id'], + password=config['secret_access_key'], + verify_ssl=config.get('verify_ssl', True) +) +``` + +### Method 3: Programmatic Configuration + +#### High-Level SDK +```python +import lakefs + +# Basic configuration +client = lakefs.Client( + host="http://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" +) + +# Advanced configuration +client = lakefs.Client( + host="https://lakefs.example.com", + 
username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + verify_ssl=True, + ssl_ca_cert="/path/to/ca-bundle.pem", + proxy="http://proxy.example.com:8080" +) + +# Use client with repository operations +repo = lakefs.Repository("my-repo", client=client) +``` + +#### Generated SDK +```python +import lakefs_sdk +from lakefs_sdk.configuration import Configuration +from lakefs_sdk.api_client import ApiClient + +# Configure client +configuration = Configuration( + host="http://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" +) + +# Create API client +api_client = ApiClient(configuration) +``` + +#### lakefs-spec +```python +from lakefs_spec import LakeFSFileSystem + +# Using credentials directly +fs = LakeFSFileSystem( + host="http://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" +) + +# Auto-discover from ~/.lakectl.yaml +fs = LakeFSFileSystem() +``` + +#### Boto3 +```python +import boto3 +from botocore.config import Config + +# Basic S3 client configuration +s3_client = boto3.client( + 's3', + endpoint_url='http://localhost:8000', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' +) + +# Advanced configuration with SSL and checksums +config = Config( + request_checksum_calculation='when_required', + response_checksum_validation='when_required', + retries={'max_attempts': 3} +) + +s3_client = boto3.client( + 's3', + endpoint_url='https://lakefs.example.com', + aws_access_key_id='AKIAIOSFODNN7EXAMPLE', + aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', + config=config +) +``` + +### SSL/TLS Configuration + +#### Self-Signed Certificates (Development Only) +```python +import lakefs + +# Disable SSL verification (NOT for production) +client = lakefs.Client( + host="https://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + verify_ssl=False +) +``` + +!!! warning "Security Warning" + Disabling SSL verification allows man-in-the-middle attacks. Never use `verify_ssl=False` in production environments. + +#### Custom CA Certificates +```python +import lakefs + +# Use custom CA bundle +client = lakefs.Client( + host="https://lakefs.example.com", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + ssl_ca_cert="/path/to/ca-bundle.pem" +) +``` + +### Proxy Configuration + +#### HTTP/HTTPS Proxy +```python +import lakefs + +client = lakefs.Client( + host="http://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + proxy="http://proxy.example.com:8080" +) +``` + +#### Proxy with Authentication +```python +import lakefs + +client = lakefs.Client( + host="http://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + proxy="http://user:pass@proxy.example.com:8080" +) +``` + +### Testing Your Configuration + +#### Quick Connection Test +```python +import lakefs + +try: + # Test with High-Level SDK + repos = list(lakefs.repositories()) + print(f"✅ Connected successfully! 
Found {len(repos)} repositories.") +except Exception as e: + print(f"❌ Connection failed: {e}") +``` + +#### Comprehensive Health Check +```python +import lakefs + +def test_lakefs_connection(): + try: + # Test repository listing + repos = list(lakefs.repositories()) + print(f"✅ Repository access: {len(repos)} repositories found") + + if repos: + # Test branch listing on first repository + repo = repos[0] + branches = list(repo.branches()) + print(f"✅ Branch access: {len(branches)} branches in '{repo.id}'") + + return True + except Exception as e: + print(f"❌ Connection test failed: {e}") + return False + +# Run the test +if test_lakefs_connection(): + print("🎉 lakeFS connection is working correctly!") +``` + +### Environment-Specific Configuration + +#### Development Environment +```python +# development.py +import lakefs + +# Use local lakeFS instance with relaxed SSL +client = lakefs.Client( + host="http://localhost:8000", + username="lakefs", + password="lakefs_password", + verify_ssl=False # OK for local development +) +``` + +#### Production Environment +```python +# production.py +import os +import lakefs + +# Use environment variables with strict SSL +client = lakefs.Client( + host=os.getenv("LAKEFS_ENDPOINT"), + username=os.getenv("LAKEFS_ACCESS_KEY_ID"), + password=os.getenv("LAKEFS_SECRET_ACCESS_KEY"), + verify_ssl=True, + ssl_ca_cert=os.getenv("LAKEFS_CA_CERT_PATH") +) +``` + +## Next Steps + +- **High-Level SDK:** Start with the [quickstart guide](high-level-sdk/quickstart.md) +- **Generated SDK:** See [API reference](generated-sdk/api-reference.md) +- **lakefs-spec:** Check [filesystem operations](lakefs-spec/filesystem-api.md) +- **Boto3:** Review [configuration guide](boto3/configuration.md) + +## See Also + +**SDK Selection:** +- [Python SDK Overview](index.md) - Compare all available Python SDK options +- [SDK Decision Matrix](index.md#sdk-selection-decision-matrix) - Choose the right SDK for your use case +- [Feature Comparison](reference/api-comparison.md) - Detailed feature comparison across SDKs + +**SDK-Specific Getting Started:** +- [High-Level SDK Quickstart](high-level-sdk/quickstart.md) - Basic operations with simplified interface +- [Generated SDK Examples](generated-sdk/examples.md) - Direct API access patterns +- [lakefs-spec Filesystem API](lakefs-spec/filesystem-api.md) - File-like operations +- [Boto3 Configuration](boto3/configuration.md) - S3-compatible setup + +**Authentication and Security:** +- [Best Practices Guide](reference/best-practices.md#security) - Production security recommendations +- [Troubleshooting Authentication](reference/troubleshooting.md#authentication-issues) - Common auth problems +- [SSL/TLS Configuration](reference/best-practices.md#ssl-configuration) - Secure connections + +**Learning Resources:** +- [Data Science Workflow Tutorial](tutorials/data-science-workflow.md) - End-to-end data analysis +- [ETL Pipeline Tutorial](tutorials/etl-pipeline.md) - Building data pipelines +- [ML Experiment Tracking](tutorials/ml-experiment-tracking.md) - Model versioning workflow + +**Reference Materials:** +- [Environment Configuration Examples](reference/best-practices.md#environment-configuration) - Production setup patterns +- [Connection Testing](reference/troubleshooting.md#connection-testing) - Verify your setup +- [Performance Optimization](reference/best-practices.md#performance) - Optimize SDK performance + +**External Resources:** +- [lakeFS Documentation](https://docs.lakefs.io){:target="_blank"} - Complete lakeFS documentation +- 
[Python Package Index](https://pypi.org/search/?q=lakefs){:target="_blank"} - All lakeFS Python packages \ No newline at end of file diff --git a/docs/src/integrations/python/high-level-sdk/advanced.md b/docs/src/integrations/python/high-level-sdk/advanced.md new file mode 100644 index 00000000000..8af26d3385c --- /dev/null +++ b/docs/src/integrations/python/high-level-sdk/advanced.md @@ -0,0 +1,1284 @@ +--- +title: Advanced Features +description: Advanced patterns and optimization techniques for the High-Level Python SDK +sdk_types: ["high-level"] +difficulty: "advanced" +use_cases: ["performance", "optimization", "advanced-patterns", "production"] +topics: ["performance", "optimization", "patterns", "advanced"] +audience: ["advanced-users", "developers", "data-engineers"] +last_updated: "2024-01-15" +--- + +# Advanced Features + +Explore advanced patterns, optimization techniques, and best practices for the High-Level Python SDK. This guide covers error handling, performance optimization, logging, debugging, and production deployment strategies. + +## Error Handling and Exception Management + +### Complete Exception Hierarchy + +The High-Level SDK provides a comprehensive exception hierarchy for different error scenarios: + +```python +import lakefs +from lakefs.exceptions import ( + # Base exceptions + LakeFSException, + ServerException, + + # Authentication and authorization + NoAuthenticationFound, + NotAuthorizedException, + ForbiddenException, + PermissionException, + + # Resource errors + NotFoundException, + ObjectNotFoundException, + ConflictException, + ObjectExistsException, + + # Request errors + BadRequestException, + UnsupportedOperationException, + InvalidRangeException, + + # SDK-specific errors + ImportManagerException, + TransactionException, + + # Configuration errors + UnsupportedCredentialsProviderType, + InvalidEnvVarFormat +) +``` + +### Comprehensive Error Handling Patterns + +```python +def robust_repository_operations(repo_id, storage_namespace): + """Demonstrate comprehensive error handling for repository operations""" + + try: + # Attempt to create repository + repo = lakefs.repository(repo_id).create( + storage_namespace=storage_namespace, + exist_ok=False + ) + print(f"Repository created: {repo_id}") + return repo + + except ConflictException: + print(f"Repository {repo_id} already exists, connecting to existing") + return lakefs.repository(repo_id) + + except NotAuthorizedException: + print("Authentication failed - check credentials") + raise + + except ForbiddenException: + print("Operation forbidden - insufficient permissions") + raise + + except BadRequestException as e: + print(f"Invalid request parameters: {e}") + raise + + except ServerException as e: + print(f"Server error (HTTP {e.status_code}): {e.reason}") + if e.body: + print(f"Error details: {e.body}") + raise + + except LakeFSException as e: + print(f"lakeFS SDK error: {e}") + raise + + except Exception as e: + print(f"Unexpected error: {e}") + raise + +# Usage +try: + repo = robust_repository_operations("my-repo", "s3://my-bucket/repos/my-repo") +except Exception as e: + print(f"Failed to set up repository: {e}") +``` + +### Object-Level Error Handling + +```python +def safe_object_operations(branch, operations): + """Safely perform multiple object operations with detailed error handling""" + + results = [] + + for operation in operations: + op_type = operation["type"] + path = operation["path"] + + try: + if op_type == "upload": + obj = branch.object(path).upload( + data=operation["data"], + 
mode=operation.get("mode", "w") + ) + results.append({"path": path, "status": "uploaded", "object": obj}) + + elif op_type == "download": + obj = branch.object(path) + if not obj.exists(): + raise ObjectNotFoundException(404, "Object not found", None) + + content = obj.reader().read() + results.append({"path": path, "status": "downloaded", "content": content}) + + elif op_type == "delete": + obj = branch.object(path) + obj.delete() + results.append({"path": path, "status": "deleted"}) + + except ObjectNotFoundException: + print(f"Object not found: {path}") + results.append({"path": path, "status": "not_found"}) + + except ObjectExistsException: + print(f"Object already exists (exclusive mode): {path}") + results.append({"path": path, "status": "exists"}) + + except PermissionException: + print(f"Permission denied for object: {path}") + results.append({"path": path, "status": "permission_denied"}) + + except InvalidRangeException: + print(f"Invalid range request for object: {path}") + results.append({"path": path, "status": "invalid_range"}) + + except Exception as e: + print(f"Unexpected error with object {path}: {e}") + results.append({"path": path, "status": "error", "error": str(e)}) + + return results + +# Usage +operations = [ + {"type": "upload", "path": "data/file1.txt", "data": "content1"}, + {"type": "upload", "path": "data/file2.txt", "data": "content2", "mode": "x"}, + {"type": "download", "path": "data/file1.txt"}, + {"type": "delete", "path": "data/old-file.txt"} +] + +results = safe_object_operations(branch, operations) +for result in results: + print(f"{result['path']}: {result['status']}") +``` + +### Custom Exception Handling + +```python +class DataPipelineException(LakeFSException): + """Custom exception for data pipeline operations""" + + def __init__(self, stage, message, original_exception=None): + self.stage = stage + self.original_exception = original_exception + super().__init__(f"Pipeline failed at stage '{stage}': {message}") + +def pipeline_stage_wrapper(stage_name): + """Decorator for pipeline stages with custom error handling""" + + def decorator(func): + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except LakeFSException as e: + raise DataPipelineException(stage_name, str(e), e) + except Exception as e: + raise DataPipelineException(stage_name, f"Unexpected error: {e}", e) + return wrapper + return decorator + +@pipeline_stage_wrapper("data_extraction") +def extract_data(branch, source_path): + """Extract data with error handling""" + obj = branch.object(source_path) + return json.loads(obj.reader().read()) + +@pipeline_stage_wrapper("data_transformation") +def transform_data(data): + """Transform data with error handling""" + return [{"id": item["id"], "value": item["value"] * 2} for item in data] + +@pipeline_stage_wrapper("data_loading") +def load_data(branch, target_path, data): + """Load data with error handling""" + branch.object(target_path).upload( + data=json.dumps(data, indent=2), + content_type="application/json" + ) + +# Usage +try: + data = extract_data(branch, "raw/input.json") + transformed = transform_data(data) + load_data(branch, "processed/output.json", transformed) + print("Pipeline completed successfully") + +except DataPipelineException as e: + print(f"Pipeline failed: {e}") + print(f"Failed stage: {e.stage}") + if e.original_exception: + print(f"Original error: {e.original_exception}") +``` + +## Performance Optimization + +### Advanced Client Configuration + +```python +from lakefs.client import Client +import 
os + +def create_optimized_client(): + """Create a performance-optimized client""" + + return Client( + host=os.getenv('LAKEFS_ENDPOINT'), + username=os.getenv('LAKEFS_ACCESS_KEY_ID'), + password=os.getenv('LAKEFS_SECRET_ACCESS_KEY'), + + # Connection pooling for high throughput + pool_connections=50, + pool_maxsize=100, + pool_block=False, + + # Retry configuration + max_retries=5, + backoff_factor=0.3, + retry_on_status=[500, 502, 503, 504], + + # Timeout settings + timeout=60, + connect_timeout=10, + read_timeout=50, + + # SSL and security + verify_ssl=True, + ssl_ca_cert=os.getenv('LAKEFS_CA_CERT_PATH'), + + # Proxy configuration + proxy=os.getenv('HTTPS_PROXY'), + proxy_headers={'User-Agent': 'MyApp/1.0'} + ) + +# Create singleton client for reuse +_optimized_client = None + +def get_optimized_client(): + """Get or create optimized client singleton""" + global _optimized_client + if _optimized_client is None: + _optimized_client = create_optimized_client() + return _optimized_client +``` + +### Batch Operations and Bulk Processing + +```python +import concurrent.futures +import threading +from collections import defaultdict + +class BulkOperationManager: + """Manager for efficient bulk operations""" + + def __init__(self, branch, max_workers=10, batch_size=100): + self.branch = branch + self.max_workers = max_workers + self.batch_size = batch_size + self.results = defaultdict(list) + self.lock = threading.Lock() + + def bulk_upload(self, file_data_pairs): + """Upload multiple files efficiently using threading""" + + def upload_batch(batch): + """Upload a batch of files in a transaction""" + batch_results = [] + + try: + with self.branch.transact( + commit_message=f"Bulk upload batch ({len(batch)} files)" + ) as tx: + for path, data in batch: + obj = tx.object(path).upload(data=data) + batch_results.append({"path": path, "status": "success", "object": obj}) + + except Exception as e: + for path, _ in batch: + batch_results.append({"path": path, "status": "error", "error": str(e)}) + + with self.lock: + self.results["uploads"].extend(batch_results) + + return batch_results + + # Split into batches + batches = [ + file_data_pairs[i:i + self.batch_size] + for i in range(0, len(file_data_pairs), self.batch_size) + ] + + # Process batches concurrently + with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = [executor.submit(upload_batch, batch) for batch in batches] + + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except Exception as e: + print(f"Batch upload failed: {e}") + + return self.results["uploads"] + + def bulk_download(self, paths): + """Download multiple files efficiently""" + + def download_file(path): + """Download a single file""" + try: + obj = self.branch.object(path) + if obj.exists(): + content = obj.reader().read() + return {"path": path, "status": "success", "content": content} + else: + return {"path": path, "status": "not_found"} + except Exception as e: + return {"path": path, "status": "error", "error": str(e)} + + # Download files concurrently + with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = {executor.submit(download_file, path): path for path in paths} + + results = [] + for future in concurrent.futures.as_completed(futures): + results.append(future.result()) + + return results + +# Usage +bulk_manager = BulkOperationManager(branch, max_workers=20, batch_size=50) + +# Bulk upload +files_to_upload = [ + (f"data/file_{i}.txt", f"Content 
for file {i}") + for i in range(1000) +] + +upload_results = bulk_manager.bulk_upload(files_to_upload) +success_count = len([r for r in upload_results if r["status"] == "success"]) +print(f"Successfully uploaded {success_count} files") + +# Bulk download +paths_to_download = [f"data/file_{i}.txt" for i in range(100)] +download_results = bulk_manager.bulk_download(paths_to_download) +``` + +### Memory-Efficient Streaming + +```python +import hashlib +import time + +class StreamingProcessor: + """Efficient streaming processor for large files""" + + def __init__(self, chunk_size=64*1024): # 64KB chunks + self.chunk_size = chunk_size + + def stream_upload_with_progress(self, branch, local_path, remote_path, + progress_callback=None): + """Upload large file with progress tracking""" + + file_size = os.path.getsize(local_path) + uploaded_bytes = 0 + start_time = time.time() + + obj = branch.object(remote_path) + + with open(local_path, 'rb') as local_file: + with obj.writer(mode='wb') as remote_writer: + while True: + chunk = local_file.read(self.chunk_size) + if not chunk: + break + + remote_writer.write(chunk) + uploaded_bytes += len(chunk) + + if progress_callback: + progress = (uploaded_bytes / file_size) * 100 + elapsed = time.time() - start_time + speed = uploaded_bytes / elapsed if elapsed > 0 else 0 + + progress_callback({ + "progress": progress, + "uploaded_bytes": uploaded_bytes, + "total_bytes": file_size, + "speed_bps": speed, + "elapsed_time": elapsed + }) + + return obj + + def stream_download_with_verification(self, branch, remote_path, local_path, + verify_checksum=True): + """Download large file with checksum verification""" + + obj = branch.object(remote_path) + + # Get object stats for verification + stats = obj.stat() + expected_size = stats.size_bytes + expected_checksum = stats.checksum if verify_checksum else None + + downloaded_bytes = 0 + hasher = hashlib.sha256() if verify_checksum else None + + with obj.reader(mode='rb') as remote_reader: + with open(local_path, 'wb') as local_file: + while True: + chunk = remote_reader.read(self.chunk_size) + if not chunk: + break + + local_file.write(chunk) + downloaded_bytes += len(chunk) + + if hasher: + hasher.update(chunk) + + # Verify download + if downloaded_bytes != expected_size: + raise ValueError(f"Size mismatch: expected {expected_size}, got {downloaded_bytes}") + + if verify_checksum and expected_checksum: + actual_checksum = f"sha256:{hasher.hexdigest()}" + if actual_checksum != expected_checksum: + raise ValueError(f"Checksum mismatch: expected {expected_checksum}, got {actual_checksum}") + + return { + "local_path": local_path, + "remote_path": remote_path, + "size_bytes": downloaded_bytes, + "checksum_verified": verify_checksum + } + +# Usage with progress tracking +def progress_callback(info): + print(f"Upload progress: {info['progress']:.1f}% " + f"({info['uploaded_bytes']}/{info['total_bytes']} bytes) " + f"Speed: {info['speed_bps']/1024/1024:.1f} MB/s") + +processor = StreamingProcessor() + +# Upload large file +large_obj = processor.stream_upload_with_progress( + branch, + "large_dataset.csv", + "data/large_dataset.csv", + progress_callback=progress_callback +) + +# Download with verification +result = processor.stream_download_with_verification( + branch, + "data/large_dataset.csv", + "downloaded_dataset.csv", + verify_checksum=True +) +print(f"Download verified: {result}") +``` + +### Connection Pooling and Resource Management + +```python +import atexit +from contextlib import contextmanager + +class 
ConnectionManager: + """Manage lakeFS connections and resources efficiently""" + + def __init__(self): + self.clients = {} + self.active_connections = 0 + self.max_connections = 50 + + # Register cleanup on exit + atexit.register(self.cleanup_all) + + def get_client(self, config_name="default", **client_kwargs): + """Get or create a client with connection pooling""" + + if config_name not in self.clients: + if self.active_connections >= self.max_connections: + raise RuntimeError(f"Maximum connections ({self.max_connections}) exceeded") + + client_config = { + "pool_connections": 10, + "pool_maxsize": 20, + "max_retries": 3, + **client_kwargs + } + + self.clients[config_name] = Client(**client_config) + self.active_connections += 1 + + return self.clients[config_name] + + @contextmanager + def managed_repository(self, repo_id, config_name="default", **client_kwargs): + """Context manager for repository operations with automatic cleanup""" + + client = self.get_client(config_name, **client_kwargs) + repo = lakefs.Repository(repo_id, client=client) + + try: + yield repo + finally: + # Cleanup could be added here if needed + pass + + def cleanup_all(self): + """Cleanup all connections""" + for client in self.clients.values(): + # Perform any necessary cleanup + pass + self.clients.clear() + self.active_connections = 0 + +# Global connection manager +connection_manager = ConnectionManager() + +# Usage +with connection_manager.managed_repository("my-repo") as repo: + branch = repo.branch("main") + + # Perform operations + branch.object("data/test.txt").upload(data="test content") + + # Repository and client are automatically managed +``` + +## Advanced I/O Patterns + +### Custom Serialization and Formats + +```python +import pickle +import json +import csv +import io +import gzip +import base64 +from typing import Any, Dict, List + +class AdvancedSerializer: + """Advanced serialization for different data types and formats""" + + @staticmethod + def serialize_python_object(obj: Any, compression=True) -> bytes: + """Serialize Python object with optional compression""" + data = pickle.dumps(obj) + + if compression: + data = gzip.compress(data) + + return data + + @staticmethod + def deserialize_python_object(data: bytes, compression=True) -> Any: + """Deserialize Python object with optional decompression""" + if compression: + data = gzip.decompress(data) + + return pickle.loads(data) + + @staticmethod + def serialize_dataframe(df, format='parquet', compression=True) -> bytes: + """Serialize pandas DataFrame in various formats""" + buffer = io.BytesIO() + + if format == 'parquet': + df.to_parquet(buffer, compression='gzip' if compression else None) + elif format == 'csv': + csv_data = df.to_csv(index=False) + if compression: + csv_data = gzip.compress(csv_data.encode('utf-8')) + else: + csv_data = csv_data.encode('utf-8') + buffer.write(csv_data) + elif format == 'json': + json_data = df.to_json(orient='records', indent=2) + if compression: + json_data = gzip.compress(json_data.encode('utf-8')) + else: + json_data = json_data.encode('utf-8') + buffer.write(json_data) + + return buffer.getvalue() + + @staticmethod + def deserialize_dataframe(data: bytes, format='parquet', compression=True): + """Deserialize pandas DataFrame from various formats""" + import pandas as pd + + if compression and format in ['csv', 'json']: + data = gzip.decompress(data) + + buffer = io.BytesIO(data) + + if format == 'parquet': + return pd.read_parquet(buffer) + elif format == 'csv': + return pd.read_csv(buffer) + elif 
format == 'json': + return pd.read_json(buffer, orient='records') + +def advanced_data_storage(branch, data_items): + """Store various data types with optimal serialization""" + + serializer = AdvancedSerializer() + + for item_name, item_data in data_items.items(): + if isinstance(item_data, dict): + # Store as compressed JSON + json_data = json.dumps(item_data, indent=2) + compressed_data = gzip.compress(json_data.encode('utf-8')) + + branch.object(f"data/{item_name}.json.gz").upload( + data=compressed_data, + content_type="application/gzip", + metadata={"format": "json", "compression": "gzip"} + ) + + elif hasattr(item_data, 'to_parquet'): # pandas DataFrame + # Store as compressed Parquet + parquet_data = serializer.serialize_dataframe( + item_data, format='parquet', compression=True + ) + + branch.object(f"data/{item_name}.parquet").upload( + data=parquet_data, + content_type="application/octet-stream", + metadata={"format": "parquet", "compression": "gzip"} + ) + + else: + # Store as compressed pickle for arbitrary Python objects + pickle_data = serializer.serialize_python_object( + item_data, compression=True + ) + + branch.object(f"data/{item_name}.pkl.gz").upload( + data=pickle_data, + content_type="application/octet-stream", + metadata={"format": "pickle", "compression": "gzip"} + ) + +# Usage +import pandas as pd + +data_items = { + "config": {"version": "1.0", "settings": {"debug": True}}, + "users": pd.DataFrame({"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}), + "model": {"weights": [0.1, 0.2, 0.3], "bias": 0.05} +} + +advanced_data_storage(branch, data_items) +``` + +### Pre-signed URL Management + +```python +import time +from urllib.parse import urlparse +import requests + +class PreSignedURLManager: + """Manage pre-signed URLs for direct storage access""" + + def __init__(self, branch): + self.branch = branch + self.url_cache = {} + self.cache_duration = 3600 # 1 hour + + def get_presigned_upload_url(self, path, content_type=None, cache=True): + """Get pre-signed URL for direct upload""" + + cache_key = f"upload:{path}:{content_type}" + + if cache and cache_key in self.url_cache: + cached_url, timestamp = self.url_cache[cache_key] + if time.time() - timestamp < self.cache_duration: + return cached_url + + # Get object stats to get pre-signed URL + obj = self.branch.object(path) + + # For uploads, we need to use the writer with pre-sign + with obj.writer(mode='wb', pre_sign=True, content_type=content_type) as writer: + # The writer provides access to pre-signed URL + presigned_url = writer._pre_signed_url if hasattr(writer, '_pre_signed_url') else None + + if cache and presigned_url: + self.url_cache[cache_key] = (presigned_url, time.time()) + + return presigned_url + + def get_presigned_download_url(self, path, cache=True): + """Get pre-signed URL for direct download""" + + cache_key = f"download:{path}" + + if cache and cache_key in self.url_cache: + cached_url, timestamp = self.url_cache[cache_key] + if time.time() - timestamp < self.cache_duration: + return cached_url + + # Get object stats with pre-sign enabled + obj = self.branch.object(path) + stats = obj.stat(pre_sign=True) + presigned_url = stats.physical_address + + if cache and presigned_url: + self.url_cache[cache_key] = (presigned_url, time.time()) + + return presigned_url + + def direct_upload_via_presigned(self, path, data, content_type=None): + """Upload data directly using pre-signed URL""" + + presigned_url = self.get_presigned_upload_url(path, content_type) + + if not presigned_url: + # Fallback to 
regular upload + return self.branch.object(path).upload( + data=data, + content_type=content_type + ) + + # Direct upload to storage + headers = {} + if content_type: + headers['Content-Type'] = content_type + + response = requests.put(presigned_url, data=data, headers=headers) + response.raise_for_status() + + return self.branch.object(path) + + def direct_download_via_presigned(self, path): + """Download data directly using pre-signed URL""" + + presigned_url = self.get_presigned_download_url(path) + + if not presigned_url: + # Fallback to regular download + return self.branch.object(path).reader().read() + + # Direct download from storage + response = requests.get(presigned_url) + response.raise_for_status() + + return response.content + +# Usage +url_manager = PreSignedURLManager(branch) + +# Direct upload +data = b"Large binary data that benefits from direct upload" +obj = url_manager.direct_upload_via_presigned( + "data/large-file.bin", + data, + content_type="application/octet-stream" +) + +# Direct download +downloaded_data = url_manager.direct_download_via_presigned("data/large-file.bin") +``` + +## Logging and Debugging + +### Comprehensive Logging Setup + +```python +import logging +import sys +from datetime import datetime + +class LakeFSLogger: + """Comprehensive logging setup for lakeFS operations""" + + def __init__(self, name="lakefs_app", level=logging.INFO): + self.logger = logging.getLogger(name) + self.logger.setLevel(level) + + # Prevent duplicate handlers + if not self.logger.handlers: + self._setup_handlers() + + def _setup_handlers(self): + """Set up logging handlers""" + + # Console handler + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(logging.INFO) + + # File handler + file_handler = logging.FileHandler( + f"lakefs_{datetime.now().strftime('%Y%m%d')}.log" + ) + file_handler.setLevel(logging.DEBUG) + + # Formatters + console_format = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + file_format = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s' + ) + + console_handler.setFormatter(console_format) + file_handler.setFormatter(file_format) + + self.logger.addHandler(console_handler) + self.logger.addHandler(file_handler) + + # Enable lakeFS SDK logging + lakefs_logger = logging.getLogger('lakefs') + lakefs_logger.setLevel(logging.DEBUG) + lakefs_logger.addHandler(file_handler) + + def log_operation(self, operation_name, func, *args, **kwargs): + """Log operation execution with timing""" + + start_time = time.time() + self.logger.info(f"Starting operation: {operation_name}") + + try: + result = func(*args, **kwargs) + duration = time.time() - start_time + self.logger.info(f"Completed operation: {operation_name} ({duration:.2f}s)") + return result + + except Exception as e: + duration = time.time() - start_time + self.logger.error(f"Failed operation: {operation_name} ({duration:.2f}s) - {e}") + raise + + def log_transaction(self, tx_func): + """Decorator for logging transaction operations""" + + def wrapper(branch, commit_message, *args, **kwargs): + self.logger.info(f"Starting transaction: {commit_message}") + + try: + with branch.transact(commit_message=commit_message) as tx: + result = tx_func(tx, *args, **kwargs) + self.logger.info(f"Transaction completed: {commit_message}") + return result + + except Exception as e: + self.logger.error(f"Transaction failed: {commit_message} - {e}") + raise + + return wrapper + +# Usage +logger = 
LakeFSLogger("my_app", level=logging.DEBUG) + +# Log regular operations +def upload_data(branch, path, data): + return branch.object(path).upload(data=data) + +result = logger.log_operation( + "upload_user_data", + upload_data, + branch, "data/users.json", '{"users": []}' +) + +# Log transactions +@logger.log_transaction +def process_data_transaction(tx, input_path, output_path): + # Read input + input_obj = tx.object(input_path) + data = json.loads(input_obj.reader().read()) + + # Process data + processed = [{"id": item["id"], "processed": True} for item in data] + + # Write output + tx.object(output_path).upload( + data=json.dumps(processed, indent=2), + content_type="application/json" + ) + + return len(processed) + +# Execute logged transaction +processed_count = process_data_transaction( + branch, + "Transaction: Process user data", + "raw/users.json", + "processed/users.json" +) +``` + +### Performance Monitoring and Profiling + +```python +import time +import psutil +import threading +from collections import defaultdict, deque + +class PerformanceMonitor: + """Monitor performance metrics for lakeFS operations""" + + def __init__(self, max_history=1000): + self.metrics = defaultdict(deque) + self.max_history = max_history + self.active_operations = {} + self.lock = threading.Lock() + + def start_operation(self, operation_id, operation_type): + """Start monitoring an operation""" + with self.lock: + self.active_operations[operation_id] = { + "type": operation_type, + "start_time": time.time(), + "start_memory": psutil.Process().memory_info().rss, + "start_cpu": psutil.Process().cpu_percent() + } + + def end_operation(self, operation_id, success=True, error=None): + """End monitoring an operation""" + with self.lock: + if operation_id not in self.active_operations: + return + + start_info = self.active_operations.pop(operation_id) + end_time = time.time() + end_memory = psutil.Process().memory_info().rss + + duration = end_time - start_info["start_time"] + memory_delta = end_memory - start_info["start_memory"] + + metric = { + "operation_id": operation_id, + "type": start_info["type"], + "duration": duration, + "memory_delta": memory_delta, + "success": success, + "error": str(error) if error else None, + "timestamp": end_time + } + + # Store metric + self.metrics[start_info["type"]].append(metric) + + # Limit history + if len(self.metrics[start_info["type"]]) > self.max_history: + self.metrics[start_info["type"]].popleft() + + def get_stats(self, operation_type=None): + """Get performance statistics""" + with self.lock: + if operation_type: + operations = [operation_type] if operation_type in self.metrics else [] + else: + operations = list(self.metrics.keys()) + + stats = {} + + for op_type in operations: + metrics = list(self.metrics[op_type]) + if not metrics: + continue + + successful = [m for m in metrics if m["success"]] + failed = [m for m in metrics if not m["success"]] + + durations = [m["duration"] for m in successful] + memory_deltas = [m["memory_delta"] for m in successful] + + stats[op_type] = { + "total_operations": len(metrics), + "successful": len(successful), + "failed": len(failed), + "success_rate": len(successful) / len(metrics) * 100 if metrics else 0, + "avg_duration": sum(durations) / len(durations) if durations else 0, + "max_duration": max(durations) if durations else 0, + "min_duration": min(durations) if durations else 0, + "avg_memory_delta": sum(memory_deltas) / len(memory_deltas) if memory_deltas else 0, + "recent_errors": [m["error"] for m in failed[-5:] 
if m["error"]] + } + + return stats + + def monitored_operation(self, operation_type): + """Decorator for monitoring operations""" + + def decorator(func): + def wrapper(*args, **kwargs): + operation_id = f"{operation_type}_{int(time.time() * 1000)}" + self.start_operation(operation_id, operation_type) + + try: + result = func(*args, **kwargs) + self.end_operation(operation_id, success=True) + return result + except Exception as e: + self.end_operation(operation_id, success=False, error=e) + raise + + return wrapper + return decorator + +# Global performance monitor +perf_monitor = PerformanceMonitor() + +# Usage with decorator +@perf_monitor.monitored_operation("object_upload") +def monitored_upload(branch, path, data): + return branch.object(path).upload(data=data) + +@perf_monitor.monitored_operation("object_download") +def monitored_download(branch, path): + return branch.object(path).reader().read() + +@perf_monitor.monitored_operation("transaction") +def monitored_transaction(branch, commit_message, operations): + with branch.transact(commit_message=commit_message) as tx: + results = [] + for op in operations: + if op["type"] == "upload": + result = tx.object(op["path"]).upload(data=op["data"]) + results.append(result) + return results + +# Perform monitored operations +for i in range(10): + monitored_upload(branch, f"test/file_{i}.txt", f"Content {i}") + content = monitored_download(branch, f"test/file_{i}.txt") + +# Get performance statistics +stats = perf_monitor.get_stats() +for op_type, metrics in stats.items(): + print(f"\n{op_type.upper()} Statistics:") + print(f" Total operations: {metrics['total_operations']}") + print(f" Success rate: {metrics['success_rate']:.1f}%") + print(f" Average duration: {metrics['avg_duration']:.3f}s") + print(f" Max duration: {metrics['max_duration']:.3f}s") + print(f" Average memory delta: {metrics['avg_memory_delta']/1024/1024:.1f} MB") +``` + +## Production Deployment Strategies + +### Configuration Management + +```python +import os +import yaml +from dataclasses import dataclass +from typing import Optional, Dict, Any + +@dataclass +class LakeFSConfig: + """Configuration class for lakeFS deployment""" + + # Connection settings + endpoint: str + access_key_id: str + secret_access_key: str + + # SSL/TLS settings + verify_ssl: bool = True + ssl_ca_cert: Optional[str] = None + + # Connection pooling + pool_connections: int = 20 + pool_maxsize: int = 50 + pool_block: bool = False + + # Retry settings + max_retries: int = 3 + backoff_factor: float = 0.3 + retry_on_status: list = None + + # Timeout settings + timeout: int = 60 + connect_timeout: int = 10 + read_timeout: int = 50 + + # Proxy settings + proxy: Optional[str] = None + proxy_headers: Optional[Dict[str, str]] = None + + # Application settings + default_branch: str = "main" + batch_size: int = 100 + max_workers: int = 10 + + def __post_init__(self): + if self.retry_on_status is None: + self.retry_on_status = [500, 502, 503, 504] + +class ConfigManager: + """Manage lakeFS configuration from multiple sources""" + + @staticmethod + def from_environment() -> LakeFSConfig: + """Load configuration from environment variables""" + + return LakeFSConfig( + endpoint=os.getenv('LAKEFS_ENDPOINT', 'http://localhost:8000'), + access_key_id=os.getenv('LAKEFS_ACCESS_KEY_ID'), + secret_access_key=os.getenv('LAKEFS_SECRET_ACCESS_KEY'), + + verify_ssl=os.getenv('LAKEFS_VERIFY_SSL', 'true').lower() == 'true', + ssl_ca_cert=os.getenv('LAKEFS_CA_CERT_PATH'), + + 
pool_connections=int(os.getenv('LAKEFS_POOL_CONNECTIONS', '20')), + pool_maxsize=int(os.getenv('LAKEFS_POOL_MAXSIZE', '50')), + + max_retries=int(os.getenv('LAKEFS_MAX_RETRIES', '3')), + timeout=int(os.getenv('LAKEFS_TIMEOUT', '60')), + + proxy=os.getenv('HTTPS_PROXY'), + + default_branch=os.getenv('LAKEFS_DEFAULT_BRANCH', 'main'), + batch_size=int(os.getenv('LAKEFS_BATCH_SIZE', '100')), + max_workers=int(os.getenv('LAKEFS_MAX_WORKERS', '10')) + ) + + @staticmethod + def from_file(config_path: str) -> LakeFSConfig: + """Load configuration from YAML file""" + + with open(config_path, 'r') as f: + config_data = yaml.safe_load(f) + + return LakeFSConfig(**config_data.get('lakefs', {})) + + @staticmethod + def from_dict(config_dict: Dict[str, Any]) -> LakeFSConfig: + """Load configuration from dictionary""" + + return LakeFSConfig(**config_dict) + +class ProductionLakeFSClient: + """Production-ready lakeFS client with comprehensive configuration""" + + def __init__(self, config: LakeFSConfig): + self.config = config + self._client = None + self._health_check_interval = 300 # 5 minutes + self._last_health_check = 0 + + @property + def client(self) -> Client: + """Get or create lakeFS client""" + + if self._client is None: + self._client = Client( + host=self.config.endpoint, + username=self.config.access_key_id, + password=self.config.secret_access_key, + + verify_ssl=self.config.verify_ssl, + ssl_ca_cert=self.config.ssl_ca_cert, + + pool_connections=self.config.pool_connections, + pool_maxsize=self.config.pool_maxsize, + pool_block=self.config.pool_block, + + max_retries=self.config.max_retries, + backoff_factor=self.config.backoff_factor, + retry_on_status=self.config.retry_on_status, + + timeout=self.config.timeout, + connect_timeout=self.config.connect_timeout, + read_timeout=self.config.read_timeout, + + proxy=self.config.proxy, + proxy_headers=self.config.proxy_headers + ) + + return self._client + + def health_check(self, force=False) -> bool: + """Perform health check with caching""" + + current_time = time.time() + + if not force and (current_time - self._last_health_check) < self._health_check_interval: + return True # Assume healthy if recently checked + + try: + # Simple operation to test connectivity + list(lakefs.repositories(client=self.client, max_amount=1)) + self._last_health_check = current_time + return True + + except Exception as e: + logger.error(f"Health check failed: {e}") + return False + + def get_repository(self, repo_id: str): + """Get repository with health check""" + + if not self.health_check(): + raise RuntimeError("lakeFS health check failed") + + return lakefs.Repository(repo_id, client=self.client) + +# Usage +# Load configuration +config = ConfigManager.from_environment() + +# Create production client +prod_client = ProductionLakeFSClient(config) + +# Use with health checking +try: + repo = prod_client.get_repository("my-repo") + branch = repo.branch(config.default_branch) + + # Perform operations + branch.object("data/production.txt").upload(data="Production data") + +except Exception as e: + logger.error(f"Production operation failed: {e}") +``` + +## Key Points + +- **Comprehensive error handling**: Use specific exception types for robust error recovery +- **Performance optimization**: Implement connection pooling, batch operations, and streaming +- **Resource management**: Use context managers and connection pooling for efficient resource usage +- **Monitoring and logging**: Implement comprehensive logging and performance monitoring +- **Production 
readiness**: Use proper configuration management and health checking +- **Advanced I/O**: Leverage pre-signed URLs and custom serialization for optimal performance +- **Debugging support**: Enable detailed logging and performance profiling for troubleshooting + +## See Also + +- **[Repository Management](repositories.md)** - Creating and managing repositories +- **[Branch Operations](branches-and-commits.md)** - Version control operations +- **[Object Operations](objects-and-io.md)** - Individual object management +- **[Transactions](transactions.md)** - Atomic multi-operation workflows +- **[Import Operations](imports-and-exports.md)** - Bulk data operations +- **[Best Practices](../reference/best-practices.md)** - Production deployment guidance +- **[Troubleshooting](../reference/troubleshooting.md)** - Common issues and solutions \ No newline at end of file diff --git a/docs/src/integrations/python/high-level-sdk/branches-and-commits.md b/docs/src/integrations/python/high-level-sdk/branches-and-commits.md new file mode 100644 index 00000000000..3ebdf1a1c85 --- /dev/null +++ b/docs/src/integrations/python/high-level-sdk/branches-and-commits.md @@ -0,0 +1,743 @@ +--- +title: Branches and Commits +description: Version control operations with branches and commits +sdk_types: ["high-level"] +difficulty: "intermediate" +use_cases: ["version-control", "branching", "merging", "commits"] +topics: ["branches", "commits", "merging", "version-control"] +audience: ["data-engineers", "developers", "python-developers"] +last_updated: "2024-01-15" +--- + +# Branches and Commits + +Master version control operations using branches and commits in the High-Level Python SDK. Branches provide isolated development environments while commits create immutable snapshots of your data. + +## Branch Concepts + +### Branch vs Reference +- **Branch**: A mutable pointer to a commit that can be updated with new commits +- **Reference**: A read-only pointer to any commit, branch, or tag +- **Head**: The latest commit on a branch + +### Branch Lifecycle +1. **Create** - Branch from existing reference +2. **Modify** - Add, update, or delete objects +3. **Commit** - Create immutable snapshot +4. **Merge** - Integrate changes into target branch +5. **Delete** - Remove branch when no longer needed + +## Creating Branches + +### Basic Branch Creation + +```python +import lakefs + +repo = lakefs.repository("my-repo") + +# Create branch from main +feature_branch = repo.branch("feature-branch").create(source_reference="main") + +print(f"Created branch: {feature_branch.id}") +print(f"Source commit: {feature_branch.get_commit().id}") +``` + +**Expected Output:** +``` +Created branch: feature-branch +Source commit: c7a632d74f46c... +``` + +### Branch from Specific References + +```python +# Create branch from specific commit +hotfix_branch = repo.branch("hotfix-v1.2").create( + source_reference="a1b2c3d4e5f6..." 
+) + +# Create branch from tag +release_branch = repo.branch("release-prep").create( + source_reference="v1.1.0" +) + +# Create branch from ref expression +debug_branch = repo.branch("debug-issue").create( + source_reference="main~5" # 5 commits before main +) +``` + +### Safe Branch Creation + +```python +from lakefs.exceptions import ConflictException + +def create_branch_safely(repo, branch_name, source_ref): + try: + # Try to create new branch + branch = repo.branch(branch_name).create( + source_reference=source_ref, + exist_ok=False + ) + print(f"Created new branch: {branch_name}") + return branch + + except ConflictException: + print(f"Branch {branch_name} already exists, using existing branch") + return repo.branch(branch_name) + +# Usage +branch = create_branch_safely(repo, "feature-auth", "main") +``` + +### Branch Creation with exist_ok + +```python +# Create branch or use existing one +branch = repo.branch("development").create( + source_reference="main", + exist_ok=True # Don't fail if branch exists +) + +print(f"Branch ready: {branch.id}") +``` + +## Branch Information and Navigation + +### Accessing Branch Properties + +```python +branch = repo.branch("main") + +# Get current commit +commit = branch.get_commit() +print(f"Branch: {branch.id}") +print(f"Repository: {branch.repo_id}") +print(f"Current commit: {commit.id}") +print(f"Commit message: {commit.message}") +print(f"Committer: {commit.committer}") +print(f"Commit date: {commit.creation_date}") + +# Access commit metadata +if commit.metadata: + print("Commit metadata:") + for key, value in commit.metadata.items(): + print(f" {key}: {value}") +``` + +**Expected Output:** +``` +Branch: main +Repository: my-repo +Current commit: c7a632d74f46c... +Commit message: Initial commit +Committer: admin +Commit date: 1640995200 +Commit metadata: + version: 1.0.0 + environment: production +``` + +### Branch Head Reference + +```python +# Get head reference (always latest commit) +head_ref = branch.head +print(f"Head commit: {head_ref.id}") + +# Head is automatically updated after commits +branch.object("new-file.txt").upload(data="content") +branch.commit("Add new file") + +new_head = branch.head +print(f"New head commit: {new_head.id}") +print(f"Head changed: {head_ref.id != new_head.id}") +``` + +### Listing Branches + +```python +# List all branches in repository +print("All branches:") +for branch in repo.branches(): + commit = branch.get_commit() + print(f" {branch.id} -> {commit.id[:8]} ({commit.message})") + +# List branches with filtering +print("\nFeature branches:") +for branch in repo.branches(prefix="feature-"): + print(f" {branch.id}") + +# List recent branches with pagination +print("\nRecent branches:") +for branch in repo.branches(max_amount=5): + print(f" {branch.id}") +``` + +**Expected Output:** +``` +All branches: + main -> c7a632d7 (Initial commit) + feature-auth -> a1b2c3d4 (Add authentication) + hotfix-v1.2 -> f6e5d4c3 (Fix critical bug) + +Feature branches: + feature-auth + feature-dashboard + +Recent branches: + main + feature-auth + hotfix-v1.2 + development + staging +``` + +## Commit Operations + +### Creating Commits + +```python +branch = repo.branch("feature-branch") + +# Upload some data first +branch.object("data/users.csv").upload( + data="name,email\nAlice,alice@example.com\nBob,bob@example.com" +) +branch.object("config/settings.json").upload( + data='{"version": "2.0", "debug": false}' +) + +# Simple commit +commit = branch.commit(message="Add user data and configuration") +print(f"Commit ID: 
{commit.id}") +print(f"Message: {commit.message}") +``` + +### Commits with Metadata + +```python +# Commit with rich metadata +commit = branch.commit( + message="Update data pipeline for Q4 processing", + metadata={ + "version": "1.2.0", + "author": "data-team", + "ticket": "PROJ-123", + "environment": "staging", + "reviewed_by": "senior-engineer" + } +) + +print(f"Commit: {commit.id}") +print("Metadata:") +for key, value in commit.metadata.items(): + print(f" {key}: {value}") +``` + +**Expected Output:** +``` +Commit: a1b2c3d4e5f6... +Metadata: + version: 1.2.0 + author: data-team + ticket: PROJ-123 + environment: staging + reviewed_by: senior-engineer +``` + +### Viewing Uncommitted Changes + +```python +# Make some changes +branch.object("data/new-file.txt").upload(data="New content") +branch.object("data/existing-file.txt").upload(data="Updated content") + +# Check uncommitted changes +changes = list(branch.uncommitted()) +print(f"Uncommitted changes: {len(changes)}") + +for change in changes: + print(f" {change.type}: {change.path}") + if change.size_bytes: + print(f" Size: {change.size_bytes} bytes") + print(f" Type: {change.path_type}") +``` + +**Expected Output:** +``` +Uncommitted changes: 2 + added: data/new-file.txt + Size: 11 bytes + Type: object + changed: data/existing-file.txt + Size: 15 bytes + Type: object +``` + +### Commit History and Logs + +```python +# Get commit history +print("Recent commits:") +for commit in branch.log(max_amount=5): + print(f" {commit.id[:8]} - {commit.message}") + print(f" By: {commit.committer}") + print(f" Date: {commit.creation_date}") + if commit.parents: + print(f" Parents: {[p[:8] for p in commit.parents]}") + print() +``` + +**Expected Output:** +``` +Recent commits: + a1b2c3d4 - Update data pipeline for Q4 processing + By: data-team + Date: 1640995200 + Parents: ['c7a632d7'] + + c7a632d7 - Add user data and configuration + By: admin + Date: 1640908800 + Parents: ['f6e5d4c3'] +``` + +## Branch Comparison and Diffing + +### Comparing Branches + +```python +main_branch = repo.branch("main") +feature_branch = repo.branch("feature-branch") + +# Compare branches +print("Changes in feature branch vs main:") +for change in main_branch.diff(other_ref=feature_branch): + print(f" {change.type}: {change.path}") + if change.size_bytes: + print(f" Size: {change.size_bytes} bytes") + +# Compare with specific commit +print("\nChanges since last release:") +for change in main_branch.diff(other_ref="v1.0.0"): + print(f" {change.type}: {change.path}") +``` + +**Expected Output:** +``` +Changes in feature branch vs main: + added: data/users.csv + Size: 58 bytes + added: config/settings.json + Size: 35 bytes + changed: README.md + Size: 1024 bytes + +Changes since last release: + added: features/auth.py + Size: 2048 bytes + changed: config/app.yaml + Size: 512 bytes +``` + +### Advanced Diff Operations + +```python +# Diff with filtering +print("Changes to data files:") +for change in main_branch.diff(other_ref=feature_branch, prefix="data/"): + print(f" {change.type}: {change.path}") + +# Diff with common prefixes +print("\nChanges by directory:") +for change in main_branch.diff(other_ref=feature_branch, delimiter="/"): + if hasattr(change, 'path'): + print(f" {change.type}: {change.path}") + else: + print(f" directory: {change}") +``` + +## Merging Operations + +### Basic Merging + +```python +feature_branch = repo.branch("feature-auth") +main_branch = repo.branch("main") + +# Merge feature branch into main +merge_commit_id = 
feature_branch.merge_into(main_branch) +print(f"Merge commit: {merge_commit_id}") + +# Get the merge commit details +merge_commit = main_branch.get_commit() +print(f"Merge message: {merge_commit.message}") +print(f"Parents: {merge_commit.parents}") +``` + +**Expected Output:** +``` +Merge commit: b2c3d4e5f6a7... +Merge message: Merge 'feature-auth' into 'main' +Parents: ['a1b2c3d4e5f6', 'c7a632d74f46'] +``` + +### Merge with Custom Message + +```python +# Merge with custom commit message and metadata +merge_commit_id = feature_branch.merge_into( + main_branch, + message="Integrate authentication feature", + metadata={ + "feature": "authentication", + "reviewer": "senior-dev", + "tests_passed": "true" + } +) + +print(f"Custom merge commit: {merge_commit_id}") +``` + +### Handling Merge Conflicts + +```python +from lakefs.exceptions import ConflictException + +def safe_merge(source_branch, target_branch, message=None): + try: + merge_commit = source_branch.merge_into( + target_branch, + message=message or f"Merge {source_branch.id} into {target_branch.id}" + ) + print(f"Merge successful: {merge_commit}") + return merge_commit + + except ConflictException as e: + print(f"Merge conflict detected: {e}") + print("Manual conflict resolution required") + return None + +# Usage +result = safe_merge(feature_branch, main_branch) +``` + +## Advanced Branch Operations + +### Cherry-picking Commits + +```python +# Cherry-pick a specific commit to current branch +source_commit = "a1b2c3d4e5f6..." +cherry_picked_commit = branch.cherry_pick(reference=source_commit) + +print(f"Cherry-picked commit: {cherry_picked_commit.id}") +print(f"Original message: {cherry_picked_commit.message}") + +# Cherry-pick from another branch +feature_branch = repo.branch("feature-experimental") +latest_commit = feature_branch.get_commit() +cherry_picked = branch.cherry_pick(reference=latest_commit) +``` + +### Reverting Changes + +```python +# Revert a specific commit +commit_to_revert = "a1b2c3d4e5f6..." +revert_commit = branch.revert(reference=commit_to_revert) + +print(f"Revert commit: {revert_commit.id}") +print(f"Revert message: {revert_commit.message}") + +# Revert merge commit (specify parent) +merge_commit = "b2c3d4e5f6a7..." 
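+# parent_number (starting from 1) selects which parent of the merge commit the
+# revert is computed relative to; parent 1 is the branch that was merged into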
+revert_commit = branch.revert(
+    reference=merge_commit,
+    parent_number=1  # Revert relative to first parent
+)
+```
+
+### Resetting Changes
+
+```python
+# Reset all uncommitted changes
+branch.reset_changes(path_type="reset")
+print("All changes reset")
+
+# Reset specific object
+branch.reset_changes(
+    path_type="object",
+    path="data/file-to-reset.txt"
+)
+print("Specific file reset")
+
+# Reset common prefix (directory)
+branch.reset_changes(
+    path_type="common_prefix",
+    path="data/temp/"
+)
+print("Directory reset")
+```
+
+### Deleting Branches
+
+```python
+from lakefs.exceptions import ForbiddenException
+
+def delete_branch_safely(repo, branch_name):
+    try:
+        branch = repo.branch(branch_name)
+        branch.delete()
+        print(f"Branch {branch_name} deleted successfully")
+        return True
+
+    except ForbiddenException:
+        print(f"Branch {branch_name} is protected and cannot be deleted")
+        return False
+    except Exception as e:
+        print(f"Error deleting branch {branch_name}: {e}")
+        return False
+
+# Usage
+delete_branch_safely(repo, "feature-completed")
+```
+
+## Branch Protection and Versioning
+
+### Understanding Branch Protection
+
+```python
+# Attempt operations on protected branches
+def check_branch_protection(branch):
+    try:
+        # Try to commit to potentially protected branch
+        branch.object("test-file.txt").upload(data="test")
+        commit = branch.commit("Test commit")
+        print(f"Commit successful: {commit.id}")
+
+        # Clean up by reverting the test commit (reset_changes only affects
+        # uncommitted changes, so it cannot undo a commit that already landed)
+        branch.revert(reference=commit.id)
+
+    except ForbiddenException:
+        print(f"Branch {branch.id} is protected")
+    except Exception as e:
+        print(f"Error: {e}")
+
+# Check main branch protection
+main_branch = repo.branch("main")
+check_branch_protection(main_branch)
+```
+
+### Versioning Concepts
+
+```python
+# Demonstrate versioning with ref expressions
+branch = repo.branch("main")
+
+# Current head
+current = branch.get_commit()
+print(f"Current: {current.id} - {current.message}")
+
+# Previous commits using ref expressions
+previous_commits = [
+    repo.ref("main~1").get_commit(),  # 1 commit back
+    repo.ref("main~2").get_commit(),  # 2 commits back
+    repo.ref("main~3").get_commit(),  # 3 commits back
+]
+
+print("\nCommit history:")
+for i, commit in enumerate(previous_commits):
+    print(f"  main~{i+1}: {commit.id[:8]} - {commit.message}")
+```
+
+## Batch Operations and Performance
+
+### Efficient Branch Operations
+
+```python
+def create_multiple_branches(repo, branch_configs):
+    """Create multiple branches efficiently"""
+    branches = []
+
+    for config in branch_configs:
+        try:
+            branch = repo.branch(config['name']).create(
+                source_reference=config['source'],
+                exist_ok=True
+            )
+            branches.append(branch)
+            print(f"Created/accessed branch: {config['name']}")
+
+        except Exception as e:
+            print(f"Failed to create branch {config['name']}: {e}")
+
+    return branches
+
+# Usage
+branch_configs = [
+    {'name': 'feature-api', 'source': 'main'},
+    {'name': 'feature-ui', 'source': 'main'},
+    {'name': 'hotfix-critical', 'source': 'v1.0.0'},
+]
+
+branches = create_multiple_branches(repo, branch_configs)
+```
+
+### Bulk Commit Operations
+
+```python
+def bulk_commit_changes(branch, files_data, commit_message):
+    """Upload multiple files and commit in one operation"""
+
+    # Upload all files
+    for file_path, content in files_data.items():
+        branch.object(file_path).upload(data=content)
+
+    # Single commit for all changes
+    commit = branch.commit(
+        message=commit_message,
+        metadata={"files_count": str(len(files_data))}
+    )
+
+    print(f"Committed {len(files_data)} files: 
{commit.id}") + return commit + +# Usage +files = { + "data/users.csv": "name,email\nAlice,alice@example.com", + "data/products.csv": "id,name,price\n1,Widget,10.99", + "config/settings.json": '{"version": "1.0"}' +} + +commit = bulk_commit_changes( + branch, + files, + "Add initial data files and configuration" +) +``` + +## Error Handling and Best Practices + +### Comprehensive Error Handling + +```python +from lakefs.exceptions import ( + NotFoundException, + ConflictException, + ForbiddenException, + NotAuthorizedException +) + +def robust_branch_operations(repo, branch_name, source_ref): + try: + # Create branch + branch = repo.branch(branch_name).create( + source_reference=source_ref, + exist_ok=False + ) + print(f"Created branch: {branch_name}") + + # Make changes + branch.object("data/test.txt").upload(data="test content") + + # Commit changes + commit = branch.commit("Add test data") + print(f"Committed: {commit.id}") + + return branch + + except ConflictException: + print(f"Branch {branch_name} already exists") + return repo.branch(branch_name) + + except NotFoundException: + print(f"Source reference {source_ref} not found") + return None + + except ForbiddenException: + print(f"Operation forbidden - check branch protection rules") + return None + + except NotAuthorizedException: + print("Not authorized to perform this operation") + return None + + except Exception as e: + print(f"Unexpected error: {e}") + return None + +# Usage +branch = robust_branch_operations(repo, "feature-test", "main") +``` + +### Best Practices + +```python +def branch_workflow_best_practices(): + """Demonstrate best practices for branch workflows""" + + repo = lakefs.repository("my-repo") + + # 1. Use descriptive branch names + branch_name = "feature/user-authentication-v2" + + # 2. Always specify source reference explicitly + branch = repo.branch(branch_name).create( + source_reference="main", + exist_ok=True + ) + + # 3. Check for uncommitted changes before major operations + uncommitted = list(branch.uncommitted()) + if uncommitted: + print(f"Warning: {len(uncommitted)} uncommitted changes") + + # 4. Use meaningful commit messages and metadata + if uncommitted: + commit = branch.commit( + message="Implement user authentication with JWT tokens", + metadata={ + "feature": "authentication", + "version": "2.0", + "tests": "passed", + "reviewer": "security-team" + } + ) + print(f"Committed with metadata: {commit.id}") + + # 5. 
Clean up feature branches after merging + main_branch = repo.branch("main") + try: + merge_commit = branch.merge_into(main_branch) + print(f"Merged successfully: {merge_commit}") + + # Delete feature branch after successful merge + branch.delete() + print(f"Cleaned up branch: {branch_name}") + + except Exception as e: + print(f"Merge failed: {e}") + +# Run best practices example +branch_workflow_best_practices() +``` + +## Key Points + +- **Lazy evaluation**: Branch objects don't connect to server until you access properties or call methods +- **Immutable commits**: Once created, commits cannot be modified +- **Branch protection**: Some branches may be protected from direct commits or deletion +- **Ref expressions**: Use Git-style expressions like `main~1` for relative references +- **Atomic operations**: Commits are atomic - either all changes are committed or none +- **Metadata support**: Both commits and merges support custom metadata + +## See Also + +- **[Repository Management](repositories.md)** - Creating and managing repositories +- **[Object Operations](objects-and-io.md)** - Working with files and data +- **[Transactions](transactions.md)** - Atomic multi-operation workflows +- **[Import Operations](imports-and-exports.md)** - Bulk data operations +- **[Best Practices](../reference/best-practices.md)** - Production deployment guidance \ No newline at end of file diff --git a/docs/src/integrations/python/high-level-sdk/imports-and-exports.md b/docs/src/integrations/python/high-level-sdk/imports-and-exports.md new file mode 100644 index 00000000000..96c28fa1afb --- /dev/null +++ b/docs/src/integrations/python/high-level-sdk/imports-and-exports.md @@ -0,0 +1,746 @@ +--- +title: Data Imports and Exports +description: Import and export data using the High-Level Python SDK +sdk_types: ["high-level"] +difficulty: "intermediate" +use_cases: ["data-import", "bulk-operations", "data-migration", "etl"] +topics: ["import", "export", "bulk-data", "migration"] +audience: ["data-engineers", "developers", "python-developers"] +last_updated: "2024-01-15" +--- + +# Data Imports and Exports + +Learn how to efficiently import existing data into lakeFS and export data from lakeFS repositories. The ImportManager provides powerful capabilities for bulk data operations with progress monitoring and error handling. + +## Import Concepts + +### ImportManager Overview +The ImportManager provides a fluent interface for importing data from external storage systems into lakeFS branches. It supports: +- **Multiple source types**: Individual objects and prefixes (directories) +- **Batch operations**: Import from multiple sources in a single operation +- **Asynchronous execution**: Non-blocking imports with progress monitoring +- **Atomic commits**: All imported data is committed together + +### Import Sources +- **Object imports**: Import specific files by their full URI +- **Prefix imports**: Import entire directories or prefixes +- **Mixed imports**: Combine objects and prefixes in a single operation + +### Import Process +1. Create ImportManager with commit message and metadata +2. Add sources (objects and/or prefixes) +3. Start import (synchronous or asynchronous) +4. 
Monitor progress and handle completion + +## Basic Import Operations + +### Simple Object Import + +```python +import lakefs + +branch = lakefs.repository("my-repo").branch("main") + +# Import a single object +importer = branch.import_data(commit_message="Import configuration file") +importer.object( + object_store_uri="s3://source-bucket/config/app.yaml", + destination="config/app.yaml" +) + +# Execute the import +result = importer.run() +print(f"Imported {result.ingested_objects} objects") +print(f"Commit ID: {result.commit.id}") +``` + +**Expected Output:** +``` +Imported 1 objects +Commit ID: a1b2c3d4e5f6... +``` + +### Prefix Import + +```python +# Import entire directory/prefix +importer = branch.import_data(commit_message="Import dataset") +importer.prefix( + object_store_uri="s3://source-bucket/datasets/user-data/", + destination="data/users/" +) + +result = importer.run() +print(f"Imported {result.ingested_objects} objects from prefix") +``` + +### Multiple Source Import + +```python +# Import from multiple sources in one operation +importer = branch.import_data( + commit_message="Import application data and configs", + metadata={ + "source": "production-backup", + "date": "2024-01-15", + "environment": "prod" + } +) + +# Add multiple sources +importer.prefix("s3://backup-bucket/data/", destination="data/") \ + .prefix("s3://backup-bucket/logs/", destination="logs/") \ + .object("s3://backup-bucket/config.json", destination="config/app.json") \ + .object("s3://backup-bucket/schema.sql", destination="db/schema.sql") + +# Execute import +result = importer.run() +print(f"Multi-source import completed: {result.ingested_objects} objects") +``` + +**Expected Output:** +``` +Multi-source import completed: 1247 objects +``` + +## Asynchronous Import Operations + +### Non-blocking Import + +```python +import time +from datetime import timedelta + +# Start import without blocking +importer = branch.import_data(commit_message="Large dataset import") +importer.prefix("s3://large-dataset-bucket/", destination="data/") + +# Start the import (non-blocking) +import_id = importer.start() +print(f"Import started with ID: {import_id}") + +# Monitor progress +while True: + status = importer.status() + + if status.completed: + print(f"Import completed! Total objects: {status.ingested_objects}") + print(f"Commit ID: {status.commit.id}") + break + + if status.error: + print(f"Import failed: {status.error.message}") + break + + print(f"In progress... {status.ingested_objects} objects imported so far") + time.sleep(10) # Check every 10 seconds +``` + +**Expected Output:** +``` +Import started with ID: import-123456 +In progress... 150 objects imported so far +In progress... 300 objects imported so far +In progress... 450 objects imported so far +Import completed! Total objects: 523 +Commit ID: b2c3d4e5f6a7... 
+``` + +### Import with Custom Polling + +```python +# Import with custom polling interval +importer = branch.import_data(commit_message="Custom polling import") +importer.prefix("s3://source/data/", destination="imported/") + +# Start and wait with custom interval +result = importer.run(poll_interval=timedelta(seconds=5)) +print(f"Import completed with custom polling: {result.ingested_objects} objects") +``` + +### Waiting for Completion + +```python +# Start import and wait separately +importer = branch.import_data(commit_message="Separate start/wait") +importer.prefix("s3://data-source/", destination="data/") + +# Start import +import_id = importer.start() +print(f"Started import: {import_id}") + +# Do other work here... +print("Performing other operations...") + +# Wait for completion +result = importer.wait(poll_interval=timedelta(seconds=3)) +print(f"Import finished: {result.ingested_objects} objects") +``` + +## Import Status and Monitoring + +### Detailed Status Information + +```python +# Start an import +importer = branch.import_data(commit_message="Status monitoring example") +importer.prefix("s3://source/large-dataset/", destination="data/") + +import_id = importer.start() + +# Get detailed status +status = importer.status() + +print(f"Import ID: {importer.import_id}") +print(f"Completed: {status.completed}") +print(f"Objects ingested: {status.ingested_objects}") +print(f"Update time: {status.update_time}") +print(f"Metarange ID: {status.metarange_id}") + +if status.error: + print(f"Error: {status.error.message}") + +if status.commit: + print(f"Commit ID: {status.commit.id}") + print(f"Commit message: {status.commit.message}") +``` + +### Progress Monitoring with Callbacks + +```python +def monitor_import_progress(importer, callback_interval=5): + """Monitor import progress with custom callback""" + + import_id = importer.start() + start_time = time.time() + + while True: + status = importer.status() + elapsed = time.time() - start_time + + if status.completed: + print(f"✅ Import completed in {elapsed:.1f}s") + print(f" Total objects: {status.ingested_objects}") + return status + + if status.error: + print(f"❌ Import failed after {elapsed:.1f}s: {status.error.message}") + return status + + # Progress callback + objects_per_second = status.ingested_objects / elapsed if elapsed > 0 else 0 + print(f"⏳ Progress: {status.ingested_objects} objects ({objects_per_second:.1f}/sec)") + + time.sleep(callback_interval) + +# Usage +importer = branch.import_data(commit_message="Monitored import") +importer.prefix("s3://large-source/", destination="data/") + +final_status = monitor_import_progress(importer) +``` + +## Import Configuration and Metadata + +### Import with Rich Metadata + +```python +# Import with comprehensive metadata +importer = branch.import_data( + commit_message="Production data import - Q4 2024", + metadata={ + "source_system": "production-db", + "import_date": "2024-01-15", + "data_version": "v2.1.0", + "environment": "production", + "imported_by": "data-pipeline", + "validation_status": "passed", + "record_count": "1000000", + "size_gb": "15.7" + } +) + +importer.prefix("s3://prod-backup/q4-data/", destination="data/q4/") +result = importer.run() + +# Verify metadata in commit +commit = result.commit +print("Import metadata:") +for key, value in commit.metadata.items(): + print(f" {key}: {value}") +``` + +### Conditional Import Based on Existing Data + +```python +def conditional_import(branch, source_uri, destination, force=False): + """Import data only if destination 
doesn't exist or force is True""" + + # Check if destination already has data + existing_objects = list(branch.objects(prefix=destination, max_amount=1)) + + if existing_objects and not force: + print(f"Destination {destination} already contains data. Use force=True to overwrite.") + return None + + # Proceed with import + importer = branch.import_data( + commit_message=f"Import to {destination}", + metadata={"overwrite": str(force)} + ) + importer.prefix(source_uri, destination=destination) + + return importer.run() + +# Usage +result = conditional_import( + branch, + "s3://source/data/", + "imported/data/", + force=False +) +``` + +## Error Handling and Recovery + +### Comprehensive Error Handling + +```python +from lakefs.exceptions import ImportManagerException, NotFoundException + +def robust_import(branch, sources, commit_message): + """Perform import with comprehensive error handling""" + + try: + # Create importer + importer = branch.import_data(commit_message=commit_message) + + # Add sources + for source_type, uri, destination in sources: + if source_type == "prefix": + importer.prefix(uri, destination=destination) + elif source_type == "object": + importer.object(uri, destination=destination) + else: + raise ValueError(f"Unknown source type: {source_type}") + + # Start import + import_id = importer.start() + print(f"Import started: {import_id}") + + # Wait for completion with timeout + max_wait_time = 3600 # 1 hour + start_time = time.time() + + while time.time() - start_time < max_wait_time: + status = importer.status() + + if status.completed: + print(f"Import successful: {status.ingested_objects} objects") + return status + + if status.error: + print(f"Import error: {status.error.message}") + return None + + time.sleep(10) + + # Timeout - cancel import + print("Import timeout, cancelling...") + importer.cancel() + return None + + except ImportManagerException as e: + print(f"Import manager error: {e}") + return None + except NotFoundException as e: + print(f"Resource not found: {e}") + return None + except Exception as e: + print(f"Unexpected error: {e}") + return None + +# Usage +sources = [ + ("prefix", "s3://source/data/", "imported/data/"), + ("object", "s3://source/config.yaml", "config/app.yaml") +] + +result = robust_import(branch, sources, "Robust import example") +``` + +### Import Cancellation + +```python +import threading + +def cancellable_import(branch, source_uri, destination): + """Import with cancellation capability""" + + importer = branch.import_data(commit_message="Cancellable import") + importer.prefix(source_uri, destination=destination) + + # Start import + import_id = importer.start() + print(f"Import started: {import_id}") + + # Simulate user cancellation after 30 seconds + def cancel_after_delay(): + time.sleep(30) + try: + importer.cancel() + print("Import cancelled by user") + except Exception as e: + print(f"Cancellation failed: {e}") + + cancel_thread = threading.Thread(target=cancel_after_delay) + cancel_thread.start() + + # Monitor import + try: + result = importer.wait() + print(f"Import completed: {result.ingested_objects} objects") + return result + except ImportManagerException as e: + print(f"Import was cancelled or failed: {e}") + return None + +# Usage +result = cancellable_import(branch, "s3://large-source/", "data/") +``` + +## Advanced Import Patterns + +### Batch Import with Validation + +```python +def validated_batch_import(branch, import_configs): + """Import multiple datasets with validation""" + + results = [] + + for config in 
import_configs: + name = config['name'] + sources = config['sources'] + validation_func = config.get('validation') + + print(f"Starting import: {name}") + + # Create importer + importer = branch.import_data( + commit_message=f"Import {name}", + metadata={"batch_import": "true", "dataset": name} + ) + + # Add sources + for source in sources: + if source['type'] == 'prefix': + importer.prefix(source['uri'], destination=source['destination']) + else: + importer.object(source['uri'], destination=source['destination']) + + # Execute import + try: + result = importer.run() + + # Validate if validation function provided + if validation_func: + if validation_func(branch, result): + print(f"✅ {name}: Import and validation successful") + else: + print(f"❌ {name}: Import successful but validation failed") + else: + print(f"✅ {name}: Import successful ({result.ingested_objects} objects)") + + results.append({"name": name, "result": result, "success": True}) + + except Exception as e: + print(f"❌ {name}: Import failed - {e}") + results.append({"name": name, "error": str(e), "success": False}) + + return results + +# Example validation function +def validate_csv_import(branch, import_result): + """Validate that CSV files were imported correctly""" + csv_objects = list(branch.objects(prefix="data/", max_amount=100)) + csv_files = [obj for obj in csv_objects if obj.path.endswith('.csv')] + return len(csv_files) > 0 + +# Usage +import_configs = [ + { + "name": "user-data", + "sources": [ + {"type": "prefix", "uri": "s3://source/users/", "destination": "data/users/"} + ], + "validation": validate_csv_import + }, + { + "name": "config-files", + "sources": [ + {"type": "object", "uri": "s3://source/app.yaml", "destination": "config/app.yaml"}, + {"type": "object", "uri": "s3://source/db.yaml", "destination": "config/db.yaml"} + ] + } +] + +results = validated_batch_import(branch, import_configs) +``` + +### Incremental Import Pattern + +```python +def incremental_import(branch, source_prefix, destination_prefix, last_import_marker=None): + """Import only new data since last import""" + + # In a real implementation, you would track what was imported previously + # This is a simplified example + + importer = branch.import_data( + commit_message=f"Incremental import from {source_prefix}", + metadata={ + "import_type": "incremental", + "last_marker": last_import_marker or "none", + "source": source_prefix + } + ) + + # Add source (in practice, you'd filter based on modification time or other criteria) + importer.prefix(source_prefix, destination=destination_prefix) + + result = importer.run() + + # Store marker for next incremental import + new_marker = result.commit.id + print(f"Incremental import completed. 
Next marker: {new_marker}") + + return result, new_marker + +# Usage +result, marker = incremental_import( + branch, + "s3://source/daily-data/", + "data/daily/" +) +``` + +## Export Operations + +### Basic Export Patterns + +```python +def export_to_s3(branch, prefix, s3_bucket, s3_prefix): + """Export lakeFS objects to S3 (example pattern)""" + import boto3 + + s3_client = boto3.client('s3') + exported_count = 0 + + for obj_info in branch.objects(prefix=prefix): + # Read from lakeFS + obj = branch.object(obj_info.path) + + with obj.reader(mode='rb') as reader: + content = reader.read() + + # Write to S3 + s3_key = s3_prefix + obj_info.path[len(prefix):] + s3_client.put_object( + Bucket=s3_bucket, + Key=s3_key, + Body=content, + ContentType=obj_info.content_type or 'application/octet-stream' + ) + + exported_count += 1 + print(f"Exported: {obj_info.path} -> s3://{s3_bucket}/{s3_key}") + + print(f"Export completed: {exported_count} objects") + return exported_count + +# Usage +export_count = export_to_s3( + branch, + "data/processed/", + "export-bucket", + "lakefs-export/" +) +``` + +### Streaming Export for Large Files + +```python +def streaming_export(branch, object_path, local_path): + """Export large object using streaming""" + + obj = branch.object(object_path) + + with obj.reader(mode='rb') as reader: + with open(local_path, 'wb') as writer: + chunk_size = 64 * 1024 # 64KB chunks + total_bytes = 0 + + while True: + chunk = reader.read(chunk_size) + if not chunk: + break + + writer.write(chunk) + total_bytes += len(chunk) + + # Progress indicator + if total_bytes % (1024 * 1024) == 0: # Every MB + print(f"Exported {total_bytes // (1024 * 1024)} MB...") + + print(f"Export completed: {total_bytes} bytes -> {local_path}") + +# Usage +streaming_export(branch, "data/large-dataset.dat", "exported-dataset.dat") +``` + +## Best Practices and Performance + +### Large Dataset Import Best Practices + +```python +def optimized_large_import(branch, source_uri, destination): + """Best practices for importing large datasets""" + + # Use descriptive commit message with metadata + importer = branch.import_data( + commit_message=f"Large dataset import: {destination}", + metadata={ + "source": source_uri, + "import_strategy": "optimized", + "expected_size": "large", + "monitoring": "enabled" + } + ) + + # Add source + importer.prefix(source_uri, destination=destination) + + # Start import + import_id = importer.start() + print(f"Large import started: {import_id}") + + # Monitor with longer intervals for large imports + poll_interval = 30 # 30 seconds + last_count = 0 + + while True: + status = importer.status() + + if status.completed: + print(f"✅ Large import completed: {status.ingested_objects} objects") + return status + + if status.error: + print(f"❌ Large import failed: {status.error.message}") + return None + + # Show progress with rate calculation + current_count = status.ingested_objects or 0 + rate = (current_count - last_count) / poll_interval + print(f"⏳ Progress: {current_count} objects ({rate:.1f} objects/sec)") + last_count = current_count + + time.sleep(poll_interval) + +# Usage +result = optimized_large_import( + branch, + "s3://massive-dataset/", + "data/massive/" +) +``` + +### Import Performance Monitoring + +```python +class ImportPerformanceMonitor: + """Monitor and log import performance metrics""" + + def __init__(self): + self.start_time = None + self.metrics = [] + + def start_monitoring(self, importer): + """Start monitoring an import operation""" + self.start_time = 
time.time() + import_id = importer.start() + + print(f"Monitoring import: {import_id}") + return self._monitor_loop(importer) + + def _monitor_loop(self, importer): + """Monitor import progress and collect metrics""" + + while True: + current_time = time.time() + status = importer.status() + + # Collect metrics + metric = { + "timestamp": current_time, + "elapsed": current_time - self.start_time, + "objects": status.ingested_objects or 0, + "completed": status.completed, + "error": status.error.message if status.error else None + } + self.metrics.append(metric) + + if status.completed: + self._print_summary(status) + return status + + if status.error: + print(f"Import failed: {status.error.message}") + return None + + # Progress update + rate = metric["objects"] / metric["elapsed"] if metric["elapsed"] > 0 else 0 + print(f"Progress: {metric['objects']} objects, {rate:.1f} obj/sec") + + time.sleep(10) + + def _print_summary(self, final_status): + """Print performance summary""" + total_time = time.time() - self.start_time + total_objects = final_status.ingested_objects + avg_rate = total_objects / total_time if total_time > 0 else 0 + + print(f"\n📊 Import Performance Summary:") + print(f" Total time: {total_time:.1f} seconds") + print(f" Total objects: {total_objects}") + print(f" Average rate: {avg_rate:.1f} objects/second") + print(f" Commit ID: {final_status.commit.id}") + +# Usage +monitor = ImportPerformanceMonitor() + +importer = branch.import_data(commit_message="Monitored import") +importer.prefix("s3://source-data/", destination="data/") + +result = monitor.start_monitoring(importer) +``` + +## Key Points + +- **Atomic operations**: All imports are committed atomically - either all data is imported or none +- **Progress monitoring**: Use asynchronous imports for large datasets with progress tracking +- **Error handling**: Implement comprehensive error handling and recovery strategies +- **Metadata**: Use rich metadata to track import sources, dates, and validation status +- **Performance**: Monitor import rates and optimize polling intervals for large datasets +- **Cancellation**: Long-running imports can be cancelled if needed +- **Validation**: Implement post-import validation to ensure data integrity + +## See Also + +- **[Repository Management](repositories.md)** - Creating and managing repositories +- **[Branch Operations](branches-and-commits.md)** - Version control operations +- **[Object Operations](objects-and-io.md)** - Individual object management +- **[Transactions](transactions.md)** - Atomic multi-operation workflows +- **[Best Practices](../reference/best-practices.md)** - Production deployment guidance \ No newline at end of file diff --git a/docs/src/integrations/python/high-level-sdk/index.md b/docs/src/integrations/python/high-level-sdk/index.md new file mode 100644 index 00000000000..a1c54354289 --- /dev/null +++ b/docs/src/integrations/python/high-level-sdk/index.md @@ -0,0 +1,166 @@ +--- +title: High-Level Python SDK +description: Comprehensive documentation for the lakeFS High-Level Python SDK +sdk_types: ["high-level"] +difficulty: "beginner" +use_cases: ["general", "data-pipelines", "etl", "transactions"] +topics: ["overview", "architecture", "concepts"] +audience: ["data-engineers", "developers", "python-developers"] +last_updated: "2024-01-15" +--- + +# High-Level Python SDK + +The High-Level Python SDK provides a simplified, Pythonic interface for working with lakeFS. 
Built on top of the Generated SDK, it offers advanced features like transactions, streaming I/O, and intuitive object management while maintaining the full power of the underlying API.
+
+## Key Concepts
+
+### Repository-Centric Design
+The High-Level SDK is organized around repositories as the primary entry point. All operations flow from repository objects to branches, commits, and objects, providing a natural hierarchy that mirrors lakeFS's data model.
+
+### Lazy Evaluation
+Objects are created lazily - creating a `Repository`, `Branch`, or `StoredObject` instance doesn't immediately interact with the server. Operations only execute when you call action methods like `create()`, `upload()`, or `commit()`.
+
+### Fluent Interface
+The SDK supports method chaining for common workflows. Note that `commit()` is called on the branch, not on the uploaded object:
+```python
+repo = lakefs.repository("my-repo").create(storage_namespace="s3://bucket/path")
+branch = repo.branch("main")
+branch.object("file.txt").upload(data="content")
+commit = branch.commit("Add file")
+```
+
+### Built-in Error Handling
+All operations include comprehensive error handling with specific exception types for different failure scenarios, making it easier to build robust applications.
+
+## Key Features
+
+- **Simplified API** - Pythonic interface that abstracts complex operations
+- **Transaction Support** - Atomic operations with automatic rollback capabilities
+- **Streaming I/O** - File-like objects for efficient handling of large datasets
+- **Import Management** - Sophisticated data import operations with progress tracking
+- **Batch Operations** - Efficient bulk operations for better performance
+- **Generated SDK Access** - Direct access to underlying Generated SDK when needed
+- **Automatic Authentication** - Seamless credential discovery from environment or config files
+
+## Architecture Overview
+
+The High-Level SDK is structured in layers:
+
+```
+High-Level SDK (lakefs package)
+├── Repository Management
+├── Branch & Reference Operations
+├── Object I/O & Streaming
+├── Transaction Management
+├── Import/Export Operations
+└── Generated SDK (lakefs_sdk)
+    └── Direct API Access
+```
+
+## Core Classes
+
+### Repository
+The main entry point for all operations. Represents a lakeFS repository and provides access to branches, tags, and metadata.
+
+### Branch
+Extends Reference with write capabilities. Supports object uploads, commits, merges, and transaction management.
+
+### Reference
+Read-only access to any lakeFS reference (branch, commit, or tag). Provides object listing and reading capabilities.
+
+### StoredObject & WriteableObject
+Represent objects in lakeFS with full I/O capabilities including streaming, metadata management, and batch operations.
+
+### ImportManager
+Handles complex data import operations with support for various source types and progress monitoring.
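+
+Taken together, these classes compose naturally: a `Repository` yields `Branch` and `Reference` objects, a `Branch` yields writeable objects and an `ImportManager`, and commits are always created on the branch. The sketch below is illustrative only (the repository name, paths, and source bucket are placeholders) and uses the same calls documented in the sections linked below:
+
+```python
+import lakefs
+
+# Repository - entry point (lazy: no server call yet)
+repo = lakefs.repository("example-repo")
+
+# Branch - a writable reference
+branch = repo.branch("main")
+
+# WriteableObject - upload data, then commit on the branch
+branch.object("data/hello.txt").upload(data="Hello, lakeFS!", content_type="text/plain")
+commit = branch.commit(message="Add hello.txt")
+
+# Reference - read-only access to any commit, tag, or branch
+ref = repo.ref(commit.id)
+print(ref.get_commit().message)
+
+# ImportManager - bulk ingestion from an external object store (placeholder URI)
+importer = branch.import_data(commit_message="Import raw data")
+importer.prefix("s3://source-bucket/raw/", destination="raw/")
+result = importer.run()
+print(f"Imported {result.ingested_objects} objects")
+```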
+ +## Documentation Sections + +- **[Quickstart](quickstart.md)** - Get started with basic operations +- **[Repositories](repositories.md)** - Repository management operations +- **[Branches & Commits](branches-and-commits.md)** - Version control operations +- **[Objects & I/O](objects-and-io.md)** - Object operations and streaming +- **[Imports & Exports](imports-and-exports.md)** - Data import/export operations +- **[Transactions](transactions.md)** - Atomic operation patterns +- **[Advanced Features](advanced.md)** - Advanced patterns and optimization + +## Quick Example + +```python +import lakefs + +# Create a repository +repo = lakefs.repository("my-repo").create( + storage_namespace="s3://my-bucket/repos/my-repo" +) + +# Create a branch and upload data +branch = repo.branch("feature-branch").create(source_reference="main") +obj = branch.object("data/file.txt").upload(data="Hello, lakeFS!") + +# Commit changes +commit = branch.commit(message="Add new data file") +print(f"Committed: {commit.id}") +``` + +## Installation + +```bash +pip install lakefs +``` + +## Authentication + +The SDK automatically discovers credentials from: +1. Environment variables (`LAKEFS_ACCESS_KEY_ID`, `LAKEFS_SECRET_ACCESS_KEY`, `LAKEFS_ENDPOINT`) +2. Configuration file (`~/.lakectl.yaml`) +3. Explicit client configuration + +## When to Use High-Level SDK + +Choose the High-Level SDK when you need: +- **Simplified workflows** - Common operations with minimal code +- **Transaction support** - Atomic operations across multiple changes +- **Streaming I/O** - Efficient handling of large files +- **Import management** - Complex data ingestion workflows +- **Python-first experience** - Pythonic interfaces and error handling + +For direct API control or operations not covered by the high-level interface, you can access the underlying Generated SDK through the `client` property. + +## Next Steps + +Start with the [quickstart guide](quickstart.md) to learn the basics, then explore specific features in the detailed sections. 
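+
+If you prefer to configure credentials explicitly (option 3 in the Authentication section above) rather than rely on automatic discovery, a minimal sketch looks like this; the endpoint and keys are placeholders, and the same pattern is shown in the quickstart:
+
+```python
+import lakefs
+from lakefs.client import Client
+
+# Placeholder endpoint and credentials - replace with your own
+client = Client(
+    username="your-access-key",
+    password="your-secret-key",
+    host="http://localhost:8000"
+)
+
+# Pass the client explicitly instead of relying on environment discovery
+repo = lakefs.Repository("my-repo", client=client)
+print(repo.properties.default_branch)
+```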
+ +## See Also + +**Getting Started:** +- [Python SDK Overview](../index.md) - Compare all Python SDK options +- [Installation Guide](../getting-started.md) - Setup and authentication +- [SDK Selection Guide](../index.md#sdk-selection-decision-matrix) - Choose the right SDK + +**High-Level SDK Documentation:** +- [Quickstart Guide](quickstart.md) - Basic operations and examples +- [Repository Management](repositories.md) - Create, configure, and manage repositories +- [Version Control](branches-and-commits.md) - Branches, commits, and merging +- [Object Operations](objects-and-io.md) - Upload, download, and streaming I/O +- [Data Import/Export](imports-and-exports.md) - Bulk data operations +- [Transaction Patterns](transactions.md) - Atomic operations and rollback +- [Advanced Features](advanced.md) - Performance optimization and patterns + +**Alternative SDK Options:** +- [Generated SDK](../generated-sdk/index.md) - Direct API access for advanced use cases +- [lakefs-spec](../lakefs-spec/index.md) - Filesystem interface for data science +- [Boto3 Integration](../boto3/index.md) - S3-compatible operations + +**Learning Resources:** +- [Data Science Tutorial](../tutorials/data-science-workflow.md) - End-to-end data analysis workflow +- [ETL Pipeline Tutorial](../tutorials/etl-pipeline.md) - Building data pipelines +- [ML Experiment Tracking](../tutorials/ml-experiment-tracking.md) - Model versioning + +**Reference Materials:** +- [API Comparison](../reference/api-comparison.md) - Feature comparison across SDKs +- [Best Practices](../reference/best-practices.md) - Production deployment guidance +- [Troubleshooting](../reference/troubleshooting.md) - Common issues and solutions + +**External Resources:** +- [High-Level SDK API Reference](https://pydocs-lakefs.lakefs.io){:target="_blank"} - Complete API documentation +- [Generated SDK Access](../generated-sdk/direct-access.md) - Using Generated SDK from High-Level SDK \ No newline at end of file diff --git a/docs/src/integrations/python/high-level-sdk/objects-and-io.md b/docs/src/integrations/python/high-level-sdk/objects-and-io.md new file mode 100644 index 00000000000..4d1c477b013 --- /dev/null +++ b/docs/src/integrations/python/high-level-sdk/objects-and-io.md @@ -0,0 +1,709 @@ +--- +title: Objects and I/O Operations +description: Object management and streaming I/O with the High-Level Python SDK +sdk_types: ["high-level"] +difficulty: "intermediate" +use_cases: ["object-storage", "file-operations", "streaming", "large-files"] +topics: ["objects", "io", "streaming", "upload", "download"] +audience: ["data-engineers", "developers", "python-developers"] +last_updated: "2024-01-15" +--- + +# Objects and I/O Operations + +Learn how to efficiently manage objects and perform I/O operations using the High-Level Python SDK. Objects in lakeFS represent files and data with full versioning capabilities and streaming I/O support. 
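+
+As a quick preview of the patterns covered below, the following sketch streams a small text object into a branch and reads it back. The repository, branch, and path names are illustrative:
+
+```python
+import lakefs
+
+# Illustrative names - adjust to your repository and branch
+branch = lakefs.repository("my-repo").branch("main")
+obj = branch.object("data/preview.txt")
+
+# Streamed write: the writer behaves like a regular file handle
+with obj.writer(mode="w", content_type="text/plain") as writer:
+    writer.write("line 1\n")
+    writer.write("line 2\n")
+
+# Streamed read: the reader is file-like as well
+with obj.reader(mode="r") as reader:
+    for line in reader:
+        print(line.strip())
+```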
+ +## Object Concepts + +### Object Types +- **StoredObject**: Read-only objects from references (commits, tags, branches) +- **WriteableObject**: Read-write objects from branches that support uploads and modifications +- **ObjectReader**: File-like interface for reading object data +- **ObjectWriter**: File-like interface for writing object data + +### Object Paths +Objects are identified by their path within a repository and reference: +- Paths use forward slashes (`/`) as separators +- Paths are relative to the repository root +- No leading slash required (e.g., `data/file.txt`, not `/data/file.txt`) + +### Streaming I/O +The SDK provides file-like objects for efficient streaming of large datasets without loading everything into memory. + +## Object Upload Operations + +### Simple Upload + +```python +import lakefs + +branch = lakefs.repository("my-repo").branch("main") + +# Upload string data +obj = branch.object("data/file.txt").upload( + data="Hello, lakeFS!", + content_type="text/plain" +) + +print(f"Uploaded: {obj.path}") +print(f"Repository: {obj.repo}") +print(f"Reference: {obj.ref}") +``` + +**Expected Output:** +``` +Uploaded: data/file.txt +Repository: my-repo +Reference: main +``` + +### Upload Binary Data + +```python +# Upload binary data from file +with open("local-file.pdf", "rb") as f: + obj = branch.object("documents/report.pdf").upload( + data=f.read(), + content_type="application/pdf", + metadata={"author": "data-team", "version": "1.0"} + ) + +# Upload binary data directly +image_data = b'\x89PNG\r\n\x1a\n...' # PNG image bytes +obj = branch.object("images/logo.png").upload( + data=image_data, + content_type="image/png" +) +``` + +### Upload with Different Modes + +```python +# Create new file (fail if exists) +try: + obj = branch.object("data/exclusive.txt").upload( + data="Exclusive content", + mode="x" # Exclusive creation + ) + print("File created successfully") +except ObjectExistsException: + print("File already exists") + +# Overwrite existing file +obj = branch.object("data/overwrite.txt").upload( + data="New content", + mode="w" # Create or overwrite +) + +# Binary modes +obj = branch.object("data/binary.dat").upload( + data=b"Binary data", + mode="wb" # Binary write +) +``` + +### Upload with Metadata + +```python +# Upload with custom metadata +obj = branch.object("data/dataset.csv").upload( + data="name,age,city\nAlice,30,NYC\nBob,25,LA", + content_type="text/csv", + metadata={ + "source": "user_survey", + "created_by": "data_pipeline", + "version": "2.1", + "schema_version": "1.0", + "record_count": "2" + } +) + +print("Upload complete with metadata") +``` + +## Streaming Upload Operations + +### Basic Streaming Upload + +```python +import csv +import json + +# Stream CSV data +obj = branch.object("data/users.csv") + +with obj.writer(mode='w', content_type="text/csv") as writer: + csv_writer = csv.writer(writer) + csv_writer.writerow(["ID", "Name", "Email", "Age"]) + + # Write data row by row (memory efficient) + for i in range(1000): + csv_writer.writerow([i, f"User{i}", f"user{i}@example.com", 20 + (i % 50)]) + +print("Streamed 1000 records to CSV") +``` + +### Large File Streaming + +```python +# Stream large file efficiently +def upload_large_file(branch, source_path, target_path): + """Upload large file using streaming to minimize memory usage""" + + obj = branch.object(target_path) + + with open(source_path, 'rb') as source: + with obj.writer(mode='wb') as writer: + # Stream in chunks + chunk_size = 64 * 1024 # 64KB chunks + while True: + chunk = 
source.read(chunk_size) + if not chunk: + break + writer.write(chunk) + + return obj + +# Usage +large_obj = upload_large_file( + branch, + "local-large-file.dat", + "data/large-dataset.dat" +) +``` + +### JSON Streaming + +```python +import json + +# Stream JSON data +obj = branch.object("data/config.json") + +config_data = { + "database": { + "host": "localhost", + "port": 5432, + "name": "myapp" + }, + "features": { + "auth": True, + "logging": True, + "metrics": True + } +} + +with obj.writer(mode='w', content_type="application/json") as writer: + json.dump(config_data, writer, indent=2) + +print("JSON configuration uploaded") +``` + +## Object Download Operations + +### Simple Download + +```python +# Read as string +content = branch.object("data/file.txt").reader().read() +print(f"Content: {content}") + +# Read as bytes +binary_content = branch.object("data/file.txt").reader(mode='rb').read() +print(f"Bytes: {len(binary_content)} bytes") +``` + +**Expected Output:** +``` +Content: Hello, lakeFS! +Bytes: 14 bytes +``` + +### Streaming Download + +```python +import csv + +# Stream CSV data for processing +obj = branch.object("data/users.csv") +processed_count = 0 + +with obj.reader(mode='r') as reader: + csv_reader = csv.DictReader(reader) + + for row in csv_reader: + # Process each row without loading entire file + if int(row['Age']) > 30: + print(f"Senior user: {row['Name']}") + processed_count += 1 + +print(f"Processed {processed_count} senior users") +``` + +### Binary Download + +```python +# Download binary data +obj = branch.object("documents/report.pdf") + +# Method 1: Direct read +with obj.reader(mode='rb') as reader: + pdf_data = reader.read() + with open("downloaded-report.pdf", "wb") as f: + f.write(pdf_data) + +# Method 2: Streaming download +with obj.reader(mode='rb') as reader: + with open("streamed-report.pdf", "wb") as f: + chunk_size = 64 * 1024 + while True: + chunk = reader.read(chunk_size) + if not chunk: + break + f.write(chunk) + +print("PDF downloaded successfully") +``` + +### Partial Reading and Seeking + +```python +# Read specific portions of file +obj = branch.object("data/large-file.txt") + +with obj.reader(mode='r') as reader: + # Read first 100 characters + header = reader.read(100) + print(f"Header: {header}") + + # Seek to middle of file + file_size = obj.stat().size_bytes + reader.seek(file_size // 2) + + # Read from middle + middle_content = reader.read(50) + print(f"Middle: {middle_content}") + + # Seek to end and read backwards + reader.seek(-100, os.SEEK_END) + tail = reader.read() + print(f"Tail: {tail}") +``` + +## Object Metadata and Properties + +### Object Statistics + +```python +obj = branch.object("data/dataset.csv") +stats = obj.stat() + +print(f"Path: {stats.path}") +print(f"Size: {stats.size_bytes} bytes") +print(f"Content Type: {stats.content_type}") +print(f"Checksum: {stats.checksum}") +print(f"Modified Time: {stats.mtime}") +print(f"Physical Address: {stats.physical_address}") + +# Access custom metadata +if stats.metadata: + print("Custom Metadata:") + for key, value in stats.metadata.items(): + print(f" {key}: {value}") +``` + +**Expected Output:** +``` +Path: data/dataset.csv +Size: 1024 bytes +Content Type: text/csv +Checksum: sha256:a1b2c3d4... 
+Modified Time: 1640995200 +Physical Address: s3://bucket/path/to/object +Custom Metadata: + source: user_survey + version: 2.1 +``` + +### Pre-signed URLs + +```python +# Get object stats with pre-signed URL +stats = obj.stat(pre_sign=True) +print(f"Pre-signed URL: {stats.physical_address}") + +# Use pre-signed URLs for direct access +with obj.reader(pre_sign=True) as reader: + content = reader.read() + print("Read using pre-signed URL") +``` + +### Object Existence Checking + +```python +# Check if object exists +if branch.object("data/maybe-exists.txt").exists(): + print("Object exists") + content = branch.object("data/maybe-exists.txt").reader().read() +else: + print("Object not found, creating it...") + branch.object("data/maybe-exists.txt").upload(data="Default content") + +# Batch existence checking +paths_to_check = ["data/file1.txt", "data/file2.txt", "data/file3.txt"] +existing_objects = [] + +for path in paths_to_check: + if branch.object(path).exists(): + existing_objects.append(path) + +print(f"Existing objects: {existing_objects}") +``` + +## Object Listing and Discovery + +### List All Objects + +```python +# List all objects in branch +print("All objects:") +for obj_info in branch.objects(): + print(f" {obj_info.path} ({obj_info.size_bytes} bytes)") +``` + +**Expected Output:** +``` +All objects: + data/users.csv (2048 bytes) + data/config.json (512 bytes) + documents/report.pdf (102400 bytes) +``` + +### Filtered Object Listing + +```python +# List objects with prefix +print("Data files:") +for obj_info in branch.objects(prefix="data/"): + print(f" {obj_info.path}") + +# List objects with pagination +print("First 10 objects:") +for obj_info in branch.objects(max_amount=10): + print(f" {obj_info.path}") + +# List objects after specific path +print("Objects after 'data/m':") +for obj_info in branch.objects(after="data/m"): + print(f" {obj_info.path}") +``` + +### Directory-like Listing + +```python +# List with delimiter for directory-like structure +print("Top-level directories and files:") +for item in branch.objects(delimiter="/"): + if hasattr(item, 'path') and item.path.endswith('/'): + print(f" 📁 {item.path}") + else: + print(f" 📄 {item.path}") + +# List specific directory contents +print("Contents of data/ directory:") +for item in branch.objects(prefix="data/", delimiter="/"): + if hasattr(item, 'path'): + print(f" {item.path}") +``` + +### Advanced Object Discovery + +```python +def find_objects_by_pattern(branch, pattern, max_results=100): + """Find objects matching a pattern""" + import re + + matching_objects = [] + regex = re.compile(pattern) + + for obj_info in branch.objects(max_amount=max_results): + if regex.search(obj_info.path): + matching_objects.append(obj_info) + + return matching_objects + +# Find all CSV files +csv_files = find_objects_by_pattern(branch, r'\.csv$') +print(f"Found {len(csv_files)} CSV files") + +# Find all files in subdirectories +nested_files = find_objects_by_pattern(branch, r'/.+/') +print(f"Found {len(nested_files)} files in subdirectories") +``` + +## Object Operations + +### Copying Objects + +```python +# Copy within same branch +source = branch.object("data/original.txt") +target_obj = source.copy( + destination_branch_id="main", + destination_path="data/copy.txt" +) + +print(f"Copied to: {target_obj.path}") + +# Copy to different branch +feature_branch = repo.branch("feature-branch") +copied_obj = source.copy( + destination_branch_id="feature-branch", + destination_path="data/feature-copy.txt" +) + +# Verify copy exists +if 
copied_obj.exists(): + print("Copy successful") +``` + +### Deleting Objects + +```python +# Delete single object +obj = branch.object("data/temp-file.txt") +if obj.exists(): + obj.delete() + print("Object deleted") + +# Delete multiple objects efficiently +objects_to_delete = [ + "temp/file1.txt", + "temp/file2.txt", + "temp/file3.txt" +] + +# Use branch-level delete for efficiency +branch.delete_objects(objects_to_delete) +print(f"Deleted {len(objects_to_delete)} objects") +``` + +### Object Comparison + +```python +def compare_objects(obj1, obj2): + """Compare two objects by content""" + stats1 = obj1.stat() + stats2 = obj2.stat() + + # Quick comparison by checksum + if stats1.checksum == stats2.checksum: + return True + + # Detailed comparison by content + with obj1.reader(mode='rb') as r1, obj2.reader(mode='rb') as r2: + return r1.read() == r2.read() + +# Usage +obj1 = branch.object("data/file1.txt") +obj2 = branch.object("data/file2.txt") + +if compare_objects(obj1, obj2): + print("Objects are identical") +else: + print("Objects differ") +``` + +## Advanced I/O Patterns + +### Concurrent Object Processing + +```python +import concurrent.futures +import threading + +def process_object(obj_info): + """Process a single object""" + obj = branch.object(obj_info.path) + + try: + with obj.reader(mode='r') as reader: + content = reader.read() + # Process content here + return f"Processed {obj_info.path}: {len(content)} chars" + except Exception as e: + return f"Error processing {obj_info.path}: {e}" + +# Process objects concurrently +objects_to_process = list(branch.objects(prefix="data/", max_amount=10)) + +with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: + results = list(executor.map(process_object, objects_to_process)) + +for result in results: + print(result) +``` + +### Object Transformation Pipeline + +```python +def transform_csv_object(source_obj, target_obj, transform_func): + """Transform CSV data from source to target object""" + import csv + + with source_obj.reader(mode='r') as reader: + with target_obj.writer(mode='w', content_type="text/csv") as writer: + csv_reader = csv.DictReader(reader) + + # Get fieldnames and potentially modify them + fieldnames = csv_reader.fieldnames + csv_writer = csv.DictWriter(writer, fieldnames=fieldnames) + csv_writer.writeheader() + + # Transform each row + for row in csv_reader: + transformed_row = transform_func(row) + csv_writer.writerow(transformed_row) + +# Example transformation +def uppercase_names(row): + row['name'] = row['name'].upper() + return row + +# Apply transformation +source = branch.object("data/users.csv") +target = branch.object("data/users_transformed.csv") +transform_csv_object(source, target, uppercase_names) +``` + +### Memory-Efficient Large File Processing + +```python +def process_large_file_in_chunks(obj, chunk_size=1024*1024): + """Process large file in chunks to minimize memory usage""" + + total_size = obj.stat().size_bytes + processed_bytes = 0 + + with obj.reader(mode='rb') as reader: + while processed_bytes < total_size: + chunk = reader.read(chunk_size) + if not chunk: + break + + # Process chunk here + processed_bytes += len(chunk) + progress = (processed_bytes / total_size) * 100 + print(f"Progress: {progress:.1f}%") + + print("Processing complete") + +# Usage +large_obj = branch.object("data/large-dataset.dat") +process_large_file_in_chunks(large_obj) +``` + +## Error Handling and Best Practices + +### Comprehensive Error Handling + +```python +from lakefs.exceptions import ( + 
ObjectNotFoundException, + ObjectExistsException, + PermissionException +) + +def robust_object_operations(branch, path, data): + try: + # Try to upload object + obj = branch.object(path).upload( + data=data, + mode="x" # Exclusive creation + ) + print(f"Object created: {path}") + return obj + + except ObjectExistsException: + print(f"Object {path} already exists") + # Decide whether to overwrite or use existing + return branch.object(path) + + except PermissionException: + print(f"Permission denied for {path}") + return None + + except Exception as e: + print(f"Unexpected error: {e}") + return None + +# Usage +obj = robust_object_operations(branch, "data/test.txt", "test content") +``` + +### Best Practices + +```python +def object_best_practices(): + """Demonstrate best practices for object operations""" + + branch = lakefs.repository("my-repo").branch("main") + + # 1. Always use context managers for I/O + obj = branch.object("data/example.txt") + + with obj.writer(mode='w') as writer: + writer.write("Content written safely") + # Writer automatically closed and data committed + + # 2. Check existence before operations + if obj.exists(): + with obj.reader(mode='r') as reader: + content = reader.read() + + # 3. Use appropriate content types + branch.object("data/config.json").upload( + data='{"key": "value"}', + content_type="application/json" + ) + + # 4. Add meaningful metadata + branch.object("data/dataset.csv").upload( + data="name,value\ntest,123", + content_type="text/csv", + metadata={ + "source": "data_pipeline", + "version": "1.0", + "created_by": "automated_process" + } + ) + + # 5. Use streaming for large files + large_obj = branch.object("data/large-file.txt") + with large_obj.writer(mode='w') as writer: + for i in range(10000): + writer.write(f"Line {i}\n") + + print("Best practices demonstrated") + +# Run best practices example +object_best_practices() +``` + +## Key Points + +- **File-like interface**: Objects support standard Python I/O operations +- **Streaming support**: Efficient handling of large files without memory issues +- **Metadata support**: Custom metadata can be attached to objects +- **Pre-signed URLs**: Direct access to underlying storage when supported +- **Atomic operations**: Uploads are atomic - either complete or fail entirely +- **Content type detection**: Automatic content type detection with manual override +- **Path flexibility**: Use forward slashes for cross-platform compatibility + +## See Also + +- **[Repository Management](repositories.md)** - Creating and managing repositories +- **[Branch Operations](branches-and-commits.md)** - Version control operations +- **[Transactions](transactions.md)** - Atomic multi-object operations +- **[Import Operations](imports-and-exports.md)** - Bulk data operations +- **[Best Practices](../reference/best-practices.md)** - Production deployment guidance \ No newline at end of file diff --git a/docs/src/integrations/python/high-level-sdk/quickstart.md b/docs/src/integrations/python/high-level-sdk/quickstart.md new file mode 100644 index 00000000000..5e013123f76 --- /dev/null +++ b/docs/src/integrations/python/high-level-sdk/quickstart.md @@ -0,0 +1,371 @@ +--- +title: High-Level SDK Quickstart +description: Quick start guide for the lakeFS High-Level Python SDK +sdk_types: ["high-level"] +difficulty: "beginner" +use_cases: ["getting-started", "basic-operations", "first-steps"] +topics: ["quickstart", "examples", "workflow"] +audience: ["data-engineers", "developers", "python-developers"] +last_updated: 
"2024-01-15" +--- + +# High-Level SDK Quickstart + +Get started quickly with the lakeFS High-Level Python SDK. This guide covers the essential operations you'll need for most workflows. + +## Installation + +```bash +pip install lakefs +``` + +## Prerequisites + +Before starting, ensure you have: +- A running lakeFS server +- Valid credentials (access key and secret key) +- A storage namespace (S3 bucket, Azure container, or local path) + +## Authentication Setup + +The SDK automatically discovers credentials from multiple sources: + +### Environment Variables +```bash +export LAKEFS_ACCESS_KEY_ID="your-access-key" +export LAKEFS_SECRET_ACCESS_KEY="your-secret-key" +export LAKEFS_ENDPOINT="http://localhost:8000" +``` + +### Configuration File +Create `~/.lakectl.yaml`: +```yaml +credentials: + access_key_id: your-access-key + secret_access_key: your-secret-key +server: + endpoint_url: http://localhost:8000 +``` + +### Explicit Configuration +```python +import lakefs +from lakefs.client import Client + +client = Client( + username="your-access-key", + password="your-secret-key", + host="http://localhost:8000" +) + +repo = lakefs.Repository("my-repo", client=client) +``` + +## Basic Workflow + +### 1. Import the SDK + +```python +import lakefs +``` + +### 2. Create a Repository + +```python +# Create a new repository +repo = lakefs.repository("quickstart-repo").create( + storage_namespace="s3://my-bucket/repos/quickstart", + default_branch="main", + exist_ok=True # Don't fail if repository already exists +) + +print(f"Repository created: {repo.id}") +print(f"Default branch: {repo.properties.default_branch}") +``` + +**Expected Output:** +``` +Repository created: quickstart-repo +Default branch: main +``` + +### 3. Work with Branches + +```python +# Get the main branch +main_branch = repo.branch("main") + +# Create a new feature branch +feature_branch = repo.branch("feature-branch").create(source_reference="main") + +print(f"Created branch: {feature_branch.id}") +print(f"Source: {feature_branch.get_commit().id}") +``` + +**Expected Output:** +``` +Created branch: feature-branch +Source: c7a632d74f46c... +``` + +### 4. Upload and Manage Objects + +```python +# Upload a simple text file +obj = feature_branch.object("data/sample.txt").upload( + data="Hello from lakeFS!", + content_type="text/plain" +) + +print(f"Uploaded: {obj.path}") +print(f"Size: {obj.size_bytes} bytes") + +# Upload binary data +with open("local-file.csv", "rb") as f: + csv_obj = feature_branch.object("data/dataset.csv").upload( + data=f, + content_type="text/csv" + ) + +print(f"Uploaded CSV: {csv_obj.path}") +``` + +**Expected Output:** +``` +Uploaded: data/sample.txt +Size: 18 bytes +Uploaded CSV: data/dataset.csv +``` + +### 5. Read Objects + +```python +# Read text content directly +content = feature_branch.object("data/sample.txt").reader().read() +print(f"Content: {content}") + +# Stream large files +with feature_branch.object("data/dataset.csv").reader(mode="r") as f: + for line_num, line in enumerate(f, 1): + print(f"Line {line_num}: {line.strip()}") + if line_num >= 3: # Show first 3 lines + break +``` + +**Expected Output:** +``` +Content: Hello from lakeFS! +Line 1: name,age,city +Line 2: Alice,30,New York +Line 3: Bob,25,San Francisco +``` + +### 6. 
Commit Changes + +```python +# Check what's changed +changes = list(feature_branch.uncommitted()) +print(f"Uncommitted changes: {len(changes)}") +for change in changes: + print(f" {change.type}: {change.path}") + +# Commit the changes +commit = feature_branch.commit( + message="Add sample data files", + metadata={"author": "quickstart-guide", "purpose": "demo"} +) + +print(f"Commit ID: {commit.id}") +print(f"Message: {commit.message}") +``` + +**Expected Output:** +``` +Uncommitted changes: 2 + added: data/sample.txt + added: data/dataset.csv +Commit ID: a1b2c3d4e5f6... +Message: Add sample data files +``` + +### 7. Merge to Main + +```python +# Merge feature branch to main +merge_result = main_branch.merge( + source_ref="feature-branch", + message="Merge feature branch with sample data" +) + +print(f"Merge commit: {merge_result.id}") + +# Verify the files are now in main +main_objects = list(main_branch.objects(prefix="data/")) +print(f"Objects in main: {[obj.path for obj in main_objects]}") +``` + +**Expected Output:** +``` +Merge commit: f6e5d4c3b2a1... +Objects in main: ['data/sample.txt', 'data/dataset.csv'] +``` + +## Common Patterns + +### Working with Existing Repositories + +```python +# Connect to existing repository +repo = lakefs.repository("existing-repo") + +# List all branches +branches = list(repo.branches()) +print(f"Branches: {[b.id for b in branches]}") + +# Get specific branch +dev_branch = repo.branch("development") +``` + +### Batch Operations + +```python +# Upload multiple files efficiently +files_to_upload = [ + ("data/file1.txt", "Content 1"), + ("data/file2.txt", "Content 2"), + ("data/file3.txt", "Content 3") +] + +branch = repo.branch("batch-demo").create(source_reference="main") + +for path, content in files_to_upload: + branch.object(path).upload(data=content) + +# Commit all at once +commit = branch.commit("Add multiple files") +``` + +### Error Handling + +```python +from lakefs.exceptions import NotFoundException, ConflictException + +try: + # Try to create a branch that might already exist + branch = repo.branch("existing-branch").create(source_reference="main") +except ConflictException: + # Branch already exists, get it instead + branch = repo.branch("existing-branch") + print("Using existing branch") + +try: + # Try to read a file that might not exist + content = branch.object("missing-file.txt").reader().read() +except NotFoundException: + print("File not found, creating it...") + branch.object("missing-file.txt").upload(data="Default content") +``` + +## Complete Example + +Here's a complete workflow that demonstrates the key concepts: + +```python +import lakefs +from lakefs.exceptions import ConflictException + +def quickstart_workflow(): + # Create or connect to repository + repo = lakefs.repository("demo-repo").create( + storage_namespace="s3://my-bucket/demo", + exist_ok=True + ) + + # Create feature branch + try: + feature = repo.branch("add-data").create(source_reference="main") + except ConflictException: + feature = repo.branch("add-data") + + # Add some data + feature.object("users.csv").upload( + data="name,email\nAlice,alice@example.com\nBob,bob@example.com" + ) + + feature.object("config.json").upload( + data='{"version": "1.0", "environment": "production"}' + ) + + # Commit changes + commit = feature.commit("Add user data and configuration") + + # Merge to main + main = repo.branch("main") + merge_result = main.merge( + source_ref="add-data", + message="Merge user data" + ) + + print(f"Workflow complete! 
Merge commit: {merge_result.id}") + + # List final objects + objects = list(main.objects()) + print(f"Repository now contains: {[obj.path for obj in objects]}") + +if __name__ == "__main__": + quickstart_workflow() +``` + +## Key Points + +- **Lazy evaluation**: Objects are created without server calls until you perform actions +- **Fluent interface**: Chain operations for concise workflows +- **Automatic error handling**: Comprehensive exception types for different scenarios +- **Streaming support**: Efficient handling of large files with file-like objects +- **Batch operations**: Upload multiple objects before committing for better performance + +## Next Steps + +Now that you understand the basics, explore more advanced features: + +- **[Repository Management](repositories.md)** - Advanced repository operations and metadata +- **[Branch Operations](branches-and-commits.md)** - Detailed branching, merging, and versioning +- **[Object I/O](objects-and-io.md)** - Streaming, metadata, and advanced object operations +- **[Transactions](transactions.md)** - Atomic operations and rollback scenarios +- **[Import Operations](imports-and-exports.md)** - Bulk data import and export workflows + +## See Also + +**Prerequisites and Setup:** +- [Installation Guide](../getting-started.md) - Complete setup instructions +- [Authentication Methods](../getting-started.md#authentication-and-configuration) - All credential configuration options +- [SDK Selection Guide](../index.md#sdk-selection-decision-matrix) - Choose the right SDK + +**High-Level SDK Deep Dive:** +- [High-Level SDK Overview](index.md) - Architecture and key concepts +- [Repository Management](repositories.md) - Advanced repository operations and metadata +- [Branch Operations](branches-and-commits.md) - Detailed branching, merging, and versioning +- [Object I/O](objects-and-io.md) - Streaming, metadata, and advanced object operations +- [Transactions](transactions.md) - Atomic operations and rollback scenarios +- [Import Operations](imports-and-exports.md) - Bulk data import and export workflows +- [Advanced Features](advanced.md) - Performance optimization and patterns + +**Alternative SDK Options:** +- [Generated SDK Examples](../generated-sdk/examples.md) - Direct API access patterns +- [lakefs-spec Quickstart](../lakefs-spec/filesystem-api.md) - Filesystem-like operations +- [Boto3 S3 Operations](../boto3/s3-operations.md) - S3-compatible interface + +**Learning Resources:** +- [Data Science Tutorial](../tutorials/data-science-workflow.md) - End-to-end data analysis workflow +- [ETL Pipeline Tutorial](../tutorials/etl-pipeline.md) - Building production data pipelines +- [ML Experiment Tracking](../tutorials/ml-experiment-tracking.md) - Model versioning workflow + +**Reference Materials:** +- [API Comparison](../reference/api-comparison.md) - Feature comparison across all SDKs +- [Best Practices](../reference/best-practices.md) - Production deployment guidance +- [Troubleshooting](../reference/troubleshooting.md) - Common issues and solutions +- [Error Handling Patterns](../reference/troubleshooting.md#error-handling) - Exception handling strategies + +**External Resources:** +- [High-Level SDK API Reference](https://pydocs-lakefs.lakefs.io){:target="_blank"} - Complete API documentation +- [lakeFS Concepts](https://docs.lakefs.io/understand/){:target="_blank"} - Core lakeFS concepts and terminology \ No newline at end of file diff --git a/docs/src/integrations/python/high-level-sdk/repositories.md 
b/docs/src/integrations/python/high-level-sdk/repositories.md new file mode 100644 index 00000000000..3d05934ef79 --- /dev/null +++ b/docs/src/integrations/python/high-level-sdk/repositories.md @@ -0,0 +1,587 @@ +--- +title: Repository Management +description: Managing repositories with the High-Level Python SDK +sdk_types: ["high-level"] +difficulty: "beginner" +use_cases: ["repository-management", "setup", "configuration"] +topics: ["repositories", "management", "configuration", "metadata"] +audience: ["data-engineers", "developers", "python-developers"] +last_updated: "2024-01-15" +--- + +# Repository Management + +Learn how to create, configure, and manage lakeFS repositories using the High-Level Python SDK. Repositories are the top-level containers in lakeFS that hold all your data, branches, and version history. + +## Repository Concepts + +### Repository Structure +A lakeFS repository consists of: +- **Storage namespace**: The underlying storage location (S3 bucket, Azure container, etc.) +- **Default branch**: The main branch created automatically (typically "main") +- **Metadata**: Repository-level configuration and properties +- **Branches and tags**: Version control references within the repository + +### Lazy Initialization +Repository objects are created lazily - instantiating a `Repository` object doesn't immediately connect to the server. Operations only execute when you call action methods. + +## Creating Repositories + +### Basic Repository Creation + +```python +import lakefs + +# Create a new repository +repo = lakefs.repository("my-repo").create( + storage_namespace="s3://my-bucket/repos/my-repo" +) + +print(f"Repository created: {repo.id}") +print(f"Storage namespace: {repo.properties.storage_namespace}") +print(f"Default branch: {repo.properties.default_branch}") +``` + +**Expected Output:** +``` +Repository created: my-repo +Storage namespace: s3://my-bucket/repos/my-repo +Default branch: main +``` + +### Repository with Custom Configuration + +```python +# Create repository with custom settings +repo = lakefs.repository("custom-repo").create( + storage_namespace="s3://my-bucket/repos/custom", + default_branch="develop", + include_samples=True # Include sample data +) + +print(f"Created with default branch: {repo.properties.default_branch}") +``` + +**Expected Output:** +``` +Created with default branch: develop +``` + +### Safe Repository Creation + +```python +from lakefs.exceptions import ConflictException + +try: + # Try to create repository + repo = lakefs.repository("existing-repo").create( + storage_namespace="s3://my-bucket/repos/existing", + exist_ok=False # Fail if repository exists + ) + print("Repository created successfully") +except ConflictException: + print("Repository already exists") + # Connect to existing repository instead + repo = lakefs.repository("existing-repo") +``` + +### Using exist_ok Parameter + +```python +# Create repository or connect to existing one +repo = lakefs.repository("safe-repo").create( + storage_namespace="s3://my-bucket/repos/safe", + exist_ok=True # Don't fail if repository exists +) + +# This will either create a new repository or return the existing one +print(f"Repository ready: {repo.id}") +``` + +## Connecting to Existing Repositories + +### Basic Connection + +```python +# Connect to existing repository (no server call yet) +repo = lakefs.repository("existing-repo") + +# Access properties (triggers server call) +print(f"Repository: {repo.id}") +print(f"Created: {repo.properties.creation_date}") +print(f"Storage: 
{repo.properties.storage_namespace}") +``` + +### With Custom Client + +```python +from lakefs.client import Client + +# Use custom client configuration +client = Client( + username="custom-access-key", + password="custom-secret-key", + host="https://my-lakefs.example.com" +) + +repo = lakefs.Repository("my-repo", client=client) +``` + +## Repository Properties and Metadata + +### Accessing Repository Properties + +```python +repo = lakefs.repository("my-repo") + +# Get repository properties +props = repo.properties +print(f"Repository ID: {props.id}") +print(f"Creation Date: {props.creation_date}") +print(f"Default Branch: {props.default_branch}") +print(f"Storage Namespace: {props.storage_namespace}") + +# Properties are cached after first access +print(f"Same properties object: {props is repo.properties}") +``` + +**Expected Output:** +``` +Repository ID: my-repo +Creation Date: 1640995200 +Default Branch: main +Storage Namespace: s3://my-bucket/repos/my-repo +Same properties object: True +``` + +### Repository Metadata + +```python +# Access repository metadata +metadata = repo.metadata +print(f"Metadata: {metadata}") + +# Metadata is a dictionary of key-value pairs +for key, value in metadata.items(): + print(f"{key}: {value}") +``` + +**Expected Output:** +``` +Metadata: {'created_by': 'admin', 'purpose': 'production'} +created_by: admin +purpose: production +``` + +## Listing Repositories + +### List All Repositories + +```python +# List all repositories (using default client) +for repo in lakefs.repositories(): + print(f"Repository: {repo.id}") + print(f" Storage: {repo.properties.storage_namespace}") + print(f" Default branch: {repo.properties.default_branch}") + print(f" Created: {repo.properties.creation_date}") + print() +``` + +**Expected Output:** +``` +Repository: repo1 + Storage: s3://bucket1/repos/repo1 + Default branch: main + Created: 1640995200 + +Repository: repo2 + Storage: s3://bucket2/repos/repo2 + Default branch: develop + Created: 1641081600 +``` + +### Filtered Repository Listing + +```python +# List repositories with prefix filter +for repo in lakefs.repositories(prefix="prod-"): + print(f"Production repo: {repo.id}") + +# List repositories with pagination +for repo in lakefs.repositories(after="repo-m", max_amount=10): + print(f"Repository: {repo.id}") +``` + +### Using Custom Client for Listing + +```python +from lakefs.client import Client + +client = Client(host="https://my-lakefs.example.com") + +# List repositories using custom client +for repo in lakefs.repositories(client=client, prefix="team-"): + print(f"Team repository: {repo.id}") +``` + +## Repository Navigation + +### Accessing Branches + +```python +repo = lakefs.repository("my-repo") + +# Get specific branch +main_branch = repo.branch("main") +dev_branch = repo.branch("development") + +# List all branches +print("All branches:") +for branch in repo.branches(): + print(f" {branch.id}") + +# List branches with filtering +print("Feature branches:") +for branch in repo.branches(prefix="feature-"): + print(f" {branch.id}") +``` + +### Accessing Tags + +```python +# Get specific tag +v1_tag = repo.tag("v1.0.0") + +# List all tags +print("All tags:") +for tag in repo.tags(): + print(f" {tag.id}") + +# List recent tags +print("Recent tags:") +for tag in repo.tags(max_amount=5): + print(f" {tag.id}") +``` + +### Accessing References + +```python +# Access any reference (branch, commit, or tag) +main_ref = repo.ref("main") +commit_ref = repo.ref("c7a632d74f46c...") +tag_ref = repo.ref("v1.0.0") + +# Using 
ref expressions +previous_commit = repo.ref("main~1") # Previous commit on main +head_commit = repo.commit("c7a632d74f46c...") # Specific commit +``` + +## Repository Operations + +### Repository Information + +```python +repo = lakefs.repository("my-repo") + +# Display comprehensive repository information +def show_repo_info(repo): + props = repo.properties + metadata = repo.metadata + + print(f"Repository: {props.id}") + print(f"Created: {props.creation_date}") + print(f"Storage: {props.storage_namespace}") + print(f"Default Branch: {props.default_branch}") + + if metadata: + print("Metadata:") + for key, value in metadata.items(): + print(f" {key}: {value}") + + # Count branches and tags + branch_count = len(list(repo.branches(max_amount=1000))) + tag_count = len(list(repo.tags(max_amount=1000))) + + print(f"Branches: {branch_count}") + print(f"Tags: {tag_count}") + +show_repo_info(repo) +``` + +### Repository Statistics + +```python +def get_repo_stats(repo): + """Get comprehensive repository statistics""" + stats = { + 'id': repo.id, + 'properties': repo.properties._asdict(), + 'metadata': repo.metadata, + 'branches': [], + 'tags': [], + 'total_objects': 0 + } + + # Collect branch information + for branch in repo.branches(): + branch_info = { + 'id': branch.id, + 'commit_id': branch.get_commit().id, + 'object_count': len(list(branch.objects(max_amount=1000))) + } + stats['branches'].append(branch_info) + stats['total_objects'] += branch_info['object_count'] + + # Collect tag information + for tag in repo.tags(): + stats['tags'].append({ + 'id': tag.id, + 'commit_id': tag.get_commit().id + }) + + return stats + +# Get and display stats +stats = get_repo_stats(repo) +print(f"Repository {stats['id']} has {len(stats['branches'])} branches and {len(stats['tags'])} tags") +print(f"Total objects across all branches: {stats['total_objects']}") +``` + +## Repository Deletion + +### Basic Deletion + +```python +from lakefs.exceptions import NotFoundException + +repo = lakefs.repository("repo-to-delete") + +try: + repo.delete() + print("Repository deleted successfully") +except NotFoundException: + print("Repository not found") +``` + +### Safe Deletion with Confirmation + +```python +def delete_repository_safely(repo_id: str, confirm: bool = False): + """Safely delete a repository with confirmation""" + repo = lakefs.repository(repo_id) + + if not confirm: + print(f"This will permanently delete repository '{repo_id}'") + print("Set confirm=True to proceed") + return False + + try: + # Show repository info before deletion + props = repo.properties + print(f"Deleting repository: {props.id}") + print(f"Storage namespace: {props.storage_namespace}") + + repo.delete() + print("Repository deleted successfully") + return True + + except NotFoundException: + print(f"Repository '{repo_id}' not found") + return False + except Exception as e: + print(f"Error deleting repository: {e}") + return False + +# Usage +delete_repository_safely("test-repo", confirm=True) +``` + +## Error Handling + +### Common Repository Errors + +```python +from lakefs.exceptions import ( + NotFoundException, + ConflictException, + NotAuthorizedException, + ServerException +) + +def handle_repository_operations(): + try: + # Try various repository operations + repo = lakefs.repository("my-repo").create( + storage_namespace="s3://my-bucket/repos/my-repo" + ) + + except ConflictException: + print("Repository already exists") + repo = lakefs.repository("my-repo") + + except NotAuthorizedException: + print("Not authorized to create 
repository") + return None + + except ServerException as e: + print(f"Server error: {e}") + return None + + try: + # Access repository properties + props = repo.properties + print(f"Repository: {props.id}") + + except NotFoundException: + print("Repository not found") + return None + + return repo +``` + +### Validation and Best Practices + +```python +def validate_repository_config(repo_id: str, storage_namespace: str): + """Validate repository configuration before creation""" + + # Validate repository ID + if not repo_id or not repo_id.replace('-', '').replace('_', '').isalnum(): + raise ValueError("Repository ID must contain only alphanumeric characters, hyphens, and underscores") + + # Validate storage namespace + if not storage_namespace.startswith(('s3://', 'gs://', 'azure://', 'file://')): + raise ValueError("Storage namespace must start with a valid protocol (s3://, gs://, azure://, file://)") + + print(f"Configuration valid for repository: {repo_id}") + return True + +# Usage +try: + validate_repository_config("my-repo", "s3://my-bucket/repos/my-repo") + repo = lakefs.repository("my-repo").create( + storage_namespace="s3://my-bucket/repos/my-repo" + ) +except ValueError as e: + print(f"Configuration error: {e}") +``` + +## Advanced Repository Patterns + +### Repository Factory Pattern + +```python +class RepositoryManager: + """Centralized repository management""" + + def __init__(self, client=None): + self.client = client or lakefs.Client() + self._repositories = {} + + def get_or_create_repository(self, repo_id: str, storage_namespace: str, **kwargs): + """Get existing repository or create new one""" + if repo_id in self._repositories: + return self._repositories[repo_id] + + try: + repo = lakefs.Repository(repo_id, client=self.client).create( + storage_namespace=storage_namespace, + exist_ok=True, + **kwargs + ) + self._repositories[repo_id] = repo + return repo + + except Exception as e: + print(f"Failed to get/create repository {repo_id}: {e}") + return None + + def list_managed_repositories(self): + """List all managed repositories""" + return list(self._repositories.keys()) + +# Usage +manager = RepositoryManager() +repo1 = manager.get_or_create_repository("repo1", "s3://bucket/repo1") +repo2 = manager.get_or_create_repository("repo2", "s3://bucket/repo2") +``` + +### Repository Cloning Pattern + +```python +def clone_repository_structure(source_repo_id: str, target_repo_id: str, + target_storage: str): + """Clone repository structure (branches and tags) to new repository""" + + source = lakefs.repository(source_repo_id) + target = lakefs.repository(target_repo_id).create( + storage_namespace=target_storage, + exist_ok=True + ) + + # Clone branches + for branch in source.branches(): + if branch.id != source.properties.default_branch: + try: + target.branch(branch.id).create( + source_reference=source.properties.default_branch + ) + print(f"Cloned branch: {branch.id}") + except ConflictException: + print(f"Branch {branch.id} already exists") + + # Clone tags + for tag in source.tags(): + try: + commit_id = tag.get_commit().id + target.tag(tag.id).create(commit_id=commit_id) + print(f"Cloned tag: {tag.id}") + except ConflictException: + print(f"Tag {tag.id} already exists") + + return target + +# Usage +cloned_repo = clone_repository_structure("source-repo", "target-repo", "s3://bucket/target") +``` + +## Key Points + +- **Lazy evaluation**: Repository objects don't connect to server until you access properties or call methods +- **Caching**: Repository properties are cached 
after first access for performance +- **Error handling**: Use specific exception types for robust error handling +- **Navigation**: Use repository objects as entry points to access branches, tags, and references +- **Metadata**: Repository metadata provides additional configuration and information + +## See Also + +**High-Level SDK Workflow:** +- **[Quickstart Guide](quickstart.md)** - Basic repository operations and setup +- **[Branch Operations](branches-and-commits.md)** - Working with branches and commits +- **[Object Management](objects-and-io.md)** - Managing objects within repositories +- **[Transaction Patterns](transactions.md)** - Atomic operations across repositories +- **[Import/Export Operations](imports-and-exports.md)** - Bulk data operations + +**Repository Management:** +- **[High-Level SDK Overview](index.md)** - Architecture and key concepts +- **[Advanced Features](advanced.md)** - Performance optimization and patterns +- **[Generated SDK Access](../generated-sdk/direct-access.md)** - Direct API access for advanced operations + +**Alternative Approaches:** +- **[Generated SDK Repository API](../generated-sdk/api-reference.md#repository-operations)** - Direct API access +- **[lakefs-spec Limitations](../lakefs-spec/index.md#when-to-use-lakefs-spec)** - Why lakefs-spec doesn't support repository management +- **[Boto3 Limitations](../boto3/index.md#key-features)** - S3 compatibility doesn't include repository operations + +**Learning Resources:** +- **[Data Science Tutorial](../tutorials/data-science-workflow.md)** - Repository setup for data science workflows +- **[ETL Pipeline Tutorial](../tutorials/etl-pipeline.md)** - Repository management in data pipelines +- **[ML Experiment Tracking](../tutorials/ml-experiment-tracking.md)** - Repository organization for ML projects + +**Reference Materials:** +- **[API Comparison](../reference/api-comparison.md#core-repository-operations)** - Repository features across SDKs +- **[Best Practices](../reference/best-practices.md#repository-management)** - Production deployment guidance +- **[Troubleshooting](../reference/troubleshooting.md#repository-issues)** - Common repository issues and solutions + +**External Resources:** +- **[lakeFS Repository Concepts](https://docs.lakefs.io/understand/model.html#repository){:target="_blank"}** - Core lakeFS repository concepts +- **[High-Level SDK API Reference](https://pydocs-lakefs.lakefs.io/repository.html){:target="_blank"}** - Complete repository API documentation \ No newline at end of file diff --git a/docs/src/integrations/python/high-level-sdk/transactions.md b/docs/src/integrations/python/high-level-sdk/transactions.md new file mode 100644 index 00000000000..70309495dea --- /dev/null +++ b/docs/src/integrations/python/high-level-sdk/transactions.md @@ -0,0 +1,962 @@ +--- +title: Transaction Handling +description: Atomic operations using transactions in the High-Level Python SDK +sdk_types: ["high-level"] +difficulty: "intermediate" +use_cases: ["transactions", "atomic-operations", "data-consistency", "rollback"] +topics: ["transactions", "atomicity", "consistency", "rollback"] +audience: ["data-engineers", "developers", "python-developers"] +last_updated: "2024-01-15" +--- + +# Transaction Handling + +Learn how to use transactions for atomic operations that ensure data consistency in your lakeFS repositories. Transactions provide ACID-like properties for complex multi-object operations. 
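+
+Before looking at the mechanics in detail, here is a minimal sketch of the manual steps that a transaction automates, using the High-Level SDK and assuming a repository named `my-repo` with a `main` branch (the ephemeral branch name below is chosen only for illustration):
+
+```python
+import lakefs
+
+repo = lakefs.repository("my-repo")
+main = repo.branch("main")
+
+# 1. Create an ephemeral branch from the source branch
+tmp = repo.branch("tx-demo-ephemeral").create(source_reference="main")
+try:
+    # 2. Perform all changes on the ephemeral branch
+    tmp.object("data/file1.txt").upload(data="New content 1")
+    tmp.object("data/file2.txt").upload(data="New content 2")
+
+    # 3. Commit the changes on the ephemeral branch
+    tmp.commit(message="Atomic data update")
+
+    # 4. Merge the ephemeral branch back into the source branch
+    tmp.merge_into(main)
+finally:
+    # 5. Clean up the ephemeral branch
+    tmp.delete()
+```
+
+With `branch.transact()`, this whole sequence becomes a single `with` block, and a failure anywhere inside the block discards the changes instead of merging them.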
+ +## Transaction Concepts + +### How Transactions Work +lakeFS transactions use an ephemeral branch pattern: +1. **Create ephemeral branch** - A temporary branch is created from the source branch +2. **Perform operations** - All changes are made on the ephemeral branch +3. **Commit changes** - Changes are committed to the ephemeral branch +4. **Merge back** - The ephemeral branch is merged back to the source branch +5. **Cleanup** - The ephemeral branch is deleted + +### Transaction Properties +- **Atomic**: All operations succeed or all fail together +- **Consistent**: Repository remains in a valid state +- **Isolated**: Operations don't interfere with concurrent changes +- **Durable**: Committed changes are permanently stored + +### Important Notes +- Transactions don't "lock" the source branch +- Concurrent changes to the source branch may cause transaction conflicts +- Transactions are designed for related operations that should succeed or fail together + +## Basic Transaction Usage + +### Simple Transaction Pattern + +```python +import lakefs + +branch = lakefs.repository("my-repo").branch("main") + +# Basic transaction with context manager +with branch.transact(commit_message="Atomic data update") as tx: + # All operations within this block are atomic + tx.object("data/file1.txt").upload(data="New content 1") + tx.object("data/file2.txt").upload(data="New content 2") + + # If we reach the end without exceptions, changes are committed + # If an exception occurs, all changes are rolled back + +print("Transaction completed successfully") +``` + +**Expected Output:** +``` +Transaction completed successfully +``` + +### Transaction with Metadata + +```python +# Transaction with rich metadata +with branch.transact( + commit_message="Update user profiles and preferences", + commit_metadata={ + "operation": "user_data_update", + "version": "1.2.0", + "updated_by": "data_pipeline", + "batch_id": "batch_001" + } +) as tx: + # Upload user data + tx.object("users/profiles.json").upload( + data='{"users": [{"id": 1, "name": "Alice"}]}', + content_type="application/json" + ) + + # Upload preferences + tx.object("users/preferences.json").upload( + data='{"theme": "dark", "notifications": true}', + content_type="application/json" + ) + +print("User data updated atomically") +``` + +### Accessing Transaction Properties + +```python +with branch.transact(commit_message="Initial message") as tx: + # Access transaction properties + print(f"Transaction ID: {tx.id}") + print(f"Source branch: {tx.source_id}") + print(f"Commit message: {tx.commit_message}") + + # Modify transaction properties during execution + tx.commit_message = "Updated commit message" + tx.commit_metadata = {"updated": "true"} + + # Perform operations + tx.object("data/example.txt").upload(data="Transaction example") +``` + +## Transaction Operations + +### File Operations in Transactions + +```python +with branch.transact(commit_message="Comprehensive file operations") as tx: + # Upload new files + tx.object("data/new-dataset.csv").upload( + data="id,name,value\n1,Alice,100\n2,Bob,200", + content_type="text/csv" + ) + + # Update existing files + config_obj = tx.object("config/settings.json") + if config_obj.exists(): + # Read current config + current_config = json.loads(config_obj.reader().read()) + + # Update configuration + current_config["version"] = "2.0" + current_config["last_updated"] = "2024-01-15" + + # Write updated config + config_obj.upload( + data=json.dumps(current_config, indent=2), + content_type="application/json" + ) 
+ + # Delete old files + old_files = list(tx.objects(prefix="data/old/")) + for obj_info in old_files: + tx.object(obj_info.path).delete() + + # Batch delete using branch method + temp_files = ["temp/file1.txt", "temp/file2.txt", "temp/file3.txt"] + tx.delete_objects(temp_files) + +print("File operations completed atomically") +``` + +### Data Processing Pipeline + +```python +import json +import csv +from datetime import datetime + +def process_data_pipeline(branch, input_path, output_path): + """Process data through multiple stages atomically""" + + with branch.transact( + commit_message=f"Data pipeline: {input_path} -> {output_path}", + commit_metadata={ + "pipeline": "data_transformation", + "input": input_path, + "output": output_path, + "timestamp": datetime.now().isoformat() + } + ) as tx: + # Stage 1: Read and validate input + input_obj = tx.object(input_path) + if not input_obj.exists(): + raise ValueError(f"Input file {input_path} not found") + + raw_data = json.loads(input_obj.reader().read()) + + # Stage 2: Transform data + processed_records = [] + for record in raw_data.get("records", []): + processed_record = { + "id": record["id"], + "name": record["name"].upper(), + "value": record["value"] * 1.1, # Apply 10% increase + "processed_at": datetime.now().isoformat(), + "status": "processed" + } + processed_records.append(processed_record) + + # Stage 3: Write processed data + output_obj = tx.object(output_path) + with output_obj.writer(mode='w', content_type="text/csv") as writer: + if processed_records: + fieldnames = processed_records[0].keys() + csv_writer = csv.DictWriter(writer, fieldnames=fieldnames) + csv_writer.writeheader() + csv_writer.writerows(processed_records) + + # Stage 4: Create processing summary + summary = { + "input_file": input_path, + "output_file": output_path, + "records_processed": len(processed_records), + "processing_time": datetime.now().isoformat(), + "status": "completed" + } + + summary_path = f"summaries/processing_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + tx.object(summary_path).upload( + data=json.dumps(summary, indent=2), + content_type="application/json" + ) + + # Stage 5: Archive input file + archive_path = f"archive/{input_path}" + input_obj.copy(tx.source_id, archive_path) + input_obj.delete() + + return len(processed_records) + +# Usage +processed_count = process_data_pipeline( + branch, + "raw/user_data.json", + "processed/user_data.csv" +) +print(f"Pipeline completed: {processed_count} records processed") +``` + +## Error Handling and Rollback + +### Automatic Rollback on Exceptions + +```python +def demonstrate_rollback(branch): + """Demonstrate automatic rollback on transaction failure""" + + # Check initial state + initial_objects = list(branch.objects(prefix="demo/")) + print(f"Initial objects: {len(initial_objects)}") + + try: + with branch.transact(commit_message="Failing transaction") as tx: + # These operations will be rolled back + tx.object("demo/file1.txt").upload(data="Content 1") + tx.object("demo/file2.txt").upload(data="Content 2") + tx.object("demo/file3.txt").upload(data="Content 3") + + print("Files uploaded in transaction...") + + # Check objects within transaction + tx_objects = list(tx.objects(prefix="demo/")) + print(f"Objects in transaction: {len(tx_objects)}") + + # This will cause the transaction to fail and rollback + raise ValueError("Simulated error - transaction will rollback!") + + # This line won't be reached + tx.object("demo/file4.txt").upload(data="Content 4") + + except ValueError as e: + 
print(f"Transaction failed: {e}") + + # Check final state - should be same as initial + final_objects = list(branch.objects(prefix="demo/")) + print(f"Final objects: {len(final_objects)}") + print(f"Rollback successful: {len(initial_objects) == len(final_objects)}") + +# Run demonstration +demonstrate_rollback(branch) +``` + +**Expected Output:** +``` +Initial objects: 0 +Files uploaded in transaction... +Objects in transaction: 3 +Transaction failed: Simulated error - transaction will rollback! +Final objects: 0 +Rollback successful: True +``` + +### Conditional Transaction Execution + +```python +def conditional_update(branch, condition_check): + """Execute transaction only if conditions are met""" + + with branch.transact(commit_message="Conditional update") as tx: + # Check conditions before proceeding + config_obj = tx.object("config/settings.json") + + if not config_obj.exists(): + raise ValueError("Configuration file not found") + + config = json.loads(config_obj.reader().read()) + + # Apply condition check + if not condition_check(config): + raise ValueError("Conditions not met for update") + + # Proceed with updates + config["last_updated"] = datetime.now().isoformat() + config["update_count"] = config.get("update_count", 0) + 1 + + config_obj.upload( + data=json.dumps(config, indent=2), + content_type="application/json" + ) + + # Log the update + log_entry = { + "timestamp": datetime.now().isoformat(), + "action": "conditional_update", + "config_version": config.get("version", "unknown") + } + + tx.object("logs/updates.json").upload( + data=json.dumps(log_entry), + content_type="application/json" + ) + + return config + +# Example condition check +def version_check(config): + return config.get("version", 0) < 3 + +try: + updated_config = conditional_update(branch, version_check) + print("Conditional update successful") +except ValueError as e: + print(f"Update skipped: {e}") +``` + +### Transaction Retry Pattern + +```python +from lakefs.exceptions import TransactionException +import time +import random + +def retry_transaction(branch, operation_func, max_retries=3, base_delay=1): + """Execute transaction with exponential backoff retry""" + + for attempt in range(max_retries): + try: + with branch.transact( + commit_message=f"Retry operation (attempt {attempt + 1})" + ) as tx: + result = operation_func(tx) + print(f"Transaction succeeded on attempt {attempt + 1}") + return result + + except TransactionException as e: + if attempt == max_retries - 1: + print(f"Transaction failed after {max_retries} attempts") + raise + + # Exponential backoff with jitter + delay = base_delay * (2 ** attempt) + random.uniform(0, 1) + print(f"Attempt {attempt + 1} failed: {e}. 
Retrying in {delay:.1f}s...") + time.sleep(delay) + + except Exception as e: + # Non-transaction errors shouldn't be retried + print(f"Non-retryable error: {e}") + raise + +def risky_operation(tx): + """Simulate an operation that might fail due to conflicts""" + # Simulate random failure + if random.random() < 0.7: # 70% chance of failure + raise TransactionException("Simulated conflict") + + tx.object("data/result.txt").upload(data="Operation successful") + return "success" + +# Usage +try: + result = retry_transaction(branch, risky_operation) + print(f"Final result: {result}") +except Exception as e: + print(f"Operation ultimately failed: {e}") +``` + +## Advanced Transaction Patterns + +### Multi-Stage Data Processing + +```python +class DataProcessor: + """Multi-stage data processor using transactions""" + + def __init__(self, branch): + self.branch = branch + + def process_dataset(self, dataset_name, stages): + """Process dataset through multiple stages atomically""" + + with self.branch.transact( + commit_message=f"Process dataset: {dataset_name}", + commit_metadata={ + "dataset": dataset_name, + "stages": len(stages), + "processor": "DataProcessor" + } + ) as tx: + + current_data = None + stage_results = [] + + for i, stage in enumerate(stages): + stage_name = stage.__name__ + print(f"Executing stage {i+1}: {stage_name}") + + try: + # Execute stage + stage_result = stage(tx, current_data, dataset_name) + current_data = stage_result.get("data") + stage_results.append({ + "stage": stage_name, + "status": "success", + "output": stage_result.get("output_path"), + "records": stage_result.get("record_count", 0) + }) + + except Exception as e: + print(f"Stage {stage_name} failed: {e}") + raise # This will rollback the entire transaction + + # Create processing report + report = { + "dataset": dataset_name, + "processing_time": datetime.now().isoformat(), + "stages_completed": len(stage_results), + "stage_results": stage_results, + "status": "completed" + } + + report_path = f"reports/{dataset_name}_processing_report.json" + tx.object(report_path).upload( + data=json.dumps(report, indent=2), + content_type="application/json" + ) + + return report + +def extract_stage(tx, previous_data, dataset_name): + """Extract data from source""" + source_path = f"raw/{dataset_name}.json" + source_obj = tx.object(source_path) + + if not source_obj.exists(): + raise ValueError(f"Source data not found: {source_path}") + + data = json.loads(source_obj.reader().read()) + + return { + "data": data, + "output_path": source_path, + "record_count": len(data.get("records", [])) + } + +def transform_stage(tx, input_data, dataset_name): + """Transform extracted data""" + if not input_data: + raise ValueError("No input data for transformation") + + # Transform records + transformed_records = [] + for record in input_data.get("records", []): + transformed_record = { + "id": record["id"], + "name": record["name"].strip().title(), + "value": float(record["value"]) * 1.1, + "transformed_at": datetime.now().isoformat() + } + transformed_records.append(transformed_record) + + # Save transformed data + output_path = f"transformed/{dataset_name}.json" + transformed_data = {"records": transformed_records} + + tx.object(output_path).upload( + data=json.dumps(transformed_data, indent=2), + content_type="application/json" + ) + + return { + "data": transformed_data, + "output_path": output_path, + "record_count": len(transformed_records) + } + +def load_stage(tx, input_data, dataset_name): + """Load transformed data to final 
destination""" + if not input_data: + raise ValueError("No input data for loading") + + # Convert to CSV format + output_path = f"processed/{dataset_name}.csv" + + with tx.object(output_path).writer(mode='w', content_type="text/csv") as writer: + records = input_data.get("records", []) + if records: + fieldnames = records[0].keys() + csv_writer = csv.DictWriter(writer, fieldnames=fieldnames) + csv_writer.writeheader() + csv_writer.writerows(records) + + return { + "data": input_data, + "output_path": output_path, + "record_count": len(input_data.get("records", [])) + } + +# Usage +processor = DataProcessor(branch) +stages = [extract_stage, transform_stage, load_stage] + +try: + report = processor.process_dataset("user_data", stages) + print(f"Processing completed: {report['stages_completed']} stages") +except Exception as e: + print(f"Processing failed and rolled back: {e}") +``` + +### Concurrent Transaction Handling + +```python +import threading +import queue +import time + +class TransactionWorker: + """Worker for handling concurrent transactions""" + + def __init__(self, branch, worker_id): + self.branch = branch + self.worker_id = worker_id + self.results = [] + + def process_work_item(self, work_item): + """Process a single work item in a transaction""" + + item_id = work_item["id"] + operation = work_item["operation"] + data = work_item["data"] + + try: + with self.branch.transact( + commit_message=f"Worker {self.worker_id}: Process item {item_id}", + commit_metadata={ + "worker_id": str(self.worker_id), + "item_id": str(item_id), + "operation": operation + } + ) as tx: + + if operation == "create": + path = f"work_items/{item_id}.json" + tx.object(path).upload( + data=json.dumps(data, indent=2), + content_type="application/json" + ) + + elif operation == "update": + path = f"work_items/{item_id}.json" + existing_obj = tx.object(path) + + if existing_obj.exists(): + existing_data = json.loads(existing_obj.reader().read()) + existing_data.update(data) + existing_data["updated_by"] = f"worker_{self.worker_id}" + existing_data["updated_at"] = datetime.now().isoformat() + + existing_obj.upload( + data=json.dumps(existing_data, indent=2), + content_type="application/json" + ) + else: + raise ValueError(f"Item {item_id} not found for update") + + elif operation == "delete": + path = f"work_items/{item_id}.json" + tx.object(path).delete() + + # Log the operation + log_entry = { + "worker_id": self.worker_id, + "item_id": item_id, + "operation": operation, + "timestamp": datetime.now().isoformat(), + "status": "completed" + } + + log_path = f"logs/worker_{self.worker_id}_{int(time.time())}.json" + tx.object(log_path).upload( + data=json.dumps(log_entry), + content_type="application/json" + ) + + return {"status": "success", "item_id": item_id} + + except Exception as e: + return {"status": "error", "item_id": item_id, "error": str(e)} + +def concurrent_transaction_demo(branch, work_items, num_workers=3): + """Demonstrate concurrent transaction processing""" + + work_queue = queue.Queue() + result_queue = queue.Queue() + + # Add work items to queue + for item in work_items: + work_queue.put(item) + + def worker_thread(worker_id): + """Worker thread function""" + worker = TransactionWorker(branch, worker_id) + + while True: + try: + work_item = work_queue.get(timeout=1) + result = worker.process_work_item(work_item) + result_queue.put(result) + work_queue.task_done() + + except queue.Empty: + break + except Exception as e: + result_queue.put({ + "status": "error", + "worker_id": worker_id, + 
"error": str(e) + }) + + # Start worker threads + threads = [] + for i in range(num_workers): + thread = threading.Thread(target=worker_thread, args=(i,)) + thread.start() + threads.append(thread) + + # Wait for completion + work_queue.join() + + # Collect results + results = [] + while not result_queue.empty(): + results.append(result_queue.get()) + + # Wait for threads to finish + for thread in threads: + thread.join() + + return results + +# Example usage +work_items = [ + {"id": 1, "operation": "create", "data": {"name": "Item 1", "value": 100}}, + {"id": 2, "operation": "create", "data": {"name": "Item 2", "value": 200}}, + {"id": 3, "operation": "create", "data": {"name": "Item 3", "value": 300}}, + {"id": 1, "operation": "update", "data": {"value": 150}}, + {"id": 2, "operation": "delete", "data": {}}, +] + +results = concurrent_transaction_demo(branch, work_items) +print(f"Processed {len(results)} items concurrently") + +success_count = len([r for r in results if r["status"] == "success"]) +error_count = len([r for r in results if r["status"] == "error"]) +print(f"Success: {success_count}, Errors: {error_count}") +``` + +## Transaction Best Practices + +### Transaction Scope and Size + +```python +# ✅ Good: Focused, related operations +def good_transaction_example(branch): + with branch.transact(commit_message="Update user profile") as tx: + # Related operations that should succeed/fail together + tx.object("users/profile.json").upload(data=profile_data) + tx.object("users/preferences.json").upload(data=preferences_data) + tx.object("users/index.json").upload(data=updated_index) + +# ❌ Avoid: Unrelated operations in single transaction +def avoid_large_transaction(branch): + with branch.transact(commit_message="Daily batch processing") as tx: + # Too many unrelated operations + # This makes the transaction prone to conflicts and hard to debug + for i in range(1000): # Too many operations + tx.object(f"data/file_{i}.txt").upload(data=f"content {i}") + + # Unrelated operations mixed together + tx.object("config/settings.json").upload(data=config_data) + tx.object("logs/daily.log").upload(data=log_data) +``` + +### Optimal Batch Sizes + +```python +def optimal_batch_processing(branch, items, batch_size=50): + """Process items in optimal batch sizes""" + + total_batches = (len(items) + batch_size - 1) // batch_size + results = [] + + for batch_num in range(total_batches): + start_idx = batch_num * batch_size + end_idx = min(start_idx + batch_size, len(items)) + batch_items = items[start_idx:end_idx] + + try: + with branch.transact( + commit_message=f"Process batch {batch_num + 1}/{total_batches}", + commit_metadata={ + "batch_number": str(batch_num + 1), + "total_batches": str(total_batches), + "items_in_batch": str(len(batch_items)) + } + ) as tx: + + batch_results = [] + for item in batch_items: + # Process individual item + result = process_item(tx, item) + batch_results.append(result) + + # Create batch summary + summary = { + "batch_number": batch_num + 1, + "items_processed": len(batch_results), + "timestamp": datetime.now().isoformat() + } + + tx.object(f"summaries/batch_{batch_num + 1}.json").upload( + data=json.dumps(summary, indent=2), + content_type="application/json" + ) + + results.extend(batch_results) + print(f"Completed batch {batch_num + 1}/{total_batches}") + + except Exception as e: + print(f"Batch {batch_num + 1} failed: {e}") + # Continue with next batch rather than failing everything + continue + + return results + +def process_item(tx, item): + """Process a single 
item within transaction""" + path = f"processed/{item['id']}.json" + tx.object(path).upload( + data=json.dumps(item, indent=2), + content_type="application/json" + ) + return {"id": item["id"], "status": "processed"} +``` + +### Error Handling Strategies + +```python +class TransactionManager: + """Advanced transaction management with error handling""" + + def __init__(self, branch): + self.branch = branch + self.failed_operations = [] + + def execute_with_fallback(self, primary_operation, fallback_operation, + commit_message): + """Execute operation with fallback on failure""" + + # Try primary operation + try: + with self.branch.transact(commit_message=commit_message) as tx: + result = primary_operation(tx) + return {"status": "success", "method": "primary", "result": result} + + except Exception as primary_error: + print(f"Primary operation failed: {primary_error}") + + # Try fallback operation + try: + with self.branch.transact( + commit_message=f"{commit_message} (fallback)" + ) as tx: + result = fallback_operation(tx) + return {"status": "success", "method": "fallback", "result": result} + + except Exception as fallback_error: + print(f"Fallback operation also failed: {fallback_error}") + self.failed_operations.append({ + "commit_message": commit_message, + "primary_error": str(primary_error), + "fallback_error": str(fallback_error), + "timestamp": datetime.now().isoformat() + }) + return {"status": "failed", "errors": [primary_error, fallback_error]} + + def get_failed_operations(self): + """Get list of failed operations for analysis""" + return self.failed_operations + +# Example usage +def primary_data_update(tx): + """Primary method - more complex but preferred""" + # Complex data transformation + tx.object("data/complex_result.json").upload(data='{"method": "complex"}') + return "complex_processing_complete" + +def fallback_data_update(tx): + """Fallback method - simpler but reliable""" + # Simple data update + tx.object("data/simple_result.json").upload(data='{"method": "simple"}') + return "simple_processing_complete" + +manager = TransactionManager(branch) +result = manager.execute_with_fallback( + primary_data_update, + fallback_data_update, + "Update data with fallback" +) + +print(f"Operation result: {result}") +``` + +### Transaction Monitoring and Debugging + +```python +class TransactionMonitor: + """Monitor transaction performance and behavior""" + + def __init__(self): + self.transaction_logs = [] + + def monitored_transaction(self, branch, commit_message, operation_func, + commit_metadata=None): + """Execute transaction with monitoring""" + + start_time = time.time() + transaction_id = f"tx_{int(start_time)}" + + log_entry = { + "transaction_id": transaction_id, + "commit_message": commit_message, + "start_time": start_time, + "metadata": commit_metadata + } + + try: + with branch.transact( + commit_message=commit_message, + commit_metadata=commit_metadata + ) as tx: + # Log transaction start + print(f"🚀 Starting transaction: {transaction_id}") + + # Execute operation + result = operation_func(tx) + + # Calculate duration + duration = time.time() - start_time + + # Log success + log_entry.update({ + "status": "success", + "duration": duration, + "end_time": time.time(), + "result": str(result)[:100] # Truncate long results + }) + + print(f"✅ Transaction completed: {transaction_id} ({duration:.2f}s)") + + return result + + except Exception as e: + # Calculate duration + duration = time.time() - start_time + + # Log failure + log_entry.update({ + "status": "failed", + 
"duration": duration, + "end_time": time.time(), + "error": str(e) + }) + + print(f"❌ Transaction failed: {transaction_id} ({duration:.2f}s) - {e}") + raise + + finally: + self.transaction_logs.append(log_entry) + + def get_transaction_stats(self): + """Get transaction statistics""" + if not self.transaction_logs: + return {"message": "No transactions recorded"} + + successful = [log for log in self.transaction_logs if log["status"] == "success"] + failed = [log for log in self.transaction_logs if log["status"] == "failed"] + + avg_duration = sum(log["duration"] for log in self.transaction_logs) / len(self.transaction_logs) + + return { + "total_transactions": len(self.transaction_logs), + "successful": len(successful), + "failed": len(failed), + "success_rate": len(successful) / len(self.transaction_logs) * 100, + "average_duration": avg_duration, + "longest_transaction": max(self.transaction_logs, key=lambda x: x["duration"])["duration"], + "shortest_transaction": min(self.transaction_logs, key=lambda x: x["duration"])["duration"] + } + +# Usage example +monitor = TransactionMonitor() + +def sample_operation(tx): + tx.object("test/file1.txt").upload(data="test content 1") + tx.object("test/file2.txt").upload(data="test content 2") + time.sleep(0.1) # Simulate processing time + return "operation_complete" + +# Execute monitored transactions +for i in range(5): + try: + monitor.monitored_transaction( + branch, + f"Test transaction {i+1}", + sample_operation, + {"test_run": str(i+1)} + ) + except Exception: + pass # Continue with other transactions + +# Get statistics +stats = monitor.get_transaction_stats() +print(f"\n📊 Transaction Statistics:") +for key, value in stats.items(): + print(f" {key}: {value}") +``` + +## Key Points + +- **Atomic operations**: All changes in a transaction succeed or fail together +- **Ephemeral branches**: Transactions use temporary branches for isolation +- **No locking**: Transactions don't lock the source branch, conflicts are possible +- **Context managers**: Always use `with` statements for proper cleanup +- **Error handling**: Implement comprehensive error handling and fallback strategies +- **Batch sizing**: Keep transactions focused and reasonably sized +- **Monitoring**: Monitor transaction performance and success rates +- **Rollback**: Failed transactions automatically clean up ephemeral branches + +## See Also + +- **[Repository Management](repositories.md)** - Creating and managing repositories +- **[Branch Operations](branches-and-commits.md)** - Version control operations +- **[Object Operations](objects-and-io.md)** - Individual object management +- **[Import Operations](imports-and-exports.md)** - Bulk data operations +- **[Advanced Features](advanced.md)** - Advanced patterns and optimization +- **[Best Practices](../reference/best-practices.md)** - Production deployment guidance \ No newline at end of file diff --git a/docs/src/integrations/python/index.md b/docs/src/integrations/python/index.md new file mode 100644 index 00000000000..ca5996d4a06 --- /dev/null +++ b/docs/src/integrations/python/index.md @@ -0,0 +1,244 @@ +--- +title: Python Integration Overview +description: Comprehensive guide to using Python with lakeFS - SDK comparison and getting started +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "beginner" +use_cases: ["general", "decision-making", "getting-started"] +topics: ["overview", "comparison", "selection"] +audience: ["data-engineers", "data-scientists", "developers"] +last_updated: "2024-01-15" 
+--- + +# Python Integration with lakeFS + +lakeFS provides multiple Python integration options to suit different use cases and development patterns. This comprehensive guide helps you choose the right SDK and get started quickly. + +!!! warning "Legacy SDK Deprecation" + If your project is currently using the [legacy Python `lakefs-client`](https://pypi.org/project/lakefs-client/), please be aware that this version has been [deprecated](../posts/deprecate-py-legacy.md). As of release **v1.44.0**, it's no longer supported for new updates or features. + +## SDK Architecture and Relationships + +Understanding the relationship between different Python SDKs helps you make informed decisions: + +```mermaid +graph TD + A[Your Application] --> B[High-Level SDK] + A --> C[Generated SDK] + A --> D[lakefs-spec] + A --> E[Boto3] + + B --> C + C --> F[lakeFS API] + D --> F + E --> G[lakeFS S3 Gateway] + G --> F + + style B fill:#e1f5fe + style C fill:#f3e5f5 + style D fill:#e8f5e8 + style E fill:#fff3e0 +``` + +- **High-Level SDK** is built on top of the **Generated SDK**, providing simplified interfaces while maintaining access to the underlying client +- **Generated SDK** provides direct access to all lakeFS API endpoints based on the OpenAPI specification +- **lakefs-spec** offers a filesystem-like interface compatible with the fsspec ecosystem +- **Boto3** integrates through lakeFS's S3-compatible gateway + +## Comprehensive SDK Comparison + +| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | +|---------|----------------|---------------|-------------|-------| +| **Installation** | `pip install lakefs` | `pip install lakefs-sdk` | `pip install lakefs-spec` | `pip install boto3` | +| **API Style** | Object-oriented, simplified | Direct API mapping | Filesystem-like | S3-compatible | +| **Learning Curve** | Easy | Moderate | Easy | Easy (if familiar with S3) | +| **Repository Management** | ✅ Full support | ✅ Full support | ❌ Not supported | ❌ Not supported | +| **Branch Operations** | ✅ Simplified interface | ✅ Full API access | ❌ Limited | ❌ Not supported | +| **Object Operations** | ✅ Streaming I/O | ✅ Manual handling | ✅ File-like operations | ✅ S3-style operations | +| **Transactions** | ✅ Built-in support | ⚠️ Manual implementation | ✅ Context managers | ❌ Not supported | +| **Data Science Integration** | ⚠️ Via file-like objects | ❌ Manual integration | ✅ Native pandas/dask support | ⚠️ Via S3 compatibility | +| **Async Support** | ❌ Sync only | ⚠️ Limited | ❌ Sync only | ⚠️ Via aioboto3 | +| **Error Handling** | ✅ Pythonic exceptions | ✅ API-level exceptions | ✅ Filesystem exceptions | ✅ Boto3 exceptions | +| **Performance** | Good | Best (direct API) | Good | Good | +| **Maintenance** | lakeFS team | Auto-generated | Third-party | AWS/Community | + +### SDK Strengths and Use Cases + +#### High-Level SDK +**Strengths:** +- Intuitive, Pythonic API design +- Built-in transaction support with context managers +- Streaming I/O operations for large files +- Automatic connection management and retries +- Access to underlying Generated SDK when needed + +**Best for:** +- Data engineers building ETL pipelines +- Python developers new to lakeFS +- Applications requiring transaction semantics +- Workflows with large file uploads/downloads + +**Example use cases:** +- Data pipeline orchestration +- Batch data processing +- Model training workflows +- Data quality validation + +#### Generated SDK +**Strengths:** +- Complete API coverage (all endpoints available) +- Direct mapping to 
lakeFS REST API +- Fine-grained control over requests +- Auto-generated from OpenAPI specification +- Consistent with other language SDKs + +**Best for:** +- Advanced users needing full API control +- Custom tooling and integrations +- Operations not covered by High-Level SDK +- Performance-critical applications + +**Example use cases:** +- Custom lakeFS management tools +- Advanced metadata operations +- Integration with existing API frameworks +- Performance-optimized data access + +#### lakefs-spec +**Strengths:** +- Filesystem-like API familiar to Python developers +- Native integration with pandas, dask, and other fsspec libraries +- Transparent handling of lakeFS URIs +- Built-in transaction support +- Third-party maintained with active community + +**Best for:** +- Data scientists and analysts +- Jupyter notebook workflows +- Existing fsspec-based applications +- Quick prototyping and exploration + +**Example use cases:** +- Interactive data analysis +- Machine learning experimentation +- Data exploration in notebooks +- Integration with existing data science stacks + +#### Boto3 +**Strengths:** +- Familiar S3-compatible interface +- Minimal code changes from existing S3 workflows +- Extensive documentation and community support +- Integration with AWS ecosystem tools +- Support for multipart uploads and presigned URLs + +**Best for:** +- Migrating existing S3-based applications +- Teams familiar with AWS S3 +- Applications using S3-compatible tools +- Hybrid S3/lakeFS deployments + +**Example use cases:** +- S3 application migration +- Backup and archival workflows +- Integration with S3-compatible tools +- Gradual lakeFS adoption + +## SDK Selection Decision Matrix + +Use this decision tree to choose the right SDK for your needs: + +### 🤔 What's your primary use case? + +#### 📊 Data Science & Analytics +- **Working with pandas/dask?** → [lakefs-spec](lakefs-spec/) +- **Need transactions in notebooks?** → [lakefs-spec](lakefs-spec/) or [High-Level SDK](high-level-sdk/) +- **Building ML pipelines?** → [High-Level SDK](high-level-sdk/) + +#### 🔧 Data Engineering & ETL +- **Building data pipelines?** → [High-Level SDK](high-level-sdk/) +- **Need transaction support?** → [High-Level SDK](high-level-sdk/) +- **Processing large files?** → [High-Level SDK](high-level-sdk/) (streaming I/O) + +#### 🏗️ Application Development +- **Building lakeFS management tools?** → [Generated SDK](generated-sdk/) +- **Need full API control?** → [Generated SDK](generated-sdk/) +- **Integrating with existing systems?** → [Generated SDK](generated-sdk/) + +#### 🔄 Migration from S3 +- **Existing S3 codebase?** → [Boto3](boto3/) +- **Using S3-compatible tools?** → [Boto3](boto3/) +- **Gradual migration strategy?** → [Boto3](boto3/) + [Boto S3 Router](boto3/s3-router.md) + +### 🎯 Feature-Based Selection + +| If you need... | Choose... | Why? | +|----------------|-----------|------| +| **Simplest API** | High-Level SDK | Pythonic, intuitive interface | +| **Complete API access** | Generated SDK | All endpoints available | +| **Pandas integration** | lakefs-spec | Native fsspec support | +| **S3 compatibility** | Boto3 | Familiar S3 interface | +| **Transaction support** | High-Level SDK or lakefs-spec | Built-in context managers | +| **Streaming large files** | High-Level SDK | Optimized I/O operations | +| **Custom tooling** | Generated SDK | Full control and flexibility | +| **Jupyter notebooks** | lakefs-spec | Filesystem-like operations | + +### 🚀 Experience Level Guide + +#### New to lakeFS +1. 
Start with [Getting Started](getting-started.md) +2. Try [High-Level SDK](high-level-sdk/quickstart.md) for general use +3. Or [lakefs-spec](lakefs-spec/) for data science workflows + +#### Experienced with lakeFS +- Use [Generated SDK](generated-sdk/) for advanced operations +- Combine multiple SDKs as needed +- Check [best practices](reference/best-practices.md) for optimization + +#### Migrating from S3 +1. Review [Boto3 configuration](boto3/configuration.md) +2. Consider [Boto S3 Router](boto3/s3-router.md) for hybrid setups +3. Plan gradual migration with [migration guide](boto3/migration-guide.md) + +## Quick Start + +1. **[Getting Started](getting-started.md)** - Installation and setup guide +2. **Choose your SDK** - Select the appropriate SDK for your use case +3. **Follow tutorials** - Learn with real-world examples + +## Documentation Sections + +- **[Getting Started](getting-started.md)** - Installation, authentication, and basic setup +- **[High-Level SDK](high-level-sdk/)** - Comprehensive SDK documentation +- **[Generated SDK](generated-sdk/)** - Direct API access patterns +- **[lakefs-spec](lakefs-spec/)** - Filesystem API and data science integrations +- **[Boto3](boto3/)** - S3-compatible operations +- **[Tutorials](tutorials/)** - Real-world examples and workflows +- **[Reference](reference/)** - API comparison, best practices, and troubleshooting + +## Need Help? + +- Check the [troubleshooting guide](reference/troubleshooting.md) +- Review [best practices](reference/best-practices.md) +- Compare [API features](reference/api-comparison.md) + +## See Also + +**Getting Started:** +- [Installation and Setup Guide](getting-started.md) - Complete setup instructions for all SDKs +- [Authentication Methods](getting-started.md#authentication-and-configuration) - Configure credentials and connections + +**SDK-Specific Documentation:** +- [High-Level SDK Overview](high-level-sdk/index.md) - Simplified Python interface +- [Generated SDK Overview](generated-sdk/index.md) - Direct API access +- [lakefs-spec Overview](lakefs-spec/index.md) - Filesystem operations +- [Boto3 Integration](boto3/index.md) - S3-compatible interface + +**Learning Resources:** +- [Real-World Tutorials](tutorials/index.md) - End-to-end examples and workflows +- [Best Practices Guide](reference/best-practices.md) - Production deployment guidance +- [API Comparison Matrix](reference/api-comparison.md) - Feature comparison across SDKs + +**External Resources:** +- [High-Level SDK Documentation](https://pydocs-lakefs.lakefs.io){:target="_blank"} - Complete API reference +- [Generated SDK Documentation](https://pydocs-sdk.lakefs.io){:target="_blank"} - Auto-generated API docs +- [lakefs-spec Documentation](https://lakefs-spec.org/){:target="_blank"} - Third-party filesystem interface \ No newline at end of file diff --git a/docs/src/integrations/python/lakefs-spec/filesystem-api.md b/docs/src/integrations/python/lakefs-spec/filesystem-api.md new file mode 100644 index 00000000000..1f4324037f0 --- /dev/null +++ b/docs/src/integrations/python/lakefs-spec/filesystem-api.md @@ -0,0 +1,346 @@ +--- +title: Filesystem API Operations +description: Core filesystem operations using lakefs-spec +sdk_types: ["lakefs-spec"] +difficulty: "beginner" +use_cases: ["filesystem-operations", "file-management", "path-operations"] +topics: ["filesystem", "operations", "paths", "files"] +audience: ["data-scientists", "analysts", "python-developers"] +last_updated: "2024-01-15" +--- + +# Filesystem API Operations + +Learn how to perform 
standard filesystem operations using lakefs-spec's filesystem interface. + +## Basic Setup + +### Initialize Filesystem +```python +from lakefs_spec import LakeFSFileSystem + +# Auto-discover credentials from ~/.lakectl.yaml or environment +fs = LakeFSFileSystem() + +# Or specify credentials explicitly +fs = LakeFSFileSystem( + host="http://localhost:8000", + username="AKIAIOSFODNN7EXAMPLE", + password="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" +) +``` + +### URI Format +lakefs-spec uses the `lakefs://` URI scheme: +``` +lakefs://repository/branch/path/to/file +lakefs://my-repo/main/data/file.txt +lakefs://my-repo/feature-branch/datasets/data.parquet +``` + +## File Operations + +### Writing Files + +#### Write Text Files +```python +from pathlib import Path + +# Write string content directly +fs.write_text("lakefs://my-repo/main/data/sample.txt", "Hello, lakeFS!") + +# Write from local file +local_file = Path("local_data.txt") +local_file.write_text("Local content") +fs.put_file("local_data.txt", "lakefs://my-repo/main/data/uploaded.txt") +``` + +#### Write Binary Files +```python +# Write binary data +binary_data = b"Binary content here" +fs.write_bytes("lakefs://my-repo/main/data/binary.dat", binary_data) + +# Upload local binary file +fs.put_file("image.jpg", "lakefs://my-repo/main/images/uploaded.jpg") +``` + +#### Streaming Write +```python +# Write large files with streaming +with fs.open("lakefs://my-repo/main/data/large_file.txt", "w") as f: + for i in range(1000): + f.write(f"Line {i}\n") +``` + +### Reading Files + +#### Read Text Files +```python +# Read entire file as string +content = fs.read_text("lakefs://my-repo/main/data/sample.txt") +print(content) # "Hello, lakeFS!" + +# Download to local file +fs.get_file("lakefs://my-repo/main/data/sample.txt", "downloaded.txt") +``` + +#### Read Binary Files +```python +# Read binary data +binary_content = fs.read_bytes("lakefs://my-repo/main/data/binary.dat") + +# Download binary file +fs.get_file("lakefs://my-repo/main/images/uploaded.jpg", "downloaded.jpg") +``` + +#### Streaming Read +```python +# Read large files with streaming +with fs.open("lakefs://my-repo/main/data/large_file.txt", "r") as f: + for line in f: + print(line.strip()) +``` + +### File Information + +#### Check File Existence +```python +# Check if file exists +if fs.exists("lakefs://my-repo/main/data/sample.txt"): + print("File exists") +else: + print("File not found") +``` + +#### Get File Information +```python +# Get file info/stats +info = fs.info("lakefs://my-repo/main/data/sample.txt") +print(f"Size: {info['size']} bytes") +print(f"Type: {info['type']}") +print(f"Modified: {info['mtime']}") + +# Get detailed stats +stats = fs.stat("lakefs://my-repo/main/data/sample.txt") +print(f"Checksum: {stats.get('checksum')}") +print(f"Content-Type: {stats.get('content_type')}") +``` + +## Directory Operations + +### Listing Contents + +#### List Files and Directories +```python +# List all files in a directory +files = fs.ls("lakefs://my-repo/main/data/") +for file_path in files: + print(file_path) + +# List with details +files_detailed = fs.ls("lakefs://my-repo/main/data/", detail=True) +for file_info in files_detailed: + print(f"{file_info['name']} - {file_info['size']} bytes") +``` + +#### Recursive Listing +```python +# List all files recursively +all_files = fs.find("lakefs://my-repo/main/") +for file_path in all_files: + print(file_path) + +# Find specific file types +csv_files = fs.glob("lakefs://my-repo/main/**/*.csv") +for csv_file in csv_files: + 
print(csv_file) +``` + +#### Directory Tree +```python +# Walk directory tree +for root, dirs, files in fs.walk("lakefs://my-repo/main/data/"): + print(f"Directory: {root}") + for file in files: + print(f"  File: {file}") +``` + +### Creating Directories +```python +# Create directory (implicit with file creation) +fs.makedirs("lakefs://my-repo/main/new_directory/", exist_ok=True) + +# Directories are created automatically when writing files +fs.write_text("lakefs://my-repo/main/new_dir/file.txt", "Content") +``` + +## File Management + +### Copying Files +```python +# Copy within same repository +fs.copy( + "lakefs://my-repo/main/data/source.txt", + "lakefs://my-repo/main/data/copy.txt" +) + +# Copy between branches +fs.copy( + "lakefs://my-repo/main/data/file.txt", + "lakefs://my-repo/feature-branch/data/file.txt" +) + +# Copy multiple files +source_files = fs.glob("lakefs://my-repo/main/data/*.txt") +for source in source_files: + target = source.replace("/main/", "/backup/") + fs.copy(source, target) +``` + +### Moving/Renaming Files +```python +# Move/rename file +fs.move( + "lakefs://my-repo/main/data/old_name.txt", + "lakefs://my-repo/main/data/new_name.txt" +) + +# Move to different directory +fs.move( + "lakefs://my-repo/main/temp/file.txt", + "lakefs://my-repo/main/permanent/file.txt" +) +``` + +### Deleting Files +```python +# Delete single file +fs.rm("lakefs://my-repo/main/data/unwanted.txt") + +# Delete multiple files +files_to_delete = [ + "lakefs://my-repo/main/temp/file1.txt", + "lakefs://my-repo/main/temp/file2.txt" +] +fs.rm(files_to_delete) + +# Delete directory and all contents +fs.rm("lakefs://my-repo/main/temp/", recursive=True) +``` + +## Advanced Operations + +### Batch Operations +```python +def batch_upload(fs, local_dir, remote_base): + """Upload multiple files efficiently""" + from pathlib import Path + + local_path = Path(local_dir) + uploaded_files = [] + + for local_file in local_path.rglob("*"): + if local_file.is_file(): + # Calculate relative path + rel_path = local_file.relative_to(local_path) + remote_path = f"{remote_base}/{rel_path}" + + # Upload file + fs.put_file(str(local_file), remote_path) + uploaded_files.append(remote_path) + + return uploaded_files + +# Usage +uploaded = batch_upload(fs, "local_data/", "lakefs://my-repo/main/uploaded/") +print(f"Uploaded {len(uploaded)} files") +``` + +### Working with Metadata +```python +# Files with custom metadata are handled through the underlying lakeFS API +# lakefs-spec focuses on filesystem operations + +# Get file info including lakeFS-specific metadata +info = fs.info("lakefs://my-repo/main/data/file.txt") +if 'lakefs_metadata' in info: + print(f"lakeFS metadata: {info['lakefs_metadata']}") +``` + +### Error Handling +```python +# Missing paths raise Python's builtin FileNotFoundError, so no special import is needed + +def safe_file_operation(fs, path): + """Safely perform file operations with error handling""" + try: + if fs.exists(path): + content = fs.read_text(path) + return content + else: + print(f"File not found: {path}") + return None + except FileNotFoundError: + print(f"File not found: {path}") + return None + except PermissionError: + print(f"Permission denied: {path}") + return None + except Exception as e: + print(f"Unexpected error: {e}") + return None + +# Usage +content = safe_file_operation(fs, "lakefs://my-repo/main/data/file.txt") +``` + +## Performance Tips + +### Efficient File Operations +```python +# Use batch operations when possible +files_to_upload = ["file1.txt", "file2.txt", "file3.txt"] +remote_paths =
[f"lakefs://my-repo/main/data/{f}" for f in files_to_upload] + +# Upload in batch +for local, remote in zip(files_to_upload, remote_paths): + fs.put_file(local, remote) + +# Use streaming for large files +def stream_large_file(fs, local_path, remote_path): + with open(local_path, 'rb') as local_f: + with fs.open(remote_path, 'wb') as remote_f: + chunk_size = 8192 # 8KB chunks + while True: + chunk = local_f.read(chunk_size) + if not chunk: + break + remote_f.write(chunk) +``` + +### Caching and Connection Management +```python +# lakefs-spec handles connection pooling automatically +# For better performance with many operations, reuse the filesystem instance + +class LakeFSManager: + def __init__(self): + self.fs = LakeFSFileSystem() + + def upload_dataset(self, local_files, remote_base): + """Upload multiple files using the same filesystem instance""" + for local_file in local_files: + remote_path = f"{remote_base}/{Path(local_file).name}" + self.fs.put_file(local_file, remote_path) + +# Usage +manager = LakeFSManager() +manager.upload_dataset(["data1.csv", "data2.csv"], "lakefs://my-repo/main/datasets/") +``` + +## Next Steps + +- Learn about [data science integrations](integrations.md) +- Explore [transaction patterns](transactions.md) +- Check the [lakefs-spec documentation](https://lakefs-spec.org/) \ No newline at end of file diff --git a/docs/src/integrations/python/lakefs-spec/index.md b/docs/src/integrations/python/lakefs-spec/index.md new file mode 100644 index 00000000000..c35405eda9e --- /dev/null +++ b/docs/src/integrations/python/lakefs-spec/index.md @@ -0,0 +1,398 @@ +--- +title: lakefs-spec Integration +description: Filesystem API for lakeFS using lakefs-spec with comprehensive setup and configuration guide +sdk_types: ["lakefs-spec"] +difficulty: "beginner" +use_cases: ["data-science", "filesystem-operations", "pandas-integration", "jupyter-notebooks"] +topics: ["filesystem", "fsspec", "data-science", "uri-patterns"] +audience: ["data-scientists", "analysts", "python-developers"] +last_updated: "2024-01-15" +--- + +# lakefs-spec Integration + +The [lakefs-spec](https://lakefs-spec.org/) project provides a filesystem-like interface to lakeFS, built on the [fsspec](https://github.com/fsspec/filesystem_spec) ecosystem. This enables seamless integration with data science libraries like pandas, dask, and other fsspec-compatible tools. + +## Overview + +lakefs-spec implements the fsspec protocol for lakeFS, allowing you to use lakeFS repositories as if they were regular filesystems. This approach is particularly powerful for data science workflows where you want to leverage existing fsspec-compatible libraries without changing your code structure. 
+ +### Key Features + +- **Filesystem API** - Standard filesystem operations (read, write, list, delete, metadata) +- **fsspec Compatibility** - Works seamlessly with pandas, dask, and 100+ other fsspec libraries +- **URI-based Access** - Simple `lakefs://` URI scheme for intuitive path handling +- **Transaction Support** - Atomic operations with automatic rollback capabilities +- **Data Science Integration** - Native support for popular data science workflows +- **Path Resolution** - Intelligent handling of lakeFS repository, branch, and object paths + +### When to Use lakefs-spec + +Choose lakefs-spec when you need: + +- **Data Science Workflows** - Working with pandas, dask, or other data science tools +- **Filesystem Semantics** - Standard filesystem operations and path handling +- **Existing fsspec Code** - Migrating from other fsspec filesystems (S3, GCS, Azure) +- **Simple Integration** - Minimal code changes for lakeFS adoption +- **Library Compatibility** - Access to the broad fsspec ecosystem + +## Installation and Setup + +### Installation + +Install lakefs-spec using pip: + +```bash +# Basic installation +pip install lakefs-spec + +# With optional dependencies for enhanced functionality +pip install lakefs-spec[pandas,dask] + +# Development installation with all dependencies +pip install lakefs-spec[dev] +``` + +### System Requirements + +- Python 3.8 or higher +- lakeFS server 0.90.0 or higher +- Network access to your lakeFS instance + +## Configuration and Authentication + +lakefs-spec supports multiple authentication methods to connect to your lakeFS instance. + +### Method 1: Environment Variables + +Set environment variables for automatic authentication: + +```bash +export LAKEFS_ENDPOINT="https://your-lakefs-instance.com" +export LAKEFS_ACCESS_KEY_ID="your-access-key" +export LAKEFS_SECRET_ACCESS_KEY="your-secret-key" +``` + +```python +from lakefs_spec import LakeFSFileSystem + +# Automatically uses environment variables +fs = LakeFSFileSystem() +``` + +### Method 2: lakectl Configuration File + +lakefs-spec automatically discovers credentials from your lakectl configuration: + +```yaml +# ~/.lakectl.yaml +credentials: + access_key_id: your-access-key + secret_access_key: your-secret-key +server: + endpoint_url: https://your-lakefs-instance.com +``` + +```python +from lakefs_spec import LakeFSFileSystem + +# Automatically uses ~/.lakectl.yaml +fs = LakeFSFileSystem() +``` + +### Method 3: Direct Configuration + +Pass configuration directly to the filesystem: + +```python +from lakefs_spec import LakeFSFileSystem + +# Direct configuration +fs = LakeFSFileSystem( + host="https://your-lakefs-instance.com", + username="your-access-key", + password="your-secret-key" +) +``` + +### Method 4: Configuration Dictionary + +Use a configuration dictionary for complex setups: + +```python +from lakefs_spec import LakeFSFileSystem + +config = { + "host": "https://your-lakefs-instance.com", + "username": "your-access-key", + "password": "your-secret-key", + "verify_ssl": True, + "timeout": 30, + "retries": 3 +} + +fs = LakeFSFileSystem(**config) +``` + +### Advanced Configuration Options + +lakefs-spec supports additional configuration options for production environments: + +```python +from lakefs_spec import LakeFSFileSystem + +fs = LakeFSFileSystem( + host="https://your-lakefs-instance.com", + username="your-access-key", + password="your-secret-key", + + # SSL Configuration + verify_ssl=True, + ssl_cert_path="/path/to/cert.pem", + + # Connection Settings + timeout=60, + retries=5, + 
retry_delay=1.0, + + # Performance Settings + cache_size=1000, + block_size=5 * 1024 * 1024, # 5MB blocks + + # Logging + log_level="INFO" +) +``` + +## URI Patterns and Path Resolution + +lakefs-spec uses a structured URI scheme to access lakeFS resources: + +### URI Structure + +``` +lakefs://[repository]/[branch]/[path/to/object] +``` + +### Path Components + +- **Repository**: The lakeFS repository name +- **Branch**: The branch or commit reference +- **Path**: The object path within the repository + +### URI Examples + +```python +# Basic file access +'lakefs://my-repo/main/data/file.csv' + +# Nested directory structure +'lakefs://analytics/feature-branch/datasets/2024/january/sales.parquet' + +# Root directory listing +'lakefs://my-repo/main/' + +# Specific commit reference +'lakefs://my-repo/c7a4b2f8d9e1a3b5c6d7e8f9/data/snapshot.json' +``` + +### Path Resolution Examples + +```python +from lakefs_spec import LakeFSFileSystem +import pandas as pd + +fs = LakeFSFileSystem() + +# List repositories +repos = fs.ls('lakefs://') +print("Available repositories:", repos) + +# List branches in a repository +branches = fs.ls('lakefs://my-repo/') +print("Available branches:", branches) + +# List objects in a branch +objects = fs.ls('lakefs://my-repo/main/') +print("Objects in main branch:", objects) + +# Check if path exists +exists = fs.exists('lakefs://my-repo/main/data/file.csv') +print("File exists:", exists) + +# Get path information +info = fs.info('lakefs://my-repo/main/data/file.csv') +print("File info:", info) +``` + +### Working with Different Path Types + +```python +# Absolute paths +df = pd.read_csv('lakefs://repo/main/data/file.csv') + +# Relative paths (when working within a specific context) +fs = LakeFSFileSystem() +with fs.transaction('lakefs://repo/feature-branch/') as tx: + # Within transaction, paths are relative to the transaction root + df = pd.read_csv('data/file.csv') # Resolves to lakefs://repo/feature-branch/data/file.csv +``` + +## Quick Start Example + +Here's a complete example showing lakefs-spec setup and basic usage: + +```python +import pandas as pd +from lakefs_spec import LakeFSFileSystem + +# Initialize filesystem (uses environment variables or ~/.lakectl.yaml) +fs = LakeFSFileSystem() + +# Create sample data +data = { + 'timestamp': pd.date_range('2024-01-01', periods=100, freq='H'), + 'value': range(100), + 'category': ['A', 'B'] * 50 +} +df = pd.DataFrame(data) + +# Write data to lakeFS +df.to_parquet('lakefs://my-repo/main/data/sample.parquet', index=False) + +# Read data back +df_loaded = pd.read_parquet('lakefs://my-repo/main/data/sample.parquet') +print(f"Loaded {len(df_loaded)} rows") + +# List files in the data directory +files = fs.ls('lakefs://my-repo/main/data/') +print("Files in data directory:", files) + +# Check file metadata +info = fs.info('lakefs://my-repo/main/data/sample.parquet') +print("File size:", info['size'], "bytes") +``` + +## Filesystem API Concepts + +lakefs-spec implements standard filesystem operations that work consistently across different storage backends: + +### Core Operations + +- **Read Operations**: `open()`, `cat()`, `cat_file()` +- **Write Operations**: `open()` with write mode, `pipe()` +- **Directory Operations**: `ls()`, `mkdir()`, `rmdir()` +- **File Operations**: `rm()`, `mv()`, `cp()` +- **Metadata Operations**: `info()`, `exists()`, `size()` + +### Transaction Support + +lakefs-spec provides atomic operations through transactions: + +```python +# Atomic multi-file operation +with 
fs.transaction('lakefs://repo/branch/') as tx: + # All operations within this block are atomic + df1.to_parquet('data/file1.parquet') + df2.to_parquet('data/file2.parquet') + # Automatically commits on successful completion + # Automatically rolls back on any error +``` + +## Documentation Sections + +Explore detailed documentation for specific use cases: + +- **[Filesystem API](filesystem-api.md)** - Complete filesystem operations reference +- **[Data Science Integrations](integrations.md)** - pandas, dask, and other library examples +- **[Transactions](transactions.md)** - Atomic operation patterns and best practices + +## Troubleshooting + +### Common Issues + +**Authentication Errors** +```python +# Verify your configuration +fs = LakeFSFileSystem() +try: + repos = fs.ls('lakefs://') + print("Authentication successful") +except Exception as e: + print(f"Authentication failed: {e}") +``` + +**Path Resolution Issues** +```python +# Check if repository and branch exist +if fs.exists('lakefs://my-repo/'): + print("Repository exists") + if fs.exists('lakefs://my-repo/main/'): + print("Branch exists") + else: + print("Branch not found") +else: + print("Repository not found") +``` + +**Connection Issues** +```python +# Test connection with timeout +fs = LakeFSFileSystem(timeout=10) +try: + fs.ls('lakefs://') + print("Connection successful") +except TimeoutError: + print("Connection timeout - check network and endpoint") +``` + +## Next Steps + +- Learn about [filesystem operations](filesystem-api.md) for detailed API usage +- Explore [data science integrations](integrations.md) for pandas, dask, and other libraries +- Understand [transaction patterns](transactions.md) for atomic operations +- Check the [lakefs-spec documentation](https://lakefs-spec.org/) for advanced features + +## See Also + +**SDK Selection and Comparison:** +- [Python SDK Overview](../index.md) - Compare all Python SDK options +- [SDK Decision Matrix](../index.md#sdk-selection-decision-matrix) - Choose the right SDK for your use case +- [API Feature Comparison](../reference/api-comparison.md) - Detailed feature comparison across SDKs + +**lakefs-spec Documentation:** +- [Filesystem API Reference](filesystem-api.md) - Complete filesystem operations guide +- [Data Science Integrations](integrations.md) - pandas, dask, and other library examples +- [Transaction Patterns](transactions.md) - Atomic operations and rollback scenarios + +**Alternative SDK Options:** +- [High-Level SDK](../high-level-sdk/index.md) - Simplified Python interface with transactions +- [High-Level SDK Quickstart](../high-level-sdk/quickstart.md) - Object-oriented interface +- [Generated SDK](../generated-sdk/index.md) - Direct API access for custom operations +- [Boto3 Integration](../boto3/index.md) - S3-compatible operations + +**Setup and Configuration:** +- [Installation Guide](../getting-started.md) - Complete setup instructions for all SDKs +- [Authentication Methods](../getting-started.md#authentication-and-configuration) - All credential configuration options +- [Best Practices](../reference/best-practices.md#configuration) - Production configuration guidance + +**Data Science Workflows:** +- [Data Science Tutorial](../tutorials/data-science-workflow.md) - End-to-end data analysis workflow +- [pandas Integration Examples](integrations.md#pandas-integration) - Working with DataFrames +- [Jupyter Notebook Patterns](../tutorials/data-science-workflow.md#jupyter-notebooks) - Interactive analysis + +**Learning Resources:** +- [ETL Pipeline 
Tutorial](../tutorials/etl-pipeline.md) - Building data pipelines with filesystem operations +- [ML Experiment Tracking](../tutorials/ml-experiment-tracking.md) - Model versioning workflows +- [fsspec Ecosystem Guide](integrations.md#fsspec-ecosystem) - Compatible libraries and tools + +**Reference Materials:** +- [URI Pattern Guide](filesystem-api.md#uri-patterns) - Understanding lakefs:// URIs +- [Error Handling](../reference/troubleshooting.md#lakefs-spec-issues) - Common issues and solutions +- [Performance Optimization](../reference/best-practices.md#lakefs-spec-performance) - Optimize filesystem operations + +**External Resources:** +- [lakefs-spec Documentation](https://lakefs-spec.org/){:target="_blank"} - Official lakefs-spec documentation +- [fsspec Documentation](https://filesystem-spec.readthedocs.io/){:target="_blank"} - Core filesystem specification +- [pandas I/O Documentation](https://pandas.pydata.org/docs/user_guide/io.html){:target="_blank"} - pandas file operations +- [dask DataFrame Documentation](https://docs.dask.org/en/stable/dataframe.html){:target="_blank"} - dask integration patterns \ No newline at end of file diff --git a/docs/src/integrations/python/lakefs-spec/integrations.md b/docs/src/integrations/python/lakefs-spec/integrations.md new file mode 100644 index 00000000000..eb36a361d7f --- /dev/null +++ b/docs/src/integrations/python/lakefs-spec/integrations.md @@ -0,0 +1,478 @@ +--- +title: Data Science Library Integrations +description: Using lakefs-spec with pandas, dask, and other data science libraries +sdk_types: ["lakefs-spec"] +difficulty: "beginner" +use_cases: ["data-science", "pandas-integration", "dask-integration", "jupyter-notebooks"] +topics: ["integrations", "pandas", "dask", "data-science", "libraries"] +audience: ["data-scientists", "analysts", "python-developers"] +last_updated: "2024-01-15" +--- + +# Data Science Library Integrations + +lakefs-spec seamlessly integrates with popular data science libraries through the fsspec ecosystem, enabling direct data access with minimal code changes. 
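+ +The examples on this page assume credentials are discoverable from environment variables or `~/.lakectl.yaml`. When that is not possible (for example in CI jobs), fsspec-aware libraries such as pandas accept a `storage_options` dictionary that is forwarded to `LakeFSFileSystem`; the sketch below uses placeholder credentials and a hypothetical `my-repo` repository: + +```python +import pandas as pd + +# Placeholder credentials shown inline for clarity; prefer env vars or ~/.lakectl.yaml in practice +storage_options = { +    "host": "http://localhost:8000", +    "username": "AKIAIOSFODNN7EXAMPLE", +    "password": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", +} + +# pandas forwards storage_options to the lakefs:// filesystem implementation +df = pd.read_csv("lakefs://my-repo/main/data/sales.csv", storage_options=storage_options) +df.to_parquet("lakefs://my-repo/main/processed/sales.parquet", index=False, storage_options=storage_options) +```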
+ +## Pandas Integration + +### Reading Data with Pandas + +#### CSV Files +```python +import pandas as pd + +# Read CSV directly from lakeFS +df = pd.read_csv('lakefs://my-repo/main/data/sales.csv') +print(df.head()) + +# Read with specific parameters +df = pd.read_csv( + 'lakefs://my-repo/main/data/sales.csv', + parse_dates=['date'], + index_col='id' +) +``` + +#### Parquet Files +```python +# Read Parquet files +df = pd.read_parquet('lakefs://my-repo/main/data/sales.parquet') + +# Read specific columns +df = pd.read_parquet( + 'lakefs://my-repo/main/data/sales.parquet', + columns=['date', 'amount', 'customer_id'] +) + +# Read with filters +df = pd.read_parquet( + 'lakefs://my-repo/main/data/sales.parquet', + filters=[('amount', '>', 1000)] +) +``` + +#### JSON Files +```python +# Read JSON files +df = pd.read_json('lakefs://my-repo/main/data/events.json', lines=True) + +# Read nested JSON +df = pd.json_normalize( + pd.read_json('lakefs://my-repo/main/data/nested.json')['data'] +) +``` + +### Writing Data with Pandas + +#### Save DataFrames +```python +import pandas as pd + +# Create sample data +df = pd.DataFrame({ + 'date': pd.date_range('2024-01-01', periods=100), + 'value': range(100), + 'category': ['A', 'B'] * 50 +}) + +# Save as CSV +df.to_csv('lakefs://my-repo/main/processed/results.csv', index=False) + +# Save as Parquet +df.to_parquet('lakefs://my-repo/main/processed/results.parquet', index=False) + +# Save as JSON +df.to_json('lakefs://my-repo/main/processed/results.json', orient='records', lines=True) +``` + +#### Advanced Parquet Options +```python +# Save with compression +df.to_parquet( + 'lakefs://my-repo/main/data/compressed.parquet', + compression='snappy', + index=False +) + +# Save with partitioning +df.to_parquet( + 'lakefs://my-repo/main/data/partitioned/', + partition_cols=['category'], + index=False +) +``` + +### Data Processing Workflows + +#### ETL Pipeline Example +```python +import pandas as pd + +def process_sales_data(input_path, output_path): + """Complete ETL pipeline using pandas and lakefs-spec""" + + # Extract: Read raw data + raw_df = pd.read_csv(input_path) + + # Transform: Clean and process data + processed_df = raw_df.copy() + processed_df['date'] = pd.to_datetime(processed_df['date']) + processed_df['amount'] = processed_df['amount'].astype(float) + processed_df = processed_df.dropna() + + # Add calculated columns + processed_df['year'] = processed_df['date'].dt.year + processed_df['month'] = processed_df['date'].dt.month + processed_df['amount_category'] = pd.cut( + processed_df['amount'], + bins=[0, 100, 500, 1000, float('inf')], + labels=['small', 'medium', 'large', 'xlarge'] + ) + + # Load: Save processed data + processed_df.to_parquet(output_path, index=False) + + return len(processed_df) + +# Usage +records_processed = process_sales_data( + 'lakefs://my-repo/main/raw/sales.csv', + 'lakefs://my-repo/main/processed/sales_clean.parquet' +) +print(f"Processed {records_processed} records") +``` + +## Dask Integration + +### Distributed Processing with Dask + +#### Reading Large Datasets +```python +import dask.dataframe as dd + +# Read large CSV files with Dask +df = dd.read_csv('lakefs://my-repo/main/data/large_dataset.csv') + +# Read multiple files +df = dd.read_csv('lakefs://my-repo/main/data/*.csv') + +# Read Parquet with Dask +df = dd.read_parquet('lakefs://my-repo/main/data/partitioned_data/') +``` + +#### Processing Large Datasets +```python +import dask.dataframe as dd + +def process_large_dataset(): + """Process large datasets with Dask 
and lakefs-spec""" + + # Read large dataset + df = dd.read_parquet('lakefs://my-repo/main/raw/large_data/') + + # Perform distributed operations + result = (df + .groupby('category') + .amount.sum() + .compute()) # Trigger computation + + # Save results + result_df = pd.DataFrame({'category': result.index, 'total': result.values}) + result_df.to_csv('lakefs://my-repo/main/results/category_totals.csv', index=False) + + return result + +# Usage +totals = process_large_dataset() +print(totals) +``` + +#### Dask with Custom Storage Options +```python +import dask.dataframe as dd + +# Read with custom storage options (if needed) +df = dd.read_parquet( + 'lakefs://my-repo/main/data/dataset.parquet', + storage_options={ + 'host': 'http://localhost:8000', + 'username': 'access_key', + 'password': 'secret_key' + } +) +``` + +## Other Data Science Libraries + +### NumPy Integration +```python +import numpy as np +from lakefs_spec import LakeFSFileSystem + +fs = LakeFSFileSystem() + +# Save NumPy arrays +arr = np.random.rand(1000, 100) +with fs.open('lakefs://my-repo/main/arrays/data.npy', 'wb') as f: + np.save(f, arr) + +# Load NumPy arrays +with fs.open('lakefs://my-repo/main/arrays/data.npy', 'rb') as f: + loaded_arr = np.load(f) + +print(f"Array shape: {loaded_arr.shape}") +``` + +### Scikit-learn Integration +```python +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +import joblib +from lakefs_spec import LakeFSFileSystem + +def ml_workflow_with_lakefs(): + """Complete ML workflow using lakefs-spec""" + + # Load training data + df = pd.read_csv('lakefs://my-repo/main/data/training_data.csv') + + # Prepare features and target + X = df.drop('target', axis=1) + y = df['target'] + + # Split data + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + + # Train model + model = RandomForestClassifier() + model.fit(X_train, y_train) + + # Save model to lakeFS + fs = LakeFSFileSystem() + with fs.open('lakefs://my-repo/main/models/rf_model.pkl', 'wb') as f: + joblib.dump(model, f) + + # Save test results + predictions = model.predict(X_test) + results_df = pd.DataFrame({ + 'actual': y_test, + 'predicted': predictions + }) + results_df.to_csv('lakefs://my-repo/main/results/test_results.csv', index=False) + + return model.score(X_test, y_test) + +# Usage +accuracy = ml_workflow_with_lakefs() +print(f"Model accuracy: {accuracy:.3f}") +``` + +### Matplotlib/Seaborn Integration +```python +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from lakefs_spec import LakeFSFileSystem +import io + +def create_and_save_plots(): + """Create plots and save them to lakeFS""" + + # Load data + df = pd.read_csv('lakefs://my-repo/main/data/sales.csv') + + # Create plot + plt.figure(figsize=(10, 6)) + sns.lineplot(data=df, x='date', y='amount') + plt.title('Sales Over Time') + plt.xticks(rotation=45) + plt.tight_layout() + + # Save plot to lakeFS + fs = LakeFSFileSystem() + img_buffer = io.BytesIO() + plt.savefig(img_buffer, format='png', dpi=300) + img_buffer.seek(0) + + with fs.open('lakefs://my-repo/main/plots/sales_trend.png', 'wb') as f: + f.write(img_buffer.getvalue()) + + plt.close() + +# Usage +create_and_save_plots() +``` + +## Advanced Integration Patterns + +### Custom Data Loaders +```python +from lakefs_spec import LakeFSFileSystem +import pandas as pd + +class LakeFSDataLoader: + """Custom data loader for common data science tasks""" + + def __init__(self, repo, branch): + self.fs = 
LakeFSFileSystem() + self.base_path = f"lakefs://{repo}/{branch}" + + def load_dataset(self, dataset_name, file_format='parquet'): + """Load dataset by name""" + path = f"{self.base_path}/datasets/{dataset_name}.{file_format}" + + if file_format == 'parquet': + return pd.read_parquet(path) + elif file_format == 'csv': + return pd.read_csv(path) + elif file_format == 'json': + return pd.read_json(path, lines=True) + else: + raise ValueError(f"Unsupported format: {file_format}") + + def save_dataset(self, df, dataset_name, file_format='parquet'): + """Save dataset by name""" + path = f"{self.base_path}/datasets/{dataset_name}.{file_format}" + + if file_format == 'parquet': + df.to_parquet(path, index=False) + elif file_format == 'csv': + df.to_csv(path, index=False) + elif file_format == 'json': + df.to_json(path, orient='records', lines=True) + + def list_datasets(self): + """List available datasets""" + datasets_path = f"{self.base_path}/datasets/" + files = self.fs.ls(datasets_path) + return [f.split('/')[-1] for f in files if f.endswith(('.parquet', '.csv', '.json'))] + +# Usage +loader = LakeFSDataLoader('my-repo', 'main') + +# Load data +df = loader.load_dataset('sales_data') + +# Process data +processed_df = df.groupby('category').sum().reset_index() + +# Save results +loader.save_dataset(processed_df, 'sales_summary') + +# List available datasets +datasets = loader.list_datasets() +print(f"Available datasets: {datasets}") +``` + +### Jupyter Notebook Integration +```python +# In Jupyter notebooks, you can use lakefs-spec seamlessly + +import pandas as pd +import matplotlib.pyplot as plt + +# Load and visualize data directly from lakeFS +df = pd.read_csv('lakefs://my-repo/main/data/experiment_results.csv') + +# Create interactive plots +df.plot(x='date', y='metric', kind='line', figsize=(12, 6)) +plt.title('Experiment Results Over Time') +plt.show() + +# Save notebook outputs back to lakeFS +summary_stats = df.describe() +summary_stats.to_csv('lakefs://my-repo/main/analysis/summary_stats.csv') +``` + +### Multi-format Data Pipeline +```python +def multi_format_pipeline(repo, branch): + """Pipeline that handles multiple data formats""" + + base_path = f"lakefs://{repo}/{branch}" + + # Read different formats + csv_data = pd.read_csv(f"{base_path}/raw/data.csv") + json_data = pd.read_json(f"{base_path}/raw/events.json", lines=True) + parquet_data = pd.read_parquet(f"{base_path}/raw/metrics.parquet") + + # Combine and process + combined_data = pd.concat([ + csv_data.assign(source='csv'), + json_data.assign(source='json'), + parquet_data.assign(source='parquet') + ], ignore_index=True) + + # Save in different formats for different use cases + # Fast analytics: Parquet + combined_data.to_parquet(f"{base_path}/processed/combined.parquet", index=False) + + # Human readable: CSV + summary = combined_data.groupby('source').size().reset_index(name='count') + summary.to_csv(f"{base_path}/processed/summary.csv", index=False) + + # API consumption: JSON + combined_data.head(100).to_json( + f"{base_path}/processed/sample.json", + orient='records', + lines=True + ) + + return len(combined_data) + +# Usage +total_records = multi_format_pipeline('my-repo', 'main') +print(f"Processed {total_records} total records") +``` + +## Performance Optimization + +### Efficient Data Access Patterns +```python +# Use appropriate file formats for your use case +# Parquet for analytics workloads +df.to_parquet('lakefs://my-repo/main/data/analytics.parquet', + compression='snappy') + +# CSV for human-readable data 
+df.to_csv('lakefs://my-repo/main/data/readable.csv', index=False) + +# Use partitioning for large datasets +df.to_parquet('lakefs://my-repo/main/data/partitioned/', + partition_cols=['year', 'month'], + index=False) +``` + +### Caching Strategies +```python +# Cache frequently accessed data locally +import os + +def cached_read(lakefs_path, local_cache_dir='./cache'): + """Read with local caching""" + os.makedirs(local_cache_dir, exist_ok=True) + + # Create cache filename + cache_filename = lakefs_path.replace('/', '_').replace(':', '_') + cache_path = os.path.join(local_cache_dir, cache_filename) + + # Check if cached version exists + if os.path.exists(cache_path): + return pd.read_parquet(cache_path) + + # Read from lakeFS and cache + df = pd.read_parquet(lakefs_path) + df.to_parquet(cache_path, index=False) + + return df + +# Usage +df = cached_read('lakefs://my-repo/main/data/large_dataset.parquet') +``` + +## Next Steps + +- Learn about [transaction patterns](transactions.md) +- Explore the [lakefs-spec documentation](https://lakefs-spec.org/) +- Check out [pandas documentation](https://pandas.pydata.org/) for more data manipulation techniques \ No newline at end of file diff --git a/docs/src/integrations/python/lakefs-spec/transactions.md b/docs/src/integrations/python/lakefs-spec/transactions.md new file mode 100644 index 00000000000..dc27e15d3e6 --- /dev/null +++ b/docs/src/integrations/python/lakefs-spec/transactions.md @@ -0,0 +1,491 @@ +--- +title: Transaction Patterns with lakefs-spec +description: Atomic operations and transaction handling using lakefs-spec +sdk_types: ["lakefs-spec"] +difficulty: "intermediate" +use_cases: ["transactions", "atomic-operations", "data-consistency", "rollback"] +topics: ["transactions", "atomicity", "consistency", "context-managers"] +audience: ["data-scientists", "data-engineers", "python-developers"] +last_updated: "2024-01-15" +--- + +# Transaction Patterns with lakefs-spec + +lakefs-spec provides transaction support for atomic operations, ensuring data consistency across multiple file operations. 
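+ +As a minimal sketch of what that guarantee means in practice (assuming a repository named `my-repo` with a `main` branch, and the ephemeral-branch transaction behavior shown in the examples below), an exception raised inside the transaction block leaves the base branch untouched: + +```python +from lakefs_spec import LakeFSFileSystem + +fs = LakeFSFileSystem() + +# An error inside the block means nothing is merged into "main" +try: +    with fs.transaction("my-repo", "main") as tx: +        fs.write_text(f"my-repo/{tx.branch.id}/data/partial.txt", "half-written") +        raise RuntimeError("simulated failure before commit") +except RuntimeError: +    pass + +print(fs.exists("lakefs://my-repo/main/data/partial.txt"))  # expected: False +``` +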
## Transaction Basics + +### Simple Transaction +```python +from lakefs_spec import LakeFSFileSystem + +fs = LakeFSFileSystem() + +# Basic transaction pattern +with fs.transaction("my-repo", "main") as tx: + # All operations within this block are atomic + fs.write_text(f"my-repo/{tx.branch.id}/data/file1.txt", "Content 1") + fs.write_text(f"my-repo/{tx.branch.id}/data/file2.txt", "Content 2") + + # Commit changes + tx.commit(message="Add two files atomically") +``` + +### Transaction with Multiple Commits +```python +with fs.transaction("my-repo", "main") as tx: + # First set of changes + fs.write_text(f"my-repo/{tx.branch.id}/step1/data.txt", "Step 1 data") + tx.commit(message="Complete step 1") + + # Second set of changes + fs.write_text(f"my-repo/{tx.branch.id}/step2/data.txt", "Step 2 data") + tx.commit(message="Complete step 2") + + # Create a tag for the final state + final_commit = tx.commit(message="Final transaction state") + tx.tag(final_commit, name="transaction-v1.0") +``` + +## Data Processing Transactions + +### ETL Pipeline with Transactions +```python +import pandas as pd +from lakefs_spec import LakeFSFileSystem + +def atomic_etl_pipeline(repo, source_branch, raw_data_path): + """Perform ETL operations atomically""" + fs = LakeFSFileSystem() + + with fs.transaction(repo, source_branch) as tx: + branch_path = f"{repo}/{tx.branch.id}" + + # Extract: Read raw data + raw_df = pd.read_csv(f"lakefs://{branch_path}/{raw_data_path}") + + # Transform: Clean and process data + # Remove duplicates + clean_df = raw_df.drop_duplicates() + + # Handle missing values (forward fill) + clean_df = clean_df.ffill() + + # Add derived columns + clean_df['processed_date'] = pd.Timestamp.now() + clean_df['record_count'] = len(clean_df) + + # Load: Save processed data + clean_df.to_parquet(f"lakefs://{branch_path}/processed/clean_data.parquet", index=False) + + # Create summary statistics + summary = clean_df.describe() + summary.to_csv(f"lakefs://{branch_path}/processed/summary_stats.csv") + + # Commit all changes atomically + commit_sha = tx.commit(message=f"ETL pipeline: processed {len(clean_df)} records") + + # Tag the successful processing + tx.tag(commit_sha, name=f"etl-{pd.Timestamp.now().strftime('%Y%m%d-%H%M%S')}") + + return len(clean_df), commit_sha + +# Usage +try: + record_count, commit_id = atomic_etl_pipeline("my-repo", "main", "raw/sales_data.csv") + print(f"Successfully processed {record_count} records in commit {commit_id}") +except Exception as e: + print(f"ETL pipeline failed: {e}") + # All changes are automatically rolled back +``` + +### Multi-Dataset Processing +```python +def process_multiple_datasets(repo, branch, dataset_configs): + """Process multiple datasets atomically""" + fs = LakeFSFileSystem() + + with fs.transaction(repo, branch) as tx: + branch_path = f"{repo}/{tx.branch.id}" + processed_datasets = [] + + for config in dataset_configs: + try: + # Read source data + if config['format'] == 'csv': + df = pd.read_csv(f"lakefs://{branch_path}/{config['source_path']}") + elif config['format'] == 'parquet': + df = pd.read_parquet(f"lakefs://{branch_path}/{config['source_path']}") + + # Apply transformations + for transform in config.get('transformations', []): + df = apply_transformation(df, transform) + + # Save processed data + output_path = f"lakefs://{branch_path}/{config['output_path']}" + if config['output_format'] == 'parquet': + df.to_parquet(output_path, index=False) + elif config['output_format'] == 'csv': + df.to_csv(output_path, index=False) + +
processed_datasets.append({ + 'name': config['name'], + 'records': len(df), + 'output_path': config['output_path'] + }) + + except Exception as e: + print(f"Failed to process dataset {config['name']}: {e}") + raise # This will rollback the entire transaction + + # Create processing summary + summary_df = pd.DataFrame(processed_datasets) + summary_df.to_csv(f"lakefs://{branch_path}/processing_summary.csv", index=False) + + # Commit all changes + total_records = sum(d['records'] for d in processed_datasets) + commit_sha = tx.commit( + message=f"Processed {len(processed_datasets)} datasets ({total_records} total records)" + ) + + return processed_datasets, commit_sha + +def apply_transformation(df, transform): + """Apply a transformation to a DataFrame""" + if transform['type'] == 'filter': + return df.query(transform['condition']) + elif transform['type'] == 'select': + return df[transform['columns']] + elif transform['type'] == 'rename': + return df.rename(columns=transform['mapping']) + else: + return df + +# Usage +dataset_configs = [ + { + 'name': 'sales', + 'source_path': 'raw/sales.csv', + 'format': 'csv', + 'output_path': 'processed/sales_clean.parquet', + 'output_format': 'parquet', + 'transformations': [ + {'type': 'filter', 'condition': 'amount > 0'}, + {'type': 'select', 'columns': ['date', 'amount', 'customer_id']} + ] + }, + { + 'name': 'customers', + 'source_path': 'raw/customers.parquet', + 'format': 'parquet', + 'output_path': 'processed/customers_clean.parquet', + 'output_format': 'parquet', + 'transformations': [ + {'type': 'rename', 'mapping': {'cust_id': 'customer_id'}} + ] + } +] + +try: + results, commit_id = process_multiple_datasets("my-repo", "main", dataset_configs) + print(f"Successfully processed {len(results)} datasets in commit {commit_id}") +except Exception as e: + print(f"Multi-dataset processing failed: {e}") +``` + +## Machine Learning Workflows + +### Model Training with Transactions +```python +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score, classification_report +import joblib +import json + +def atomic_ml_training(repo, branch, data_path, model_config): + """Train ML model atomically with data versioning""" + fs = LakeFSFileSystem() + + with fs.transaction(repo, branch) as tx: + branch_path = f"{repo}/{tx.branch.id}" + + # Load training data + df = pd.read_parquet(f"lakefs://{branch_path}/{data_path}") + + # Prepare features and target + X = df.drop(model_config['target_column'], axis=1) + y = df[model_config['target_column']] + + # Split data + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=model_config.get('test_size', 0.2), random_state=42 + ) + + # Train model + model = RandomForestClassifier(**model_config.get('model_params', {})) + model.fit(X_train, y_train) + + # Evaluate model + y_pred = model.predict(X_test) + accuracy = accuracy_score(y_test, y_pred) + + # Save model artifacts + model_dir = f"lakefs://{branch_path}/models/{model_config['model_name']}" + + # Save the trained model + with fs.open(f"{model_dir}/model.pkl", 'wb') as f: + joblib.dump(model, f) + + # Save model metadata + metadata = { + 'model_name': model_config['model_name'], + 'accuracy': accuracy, + 'training_records': len(X_train), + 'test_records': len(X_test), + 'features': list(X.columns), + 'model_params': model_config.get('model_params', {}), + 'training_date': pd.Timestamp.now().isoformat() + } + + with 
fs.open(f"{model_dir}/metadata.json", 'w') as f: + json.dump(metadata, f, indent=2) + + # Save test results + results_df = pd.DataFrame({ + 'actual': y_test, + 'predicted': y_pred + }) + results_df.to_csv(f"{model_dir}/test_results.csv", index=False) + + # Save classification report + report = classification_report(y_test, y_pred, output_dict=True) + with fs.open(f"{model_dir}/classification_report.json", 'w') as f: + json.dump(report, f, indent=2) + + # Commit model training results + commit_sha = tx.commit( + message=f"Train {model_config['model_name']} model (accuracy: {accuracy:.3f})" + ) + + # Tag the model version + model_tag = f"{model_config['model_name']}-v{pd.Timestamp.now().strftime('%Y%m%d-%H%M%S')}" + tx.tag(commit_sha, name=model_tag) + + return { + 'accuracy': accuracy, + 'commit_sha': commit_sha, + 'model_tag': model_tag, + 'metadata': metadata + } + +# Usage +model_config = { + 'model_name': 'customer_churn_predictor', + 'target_column': 'churn', + 'test_size': 0.2, + 'model_params': { + 'n_estimators': 100, + 'max_depth': 10, + 'random_state': 42 + } +} + +try: + results = atomic_ml_training("my-repo", "main", "processed/customer_data.parquet", model_config) + print(f"Model training completed successfully:") + print(f" Accuracy: {results['accuracy']:.3f}") + print(f" Commit: {results['commit_sha']}") + print(f" Tag: {results['model_tag']}") +except Exception as e: + print(f"Model training failed: {e}") +``` + +## Error Handling and Recovery + +### Robust Transaction Patterns +```python +def robust_data_processing(repo, branch, operations): + """Process data with comprehensive error handling""" + fs = LakeFSFileSystem() + + try: + with fs.transaction(repo, branch) as tx: + branch_path = f"{repo}/{tx.branch.id}" + completed_operations = [] + + for i, operation in enumerate(operations): + try: + # Execute operation + result = execute_operation(fs, branch_path, operation) + completed_operations.append({ + 'operation': operation['name'], + 'status': 'success', + 'result': result + }) + + # Intermediate commit for long-running processes + if i % 5 == 4: # Commit every 5 operations + tx.commit(message=f"Completed operations {i-4} to {i}") + + except Exception as e: + print(f"Operation {operation['name']} failed: {e}") + completed_operations.append({ + 'operation': operation['name'], + 'status': 'failed', + 'error': str(e) + }) + + # Decide whether to continue or abort + if operation.get('critical', False): + raise # Abort transaction for critical operations + # Continue for non-critical operations + + # Save processing log + log_df = pd.DataFrame(completed_operations) + log_df.to_csv(f"lakefs://{branch_path}/processing_log.csv", index=False) + + # Final commit + successful_ops = sum(1 for op in completed_operations if op['status'] == 'success') + failed_ops = len(completed_operations) - successful_ops + + commit_sha = tx.commit( + message=f"Batch processing: {successful_ops} successful, {failed_ops} failed" + ) + + return completed_operations, commit_sha + + except Exception as e: + print(f"Transaction failed and rolled back: {e}") + return None, None + +def execute_operation(fs, branch_path, operation): + """Execute a single operation""" + if operation['type'] == 'transform': + # Load, transform, and save data + df = pd.read_csv(f"lakefs://{branch_path}/{operation['input_path']}") + # Apply transformation logic here + transformed_df = df # Placeholder + transformed_df.to_parquet(f"lakefs://{branch_path}/{operation['output_path']}", index=False) + return len(transformed_df) + + 
elif operation['type'] == 'aggregate': + # Aggregate data + df = pd.read_parquet(f"lakefs://{branch_path}/{operation['input_path']}") + aggregated = df.groupby(operation['group_by']).sum() + aggregated.to_csv(f"lakefs://{branch_path}/{operation['output_path']}") + return len(aggregated) + + else: + raise ValueError(f"Unknown operation type: {operation['type']}") + +# Usage +operations = [ + { + 'name': 'clean_sales_data', + 'type': 'transform', + 'input_path': 'raw/sales.csv', + 'output_path': 'processed/sales_clean.parquet', + 'critical': True + }, + { + 'name': 'aggregate_by_region', + 'type': 'aggregate', + 'input_path': 'processed/sales_clean.parquet', + 'output_path': 'aggregated/sales_by_region.csv', + 'group_by': 'region', + 'critical': False + } +] + +results, commit_id = robust_data_processing("my-repo", "main", operations) +if results: + print(f"Processing completed with commit {commit_id}") + for result in results: + print(f" {result['operation']}: {result['status']}") +``` + +## Advanced Transaction Patterns + +### Conditional Transactions +```python +def conditional_data_update(repo, branch, condition_check, update_operations): + """Perform updates only if conditions are met""" + fs = LakeFSFileSystem() + + with fs.transaction(repo, branch) as tx: + branch_path = f"{repo}/{tx.branch.id}" + + # Check conditions + condition_met = check_conditions(fs, branch_path, condition_check) + + if not condition_met: + print("Conditions not met, skipping updates") + return None + + # Perform updates + for operation in update_operations: + execute_update(fs, branch_path, operation) + + commit_sha = tx.commit(message="Conditional update completed") + return commit_sha + +def check_conditions(fs, branch_path, condition_check): + """Check if conditions are met for processing""" + if condition_check['type'] == 'file_exists': + return fs.exists(f"lakefs://{branch_path}/{condition_check['path']}") + + elif condition_check['type'] == 'data_threshold': + df = pd.read_csv(f"lakefs://{branch_path}/{condition_check['path']}") + return len(df) >= condition_check['min_records'] + + return False + +# Usage +condition = { + 'type': 'data_threshold', + 'path': 'raw/new_data.csv', + 'min_records': 1000 +} + +updates = [ + {'type': 'merge', 'source': 'raw/new_data.csv', 'target': 'processed/all_data.csv'} +] + +result = conditional_data_update("my-repo", "main", condition, updates) +``` + +## Best Practices + +### Transaction Design Principles + +1. **Keep Transactions Focused**: Group related operations together +2. **Handle Errors Gracefully**: Plan for partial failures +3. **Use Intermediate Commits**: For long-running processes +4. **Tag Important States**: Mark significant milestones +5. 
**Log Operations**: Maintain audit trails + +### Performance Considerations +```python +# Good: Focused transaction +with fs.transaction("my-repo", "main") as tx: + # Related operations only + process_daily_sales(tx.branch.id) + generate_daily_report(tx.branch.id) + tx.commit("Daily sales processing") + +# Avoid: Overly broad transaction +# with fs.transaction("my-repo", "main") as tx: +# # Too many unrelated operations +# process_sales() +# train_ml_model() +# generate_reports() +# cleanup_old_data() +``` + +## Next Steps + +- Review [filesystem operations](filesystem-api.md) for basic file handling +- Explore [data science integrations](integrations.md) for library-specific patterns +- Check the [lakefs-spec documentation](https://lakefs-spec.org/) for advanced features \ No newline at end of file diff --git a/docs/src/integrations/python/reference/api-comparison.md b/docs/src/integrations/python/reference/api-comparison.md new file mode 100644 index 00000000000..08bf2b3e6a8 --- /dev/null +++ b/docs/src/integrations/python/reference/api-comparison.md @@ -0,0 +1,374 @@ +--- +title: API Comparison +description: Comprehensive feature comparison across all Python SDK options +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "intermediate" +use_cases: ["general", "decision-making"] +--- + +# API Comparison + +This comprehensive comparison helps you choose the right Python SDK for your specific use case by comparing features, performance characteristics, and trade-offs across all available options. + +## Quick Decision Matrix + +| Use Case | Recommended SDK | Alternative | +|----------|----------------|-------------| +| **Data Science & Analytics** | lakefs-spec | High-Level SDK | +| **Production ETL Pipelines** | High-Level SDK | Generated SDK | +| **Existing S3 Workflows** | Boto3 | High-Level SDK | +| **Custom API Operations** | Generated SDK | High-Level SDK | +| **Jupyter Notebooks** | lakefs-spec | High-Level SDK | +| **ML Experiment Tracking** | High-Level SDK | lakefs-spec | +| **Large File Processing** | lakefs-spec | High-Level SDK | +| **Microservices Integration** | Generated SDK | High-Level SDK | + +## Feature Comparison Matrix + +### Core Repository Operations + +| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | +|---------|----------------|---------------|-------------|-------| +| **Repository Management** | +| Create Repository | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Delete Repository | ✅ Full | ✅ Full | ❌ None | ❌ None | +| List Repositories | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Repository Metadata | ✅ Full | ✅ Full | ❌ None | ❌ None | +| **Branch Operations** | +| Create Branch | ✅ Full | ✅ Full | ✅ Limited | ❌ None | +| Delete Branch | ✅ Full | ✅ Full | ✅ Limited | ❌ None | +| List Branches | ✅ Full | ✅ Full | ✅ Limited | ❌ None | +| Branch Protection | ✅ Full | ✅ Full | ❌ None | ❌ None | +| **Commit Operations** | +| Create Commit | ✅ Full | ✅ Full | ✅ Full | ❌ None | +| List Commits | ✅ Full | ✅ Full | ✅ Limited | ❌ None | +| Commit Metadata | ✅ Full | ✅ Full | ✅ Limited | ❌ None | +| Cherry Pick | ✅ Full | ✅ Full | ❌ None | ❌ None | + +### Object Operations + +| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | +|---------|----------------|---------------|-------------|-------| +| **Basic Operations** | +| Upload Object | ✅ Full | ✅ Full | ✅ Full | ✅ Full | +| Download Object | ✅ Full | ✅ Full | ✅ Full | ✅ Full | +| Delete Object | ✅ Full | ✅ Full | ✅ Full | ✅ Full | +| List Objects | ✅ Full | ✅ Full | ✅ Full 
| ✅ Full | +| **Advanced Operations** | +| Streaming I/O | ✅ Full | 🔶 Manual | ✅ Full | ✅ Full | +| Batch Operations | ✅ Full | 🔶 Manual | ✅ Full | ✅ Full | +| Object Metadata | ✅ Full | ✅ Full | ✅ Full | ✅ Full | +| Presigned URLs | ✅ Full | ✅ Full | ❌ None | ✅ Full | +| Multipart Upload | ✅ Full | ✅ Full | ✅ Full | ✅ Full | + +### Data Management Features + +| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | +|---------|----------------|---------------|-------------|-------| +| **Transactions** | +| Atomic Operations | ✅ Full | 🔶 Manual | ✅ Full | ❌ None | +| Rollback Support | ✅ Full | 🔶 Manual | ✅ Full | ❌ None | +| Context Managers | ✅ Full | ❌ None | ✅ Full | ❌ None | +| **Import/Export** | +| Data Import | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Import Status | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Export Operations | ✅ Full | ✅ Full | ❌ None | ❌ None | +| **Merge Operations** | +| Branch Merging | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Conflict Resolution | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Merge Strategies | ✅ Full | ✅ Full | ❌ None | ❌ None | + +### Integration Capabilities + +| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | +|---------|----------------|---------------|-------------|-------| +| **Data Science Libraries** | +| Pandas Integration | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Manual | +| Dask Integration | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Manual | +| PyArrow Integration | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Manual | +| **File System Interface** | +| fsspec Compatibility | 🔶 Limited | ❌ None | ✅ Native | 🔶 Limited | +| Path-like Operations | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Limited | +| Glob Patterns | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Limited | +| **S3 Compatibility** | +| S3 API Compatibility | ❌ None | ❌ None | ❌ None | ✅ Full | +| Existing S3 Code | ❌ None | ❌ None | ❌ None | ✅ Full | +| S3 Tools Integration | ❌ None | ❌ None | ❌ None | ✅ Full | + +## Performance Characteristics + +### Throughput Comparison + +| Operation Type | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | +|----------------|----------------|---------------|-------------|-------| +| **Small Files (< 1MB)** | +| Single Upload | Good | Good | Excellent | Good | +| Batch Upload | Excellent | Good | Excellent | Good | +| Single Download | Good | Good | Excellent | Good | +| Batch Download | Excellent | Good | Excellent | Good | +| **Large Files (> 100MB)** | +| Streaming Upload | Excellent | Good | Excellent | Excellent | +| Streaming Download | Excellent | Good | Excellent | Excellent | +| Multipart Upload | Excellent | Good | Excellent | Excellent | +| **Metadata Operations** | +| List Objects | Good | Good | Excellent | Good | +| Object Stats | Good | Good | Excellent | Good | +| Branch Operations | Excellent | Good | Good | N/A | + +### Memory Usage + +| SDK | Memory Efficiency | Notes | +|-----|------------------|-------| +| **High-Level SDK** | Good | Optimized for common patterns, connection pooling | +| **Generated SDK** | Fair | Direct API access, manual optimization needed | +| **lakefs-spec** | Excellent | Designed for large datasets, streaming-first | +| **Boto3** | Good | Mature S3 optimizations, configurable buffering | + +### Latency Characteristics + +| Operation | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | +|-----------|----------------|---------------|-------------|-------| +| **Connection Setup** | Fast | Fast | Fast | Fast | +| **Authentication** | Fast | Fast | Fast | Fast | +| **First Request** | Medium | Medium | Fast | Medium | +| 
**Subsequent Requests** | Fast | Fast | Fast | Fast | +| **Batch Operations** | Fast | Medium | Fast | Fast | + +## Trade-offs Analysis + +### High-Level SDK + +**Strengths:** +- Comprehensive feature set with advanced capabilities +- Built-in transaction support and error handling +- Optimized for common lakeFS workflows +- Excellent documentation and examples +- Connection pooling and performance optimizations + +**Weaknesses:** +- Additional abstraction layer may hide some API details +- Larger dependency footprint +- May not expose all Generated SDK capabilities immediately + +**Best For:** +- Production applications requiring robust error handling +- Complex workflows with transactions +- Teams wanting comprehensive lakeFS integration +- Applications requiring advanced features like imports/exports + +### Generated SDK + +**Strengths:** +- Direct access to all lakeFS API capabilities +- Minimal abstraction, maximum control +- Automatically updated with API changes +- Smaller dependency footprint +- Full async support where available + +**Weaknesses:** +- Requires more boilerplate code +- Manual error handling and retry logic +- No built-in transaction support +- Less optimized for common patterns + +**Best For:** +- Custom integrations requiring specific API access +- Microservices with minimal dependencies +- Applications needing fine-grained control +- Integration with existing API client patterns + +### lakefs-spec + +**Strengths:** +- Native fsspec integration for data science workflows +- Excellent performance for file operations +- Seamless integration with pandas, dask, and other libraries +- Optimized for large dataset operations +- Familiar filesystem interface + +**Weaknesses:** +- Limited repository management capabilities +- No direct access to advanced lakeFS features +- Focused primarily on file operations +- Third-party maintenance dependency + +**Best For:** +- Data science and analytics workflows +- Jupyter notebook environments +- Large dataset processing +- Integration with existing fsspec-based tools +- Teams familiar with filesystem interfaces + +### Boto3 + +**Strengths:** +- Full S3 API compatibility +- Seamless migration from existing S3 workflows +- Mature ecosystem and tooling support +- Excellent performance for object operations +- Familiar interface for AWS users + +**Weaknesses:** +- No access to lakeFS-specific features (branches, commits, etc.) 
+- Limited to object operations only +- Requires S3 Gateway configuration +- No transaction support + +**Best For:** +- Migrating existing S3-based applications +- Teams with strong AWS/S3 expertise +- Applications requiring S3 tool compatibility +- Simple object storage use cases + +## Decision Guidelines + +### Choose High-Level SDK When: + +- Building production applications with complex lakeFS workflows +- Need transaction support and advanced error handling +- Want comprehensive feature access with minimal code +- Team prefers high-level abstractions +- Building ETL pipelines or data management systems + +```python +# Example: Complex workflow with transactions +import lakefs + +client = lakefs.Client() +repo = client.repository("my-repo") + +with repo.branch("feature").transaction() as tx: + # Multiple operations in atomic transaction + tx.upload("data/file1.csv", data1) + tx.upload("data/file2.csv", data2) + # Automatically commits or rolls back +``` + +### Choose Generated SDK When: + +- Need access to specific API endpoints not covered by High-Level SDK +- Building microservices with minimal dependencies +- Require fine-grained control over API interactions +- Integrating with existing API client patterns +- Need async support for specific operations + +```python +# Example: Direct API access for custom operations +from lakefs_sdk import LakeFSApi, Configuration + +config = Configuration(host="http://localhost:8000") +api = LakeFSApi(config) + +# Direct API call with full control +response = api.list_repositories( + prefix="project-", + amount=100, + after="cursor" +) +``` + +### Choose lakefs-spec When: + +- Working primarily with data science libraries +- Processing large datasets with streaming requirements +- Using Jupyter notebooks for analysis +- Need filesystem-like interface +- Integrating with existing fsspec-based workflows + +```python +# Example: Data science workflow +import pandas as pd +import lakefs_spec + +# Direct pandas integration +df = pd.read_parquet("lakefs://repo/branch/data/dataset.parquet") +processed_df = df.groupby("category").sum() +processed_df.to_parquet("lakefs://repo/branch/results/summary.parquet") +``` + +### Choose Boto3 When: + +- Migrating existing S3-based applications +- Need S3 tool compatibility +- Simple object storage requirements +- Team has strong AWS expertise +- Using S3-compatible tools and libraries + +```python +# Example: S3-compatible operations +import boto3 + +s3 = boto3.client('s3', endpoint_url='http://localhost:8000') +s3.put_object( + Bucket='repo', + Key='branch/path/to/file.txt', + Body=data +) +``` + +## Migration Paths + +### From S3 to lakeFS + +1. **Start with Boto3**: Minimal code changes, immediate compatibility +2. **Add lakefs-spec**: For data science workflows requiring filesystem interface +3. **Upgrade to High-Level SDK**: For advanced lakeFS features and better integration + +### From File Systems to lakeFS + +1. **Start with lakefs-spec**: Familiar filesystem interface +2. **Add High-Level SDK**: For repository management and advanced features +3. 
**Consider Generated SDK**: For custom integrations and specific API needs + +### Between lakeFS SDKs + +- **Generated → High-Level**: Gradual migration, can access Generated SDK through High-Level +- **High-Level → Generated**: For specific API access, use `client.sdk` property +- **Any SDK → lakefs-spec**: For data science workflows, can run in parallel + +## See Also + +**SDK Selection and Setup:** +- [Python SDK Overview](../index.md) - Complete SDK overview and selection guide +- [SDK Decision Matrix](../index.md#sdk-selection-decision-matrix) - Interactive decision guide +- [Getting Started Guide](../getting-started.md) - Installation and setup for all SDKs +- [Authentication Methods](../getting-started.md#authentication-and-configuration) - Credential configuration + +**SDK-Specific Documentation:** +- [High-Level SDK Overview](../high-level-sdk/index.md) - Detailed High-Level SDK documentation +- [High-Level SDK Quickstart](../high-level-sdk/quickstart.md) - Basic operations and examples +- [Generated SDK Overview](../generated-sdk/index.md) - Direct API access patterns +- [Generated SDK Examples](../generated-sdk/examples.md) - Common usage patterns +- [lakefs-spec Overview](../lakefs-spec/index.md) - Filesystem interface documentation +- [lakefs-spec Integrations](../lakefs-spec/integrations.md) - Data science library examples +- [Boto3 Integration](../boto3/index.md) - S3-compatible operations +- [Boto3 Configuration](../boto3/configuration.md) - Setup and authentication + +**Feature-Specific Guides:** +- [Transaction Patterns](../high-level-sdk/transactions.md) - Atomic operations across SDKs +- [Object I/O Operations](../high-level-sdk/objects-and-io.md) - File handling patterns +- [Data Import/Export](../high-level-sdk/imports-and-exports.md) - Bulk data operations +- [Filesystem Operations](../lakefs-spec/filesystem-api.md) - File-like operations +- [S3 Operations](../boto3/s3-operations.md) - S3-compatible patterns + +**Learning Resources:** +- [Data Science Tutorial](../tutorials/data-science-workflow.md) - End-to-end workflow examples +- [ETL Pipeline Tutorial](../tutorials/etl-pipeline.md) - Building data pipelines +- [ML Experiment Tracking](../tutorials/ml-experiment-tracking.md) - Model versioning workflows + +**Reference Materials:** +- [Best Practices](best-practices.md) - Production deployment guidelines +- [Performance Optimization](best-practices.md#performance) - SDK performance tuning +- [Troubleshooting](troubleshooting.md) - Common issues and solutions +- [Error Handling Patterns](troubleshooting.md#error-handling) - Exception handling strategies + +**Migration Guides:** +- [S3 Migration Patterns](../boto3/s3-operations.md#migration-patterns) - Convert S3 code to lakeFS +- [SDK Migration Strategies](best-practices.md#sdk-migration) - Moving between SDKs +- [Legacy Integration](best-practices.md#legacy-integration) - Integrate with existing systems + +**External Resources:** +- [High-Level SDK API Reference](https://pydocs-lakefs.lakefs.io){:target="_blank"} - Complete API documentation +- [Generated SDK API Reference](https://pydocs-sdk.lakefs.io){:target="_blank"} - Auto-generated API docs +- [lakefs-spec Documentation](https://lakefs-spec.org/){:target="_blank"} - Third-party filesystem interface +- [Boto3 Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html){:target="_blank"} - Official Boto3 documentation \ No newline at end of file diff --git a/docs/src/integrations/python/reference/best-practices.md 
b/docs/src/integrations/python/reference/best-practices.md new file mode 100644 index 00000000000..6b647ea1432 --- /dev/null +++ b/docs/src/integrations/python/reference/best-practices.md @@ -0,0 +1,1033 @@ +--- +title: Production Best Practices +description: Comprehensive guide for deploying Python lakeFS applications in production +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "advanced" +use_cases: ["production", "deployment", "performance"] +--- + +# Production Best Practices + +This guide covers essential practices for deploying and operating Python applications with lakeFS in production environments, including performance optimization, security considerations, monitoring, and operational best practices. + +## Connection Management and Performance + +### Connection Pooling + +Proper connection management is crucial for production performance and resource utilization. + +#### High-Level SDK Connection Pooling + +```python +import lakefs +from lakefs.config import Config + +# Configure connection pooling +config = Config( + host="https://lakefs.example.com", + access_key_id="your-access-key", + secret_access_key="your-secret-key", + # Connection pool settings + pool_connections=30, + pool_maxsize=30, + max_retries=3, + backoff_factor=0.3 +) + +# Create client with optimized settings +client = lakefs.Client(config=config) + +# Reuse client across your application +class DataService: + def __init__(self): + self.client = client # Reuse the same client instance + + def process_data(self, repo_name, branch_name): + repo = self.client.repository(repo_name) + branch = repo.branch(branch_name) + # Operations use the pooled connections + return branch.objects() +``` + +#### Generated SDK Connection Configuration + +```python +from lakefs_sdk import Configuration, ApiClient +import urllib3 + +# Configure connection pooling for Generated SDK +configuration = Configuration( + host="https://lakefs.example.com", + access_token="your-access-token" +) + +# Configure urllib3 pool manager +http = urllib3.PoolManager( + num_pools=10, + maxsize=30, + retries=urllib3.Retry( + total=3, + backoff_factor=0.3, + status_forcelist=[500, 502, 503, 504] + ) +) + +# Create API client with custom pool manager +api_client = ApiClient(configuration) +api_client.rest_client.pool_manager = http +``` + +#### lakefs-spec Connection Optimization + +```python +import lakefs_spec +import fsspec + +# Configure lakefs-spec with connection pooling +fs = fsspec.filesystem( + 'lakefs', + host='https://lakefs.example.com', + access_key_id='your-access-key', + secret_access_key='your-secret-key', + # Connection pool settings + client_kwargs={ + 'pool_connections': 30, + 'pool_maxsize': 30, + 'max_retries': 3 + } +) + +# Use the same filesystem instance across operations +def process_files(file_paths): + for path in file_paths: + with fs.open(path, 'rb') as f: + # Process file using the pooled connection + data = f.read() + yield process_data(data) +``` + +### Performance Optimization Techniques + +#### Batch Operations + +```python +# High-Level SDK: Efficient batch operations +import lakefs +from concurrent.futures import ThreadPoolExecutor, as_completed + +client = lakefs.Client() +repo = client.repository("my-repo") +branch = repo.branch("main") + +# Batch upload with threading +def upload_file(file_info): + path, data = file_info + return branch.object(path).upload(data) + +files_to_upload = [ + ("data/file1.csv", data1), + ("data/file2.csv", data2), + # ... 
more files +] + +# Use thread pool for concurrent uploads +with ThreadPoolExecutor(max_workers=10) as executor: + futures = [executor.submit(upload_file, file_info) + for file_info in files_to_upload] + + for future in as_completed(futures): + try: + result = future.result() + print(f"Upload completed: {result}") + except Exception as e: + print(f"Upload failed: {e}") +``` + +#### Streaming for Large Files + +```python +# High-Level SDK: Streaming large files +def stream_large_file(repo, branch_name, object_path, local_path): + branch = repo.branch(branch_name) + obj = branch.object(object_path) + + # Stream upload for large files + with open(local_path, 'rb') as f: + obj.upload(f, mode='rb') + + # Stream download for large files + with obj.reader() as reader: + with open(local_path + '.downloaded', 'wb') as f: + for chunk in reader: + f.write(chunk) + +# lakefs-spec: Efficient streaming +import lakefs_spec + +def stream_with_lakefs_spec(source_path, dest_path): + # Direct streaming without loading into memory + with fsspec.open(source_path, 'rb') as src: + with fsspec.open(dest_path, 'wb') as dst: + # Stream in chunks + while True: + chunk = src.read(8192) # 8KB chunks + if not chunk: + break + dst.write(chunk) +``` + +#### Caching Strategies + +```python +import functools +import time +from typing import Dict, Any + +# Repository metadata caching +class CachedLakeFSClient: + def __init__(self, client): + self.client = client + self._repo_cache = {} + self._cache_ttl = 300 # 5 minutes + + @functools.lru_cache(maxsize=128) + def get_repository_info(self, repo_name: str) -> Dict[str, Any]: + """Cache repository metadata""" + repo = self.client.repository(repo_name) + return { + 'name': repo.id, + 'creation_date': repo.creation_date, + 'default_branch': repo.default_branch + } + + def get_branch_with_cache(self, repo_name: str, branch_name: str): + """Cache branch objects for reuse""" + cache_key = f"{repo_name}:{branch_name}" + current_time = time.time() + + if (cache_key in self._repo_cache and + current_time - self._repo_cache[cache_key]['timestamp'] < self._cache_ttl): + return self._repo_cache[cache_key]['branch'] + + repo = self.client.repository(repo_name) + branch = repo.branch(branch_name) + + self._repo_cache[cache_key] = { + 'branch': branch, + 'timestamp': current_time + } + + return branch +``` + +## Security Best Practices + +### Credential Management + +#### Environment-Based Configuration + +```python +import os +import lakefs +from lakefs.config import Config + +# Production credential management +def create_secure_client(): + # Never hardcode credentials + config = Config( + host=os.environ['LAKEFS_HOST'], + access_key_id=os.environ['LAKEFS_ACCESS_KEY_ID'], + secret_access_key=os.environ['LAKEFS_SECRET_ACCESS_KEY'] + ) + + # Validate configuration + if not all([config.host, config.access_key_id, config.secret_access_key]): + raise ValueError("Missing required lakeFS credentials") + + return lakefs.Client(config=config) + +# Use AWS Secrets Manager or similar +import boto3 +import json + +def get_lakefs_credentials_from_secrets(): + """Retrieve credentials from AWS Secrets Manager""" + secrets_client = boto3.client('secretsmanager') + + try: + response = secrets_client.get_secret_value( + SecretId='lakefs/production/credentials' + ) + credentials = json.loads(response['SecretString']) + + return Config( + host=credentials['host'], + access_key_id=credentials['access_key_id'], + secret_access_key=credentials['secret_access_key'] + ) + except Exception as e: + raise 
RuntimeError(f"Failed to retrieve credentials: {e}") +``` + +#### SSL/TLS Configuration + +```python +import ssl +import lakefs +from lakefs.config import Config + +# Production SSL configuration +def create_secure_ssl_client(): + config = Config( + host="https://lakefs.example.com", + access_key_id=os.environ['LAKEFS_ACCESS_KEY_ID'], + secret_access_key=os.environ['LAKEFS_SECRET_ACCESS_KEY'], + # SSL configuration + verify_ssl=True, + ssl_ca_cert="/path/to/ca-bundle.crt", # Custom CA if needed + cert_file="/path/to/client.crt", # Client certificate + key_file="/path/to/client.key" # Client private key + ) + + return lakefs.Client(config=config) + +# For Generated SDK +from lakefs_sdk import Configuration +import urllib3 + +def configure_ssl_for_generated_sdk(): + configuration = Configuration( + host="https://lakefs.example.com", + access_token="your-token" + ) + + # Custom SSL context + ssl_context = ssl.create_default_context() + ssl_context.check_hostname = True + ssl_context.verify_mode = ssl.CERT_REQUIRED + + # Configure urllib3 with custom SSL + http = urllib3.PoolManager( + ssl_context=ssl_context, + cert_reqs='CERT_REQUIRED', + ca_certs='/path/to/ca-bundle.crt' + ) + + return configuration, http +``` + +### Access Control and Permissions + +```python +import lakefs +from typing import List, Dict + +class SecureLakeFSService: + """Production service with access control""" + + def __init__(self, config): + self.client = lakefs.Client(config) + self.allowed_repositories = self._load_allowed_repositories() + self.user_permissions = self._load_user_permissions() + + def _load_allowed_repositories(self) -> List[str]: + """Load allowed repositories from configuration""" + return os.environ.get('ALLOWED_REPOSITORIES', '').split(',') + + def _validate_repository_access(self, repo_name: str, user_id: str): + """Validate user has access to repository""" + if repo_name not in self.allowed_repositories: + raise PermissionError(f"Access denied to repository: {repo_name}") + + user_perms = self.user_permissions.get(user_id, []) + if repo_name not in user_perms: + raise PermissionError(f"User {user_id} lacks access to {repo_name}") + + def safe_repository_access(self, repo_name: str, user_id: str): + """Safely access repository with validation""" + self._validate_repository_access(repo_name, user_id) + return self.client.repository(repo_name) +``` + +## Deployment Best Practices + +### Containerization + +#### Docker Configuration + +```dockerfile +# Production Dockerfile +FROM python:3.11-slim + +# Create non-root user +RUN groupadd -r lakefs && useradd -r -g lakefs lakefs + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY --chown=lakefs:lakefs . 
/app +WORKDIR /app + +# Switch to non-root user +USER lakefs + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python -c "import lakefs; client = lakefs.Client(); client.repositories.list()" || exit 1 + +CMD ["python", "app.py"] +``` + +#### Kubernetes Deployment + +```yaml +# production-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: lakefs-python-app +spec: + replicas: 3 + selector: + matchLabels: + app: lakefs-python-app + template: + metadata: + labels: + app: lakefs-python-app + spec: + containers: + - name: app + image: your-registry/lakefs-python-app:latest + env: + - name: LAKEFS_HOST + valueFrom: + secretKeyRef: + name: lakefs-credentials + key: host + - name: LAKEFS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: lakefs-credentials + key: access_key_id + - name: LAKEFS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: lakefs-credentials + key: secret_access_key + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 +``` + +### Configuration Management + +```python +import os +from dataclasses import dataclass +from typing import Optional +import lakefs + +@dataclass +class ProductionConfig: + """Production configuration management""" + lakefs_host: str + lakefs_access_key_id: str + lakefs_secret_access_key: str + + # Performance settings + connection_pool_size: int = 30 + max_retries: int = 3 + timeout: int = 30 + + # Security settings + verify_ssl: bool = True + ssl_ca_cert: Optional[str] = None + + # Operational settings + log_level: str = "INFO" + metrics_enabled: bool = True + + @classmethod + def from_environment(cls): + """Load configuration from environment variables""" + return cls( + lakefs_host=os.environ['LAKEFS_HOST'], + lakefs_access_key_id=os.environ['LAKEFS_ACCESS_KEY_ID'], + lakefs_secret_access_key=os.environ['LAKEFS_SECRET_ACCESS_KEY'], + connection_pool_size=int(os.environ.get('LAKEFS_POOL_SIZE', '30')), + max_retries=int(os.environ.get('LAKEFS_MAX_RETRIES', '3')), + timeout=int(os.environ.get('LAKEFS_TIMEOUT', '30')), + verify_ssl=os.environ.get('LAKEFS_VERIFY_SSL', 'true').lower() == 'true', + ssl_ca_cert=os.environ.get('LAKEFS_SSL_CA_CERT'), + log_level=os.environ.get('LOG_LEVEL', 'INFO'), + metrics_enabled=os.environ.get('METRICS_ENABLED', 'true').lower() == 'true' + ) + + def create_client(self) -> lakefs.Client: + """Create configured lakeFS client""" + config = lakefs.Config( + host=self.lakefs_host, + access_key_id=self.lakefs_access_key_id, + secret_access_key=self.lakefs_secret_access_key, + pool_connections=self.connection_pool_size, + pool_maxsize=self.connection_pool_size, + max_retries=self.max_retries, + verify_ssl=self.verify_ssl, + ssl_ca_cert=self.ssl_ca_cert + ) + + return lakefs.Client(config=config) +``` + +## Monitoring and Observability + +### Logging Configuration + +```python +import logging +import structlog +import lakefs +from pythonjsonlogger import jsonlogger + +def setup_production_logging(): + """Configure structured logging for production""" + + # Configure structlog + structlog.configure( + processors=[ + structlog.stdlib.filter_by_level, + structlog.stdlib.add_logger_name, + structlog.stdlib.add_log_level, + structlog.stdlib.PositionalArgumentsFormatter(), + structlog.processors.TimeStamper(fmt="iso"), 
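+            # Attach stack traces and formatted exception info before the event is rendered as JSON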
+ structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + structlog.processors.UnicodeDecoder(), + structlog.processors.JSONRenderer() + ], + context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), + wrapper_class=structlog.stdlib.BoundLogger, + cache_logger_on_first_use=True, + ) + + # Configure standard logging + handler = logging.StreamHandler() + formatter = jsonlogger.JsonFormatter( + '%(asctime)s %(name)s %(levelname)s %(message)s' + ) + handler.setFormatter(formatter) + + # Set up lakeFS client logging + lakefs_logger = logging.getLogger('lakefs') + lakefs_logger.setLevel(logging.INFO) + lakefs_logger.addHandler(handler) + + return structlog.get_logger() + +# Usage in application +logger = setup_production_logging() + +class LakeFSService: + def __init__(self): + self.client = lakefs.Client() + self.logger = logger.bind(service="lakefs") + + def upload_data(self, repo_name, branch_name, path, data): + self.logger.info( + "Starting data upload", + repo=repo_name, + branch=branch_name, + path=path, + size=len(data) + ) + + try: + repo = self.client.repository(repo_name) + branch = repo.branch(branch_name) + result = branch.object(path).upload(data) + + self.logger.info( + "Data upload completed", + repo=repo_name, + branch=branch_name, + path=path, + checksum=result.checksum + ) + + return result + + except Exception as e: + self.logger.error( + "Data upload failed", + repo=repo_name, + branch=branch_name, + path=path, + error=str(e), + exc_info=True + ) + raise +``` + +### Metrics and Monitoring + +```python +import time +import functools +from prometheus_client import Counter, Histogram, Gauge, start_http_server +import lakefs + +# Prometheus metrics +LAKEFS_OPERATIONS = Counter( + 'lakefs_operations_total', + 'Total lakeFS operations', + ['operation', 'repository', 'status'] +) + +LAKEFS_OPERATION_DURATION = Histogram( + 'lakefs_operation_duration_seconds', + 'Duration of lakeFS operations', + ['operation', 'repository'] +) + +LAKEFS_ACTIVE_CONNECTIONS = Gauge( + 'lakefs_active_connections', + 'Number of active lakeFS connections' +) + +def monitor_lakefs_operation(operation_name): + """Decorator to monitor lakeFS operations""" + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + repo_name = kwargs.get('repo_name', 'unknown') + + start_time = time.time() + LAKEFS_ACTIVE_CONNECTIONS.inc() + + try: + result = func(*args, **kwargs) + LAKEFS_OPERATIONS.labels( + operation=operation_name, + repository=repo_name, + status='success' + ).inc() + return result + + except Exception as e: + LAKEFS_OPERATIONS.labels( + operation=operation_name, + repository=repo_name, + status='error' + ).inc() + raise + + finally: + duration = time.time() - start_time + LAKEFS_OPERATION_DURATION.labels( + operation=operation_name, + repository=repo_name + ).observe(duration) + LAKEFS_ACTIVE_CONNECTIONS.dec() + + return wrapper + return decorator + +# Usage +class MonitoredLakeFSService: + def __init__(self): + self.client = lakefs.Client() + + @monitor_lakefs_operation('upload') + def upload_file(self, repo_name, branch_name, path, data): + repo = self.client.repository(repo_name) + branch = repo.branch(branch_name) + return branch.object(path).upload(data) + + @monitor_lakefs_operation('download') + def download_file(self, repo_name, branch_name, path): + repo = self.client.repository(repo_name) + branch = repo.branch(branch_name) + return branch.object(path).reader().read() + +# Start metrics server +start_http_server(8000) +``` + +### 
Health Checks + +```python +import lakefs +from typing import Dict, Any +import time + +class HealthChecker: + """Production health checking for lakeFS connectivity""" + + def __init__(self, client: lakefs.Client): + self.client = client + self.last_check = 0 + self.check_interval = 30 # seconds + self.cached_status = None + + def check_health(self) -> Dict[str, Any]: + """Comprehensive health check""" + current_time = time.time() + + # Use cached result if recent + if (self.cached_status and + current_time - self.last_check < self.check_interval): + return self.cached_status + + health_status = { + 'timestamp': current_time, + 'status': 'healthy', + 'checks': {} + } + + # Check basic connectivity + try: + start_time = time.time() + repos = list(self.client.repositories.list(amount=1)) + response_time = time.time() - start_time + + health_status['checks']['connectivity'] = { + 'status': 'pass', + 'response_time': response_time + } + + except Exception as e: + health_status['status'] = 'unhealthy' + health_status['checks']['connectivity'] = { + 'status': 'fail', + 'error': str(e) + } + + # Check authentication + try: + # Try to access user info or perform authenticated operation + self.client.repositories.list(amount=1) + health_status['checks']['authentication'] = { + 'status': 'pass' + } + + except Exception as e: + health_status['status'] = 'unhealthy' + health_status['checks']['authentication'] = { + 'status': 'fail', + 'error': str(e) + } + + self.cached_status = health_status + self.last_check = current_time + + return health_status + + def is_healthy(self) -> bool: + """Simple boolean health check""" + return self.check_health()['status'] == 'healthy' + +# Flask health endpoint example +from flask import Flask, jsonify + +app = Flask(__name__) +health_checker = HealthChecker(lakefs.Client()) + +@app.route('/health') +def health(): + health_status = health_checker.check_health() + status_code = 200 if health_status['status'] == 'healthy' else 503 + return jsonify(health_status), status_code + +@app.route('/ready') +def ready(): + # Readiness check - can be more strict than health + if health_checker.is_healthy(): + return jsonify({'status': 'ready'}), 200 + else: + return jsonify({'status': 'not ready'}), 503 +``` + +## Error Handling and Resilience + +### Retry Strategies + +```python +import time +import random +from functools import wraps +from typing import Callable, Type, Tuple +import lakefs + +def exponential_backoff_retry( + max_retries: int = 3, + base_delay: float = 1.0, + max_delay: float = 60.0, + exceptions: Tuple[Type[Exception], ...] 
= (Exception,)
+):
+    """Exponential backoff retry decorator"""
+    def decorator(func: Callable):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+
+            for attempt in range(max_retries + 1):
+                try:
+                    return func(*args, **kwargs)
+
+                except exceptions as e:
+                    last_exception = e
+
+                    if attempt == max_retries:
+                        raise
+
+                    # Calculate delay with jitter
+                    delay = min(base_delay * (2 ** attempt), max_delay)
+                    jitter = random.uniform(0, delay * 0.1)
+                    time.sleep(delay + jitter)
+
+                    # logger: structlog logger set up in the Logging Configuration section above
+                    logger.warning(
+                        f"Retry attempt {attempt + 1}/{max_retries} for {func.__name__}",
+                        error=str(e),
+                        delay=delay + jitter
+                    )
+
+            raise last_exception
+
+        return wrapper
+    return decorator
+
+# Usage
+class ResilientLakeFSService:
+    def __init__(self):
+        self.client = lakefs.Client()
+
+    @exponential_backoff_retry(
+        max_retries=3,
+        exceptions=(lakefs.exceptions.ServerException, ConnectionError)
+    )
+    def upload_with_retry(self, repo_name, branch_name, path, data):
+        repo = self.client.repository(repo_name)
+        branch = repo.branch(branch_name)
+        return branch.object(path).upload(data)
+```
+
+### Circuit Breaker Pattern
+
+```python
+import time
+from enum import Enum
+from typing import Any, Callable, Type
+import lakefs
+
+class CircuitState(Enum):
+    CLOSED = "closed"
+    OPEN = "open"
+    HALF_OPEN = "half_open"
+
+class CircuitBreaker:
+    """Circuit breaker for lakeFS operations"""
+
+    def __init__(
+        self,
+        failure_threshold: int = 5,
+        timeout: float = 60.0,
+        expected_exception: Type[Exception] = Exception
+    ):
+        self.failure_threshold = failure_threshold
+        self.timeout = timeout
+        self.expected_exception = expected_exception
+
+        self.failure_count = 0
+        self.last_failure_time = None
+        self.state = CircuitState.CLOSED
+
+    def call(self, func: Callable, *args, **kwargs) -> Any:
+        """Execute function with circuit breaker protection"""
+
+        if self.state == CircuitState.OPEN:
+            if self._should_attempt_reset():
+                self.state = CircuitState.HALF_OPEN
+            else:
+                raise Exception("Circuit breaker is OPEN")
+
+        try:
+            result = func(*args, **kwargs)
+            self._on_success()
+            return result
+
+        except self.expected_exception as e:
+            self._on_failure()
+            raise
+
+    def _should_attempt_reset(self) -> bool:
+        """Check if enough time has passed to attempt reset"""
+        return (time.time() - self.last_failure_time) >= self.timeout
+
+    def _on_success(self):
+        """Handle successful operation"""
+        self.failure_count = 0
+        self.state = CircuitState.CLOSED
+
+    def _on_failure(self):
+        """Handle failed operation"""
+        self.failure_count += 1
+        self.last_failure_time = time.time()
+
+        if self.failure_count >= self.failure_threshold:
+            self.state = CircuitState.OPEN
+
+# Usage
+lakefs_circuit_breaker = CircuitBreaker(
+    failure_threshold=5,
+    timeout=60.0,
+    expected_exception=lakefs.exceptions.ServerException
+)
+
+class ProtectedLakeFSService:
+    def __init__(self):
+        self.client = lakefs.Client()
+
+    def safe_upload(self, repo_name, branch_name, path, data):
+        """Upload with circuit breaker protection"""
+        def upload_operation():
+            repo = self.client.repository(repo_name)
+            branch = repo.branch(branch_name)
+            return branch.object(path).upload(data)
+
+        return lakefs_circuit_breaker.call(upload_operation)
+```
+
+## Performance Monitoring and Optimization
+
+### Performance Profiling
+
+```python
+import cProfile
+import pstats
+import io
+from contextlib import contextmanager
+import lakefs
+
+@contextmanager
+def profile_lakefs_operation(operation_name: str):
+    """Context manager for profiling lakeFS operations"""
+    profiler =
cProfile.Profile() + profiler.enable() + + try: + yield + finally: + profiler.disable() + + # Analyze results + s = io.StringIO() + ps = pstats.Stats(profiler, stream=s) + ps.sort_stats('cumulative') + ps.print_stats(20) # Top 20 functions + + logger.info( + f"Performance profile for {operation_name}", + profile_data=s.getvalue() + ) + +# Usage +def analyze_performance(): + client = lakefs.Client() + repo = client.repository("my-repo") + branch = repo.branch("main") + + with profile_lakefs_operation("batch_upload"): + # Perform operations to profile + for i in range(100): + branch.object(f"data/file_{i}.txt").upload(f"data_{i}") +``` + +### Memory Usage Optimization + +```python +import gc +import psutil +import os +from typing import Iterator +import lakefs + +class MemoryEfficientProcessor: + """Memory-efficient processing of large datasets""" + + def __init__(self, client: lakefs.Client): + self.client = client + self.process = psutil.Process(os.getpid()) + + def get_memory_usage(self) -> float: + """Get current memory usage in MB""" + return self.process.memory_info().rss / 1024 / 1024 + + def process_large_dataset( + self, + repo_name: str, + branch_name: str, + file_paths: list, + chunk_size: int = 1000 + ) -> Iterator[Any]: + """Process large dataset in memory-efficient chunks""" + + repo = self.client.repository(repo_name) + branch = repo.branch(branch_name) + + initial_memory = self.get_memory_usage() + logger.info(f"Starting processing, initial memory: {initial_memory:.2f} MB") + + for i in range(0, len(file_paths), chunk_size): + chunk = file_paths[i:i + chunk_size] + + # Process chunk + for file_path in chunk: + obj = branch.object(file_path) + with obj.reader() as reader: + # Process data in streaming fashion + for line in reader: + yield self.process_line(line) + + # Force garbage collection after each chunk + gc.collect() + + current_memory = self.get_memory_usage() + logger.info( + f"Processed chunk {i//chunk_size + 1}, " + f"memory: {current_memory:.2f} MB, " + f"delta: {current_memory - initial_memory:.2f} MB" + ) + + def process_line(self, line: bytes) -> Any: + """Process individual line - implement your logic here""" + return line.decode().strip() +``` + +## See Also + +- [API Comparison](api-comparison.md) - Choose the right SDK for your use case +- [Troubleshooting Guide](troubleshooting.md) - Common issues and solutions +- [High-Level SDK Advanced Features](../high-level-sdk/advanced.md) - Advanced SDK capabilities +- [Generated SDK Examples](../generated-sdk/examples.md) - Direct API usage patterns +- [Security Documentation](../../security/index.md) - lakeFS security best practices +- [Deployment Guide](../../deploy/index.md) - lakeFS deployment options \ No newline at end of file diff --git a/docs/src/integrations/python/reference/changelog.md b/docs/src/integrations/python/reference/changelog.md new file mode 100644 index 00000000000..99676a4dea7 --- /dev/null +++ b/docs/src/integrations/python/reference/changelog.md @@ -0,0 +1,14 @@ +--- +title: Changelog +description: SDK changes and updates for Python integrations +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "beginner" +use_cases: ["reference", "updates", "migration", "version-tracking"] +topics: ["changelog", "updates", "versions", "migration"] +audience: ["developers", "data-engineers", "maintainers"] +last_updated: "2024-01-15" +--- + +# Changelog + +Placeholder content for changelog documentation. 
\ No newline at end of file diff --git a/docs/src/integrations/python/reference/index.md b/docs/src/integrations/python/reference/index.md new file mode 100644 index 00000000000..ba223a4be69 --- /dev/null +++ b/docs/src/integrations/python/reference/index.md @@ -0,0 +1,83 @@ +--- +title: Python Reference +description: Reference documentation for Python and lakeFS +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "intermediate" +use_cases: ["reference", "comparison", "troubleshooting", "best-practices"] +topics: ["reference", "documentation", "resources"] +audience: ["developers", "data-engineers", "advanced-users"] +last_updated: "2024-01-15" +--- + +# Python Reference + +Reference documentation and resources for Python integration with lakeFS. + +## Reference Sections + +- **[API Comparison](api-comparison.md)** - Feature comparison across all Python SDKs +- **[Best Practices](best-practices.md)** - Production best practices and guidelines +- **[Troubleshooting](troubleshooting.md)** - Common issues and solutions +- **[Changelog](changelog.md)** - SDK changes and updates + +## Quick Reference + +### SDK Selection Guide +- **High-Level SDK**: Most users, simplified API, transactions +- **Generated SDK**: Direct API access, custom operations +- **lakefs-spec**: Data science, fsspec compatibility +- **Boto3**: S3 migration, existing S3 workflows + +### Common Patterns +- Authentication via environment variables or config files +- Use transactions for atomic operations +- Leverage streaming for large files +- Handle errors gracefully with try/catch blocks + +## External Resources + +- [High-Level SDK Documentation](https://pydocs-lakefs.lakefs.io) +- [Generated SDK Documentation](https://pydocs-sdk.lakefs.io) +- [lakefs-spec Documentation](https://lakefs-spec.org/) +- [Boto S3 Router](https://github.com/treeverse/boto-s3-router) + +## See Also + +**Getting Started:** +- **[Python SDK Overview](../index.md)** - Complete overview of all Python SDK options +- **[Getting Started Guide](../getting-started.md)** - Installation and setup for all SDKs +- **[SDK Selection Guide](../index.md#sdk-selection-decision-matrix)** - Choose the right SDK + +**Reference Documentation:** +- **[API Comparison](api-comparison.md)** - Comprehensive feature comparison across all SDKs +- **[Best Practices](best-practices.md)** - Production deployment and optimization guidance +- **[Troubleshooting](troubleshooting.md)** - Common issues, solutions, and debugging techniques +- **[Changelog](changelog.md)** - SDK updates, changes, and migration notes + +**SDK-Specific Documentation:** +- **[High-Level SDK](../high-level-sdk/index.md)** - Comprehensive High-Level SDK documentation +- **[Generated SDK](../generated-sdk/index.md)** - Direct API access patterns and examples +- **[lakefs-spec](../lakefs-spec/index.md)** - Filesystem interface for data science workflows +- **[Boto3 Integration](../boto3/index.md)** - S3-compatible operations and migration + +**Learning Resources:** +- **[Quickstart Guide](../high-level-sdk/quickstart.md)** - Basic operations and examples +- **[Tutorial Collection](../tutorials/index.md)** - Real-world examples and workflows +- **[Data Science Tutorial](../tutorials/data-science-workflow.md)** - End-to-end data analysis +- **[ETL Pipeline Tutorial](../tutorials/etl-pipeline.md)** - Production data pipeline patterns + +**Feature-Specific Guides:** +- **[Transaction Patterns](../high-level-sdk/transactions.md)** - Atomic operations across SDKs +- **[Object I/O 
Operations](../high-level-sdk/objects-and-io.md)** - File handling and streaming +- **[Repository Management](../high-level-sdk/repositories.md)** - Repository operations +- **[Branch Operations](../high-level-sdk/branches-and-commits.md)** - Version control patterns + +**Integration Guides:** +- **[Data Science Integrations](../lakefs-spec/integrations.md)** - pandas, dask, and other libraries +- **[S3 Migration Patterns](../boto3/s3-operations.md)** - Convert existing S3 workflows +- **[Filesystem Operations](../lakefs-spec/filesystem-api.md)** - File-like operations + +**External Resources:** +- **[lakeFS Documentation](https://docs.lakefs.io){:target="_blank"}** - Complete lakeFS documentation +- **[lakeFS API Reference](https://docs.lakefs.io/reference/api.html){:target="_blank"}** - REST API specification +- **[Community Examples](https://github.com/treeverse/lakeFS-samples){:target="_blank"}** - Sample projects and notebooks \ No newline at end of file diff --git a/docs/src/integrations/python/reference/troubleshooting.md b/docs/src/integrations/python/reference/troubleshooting.md new file mode 100644 index 00000000000..7e654a0072d --- /dev/null +++ b/docs/src/integrations/python/reference/troubleshooting.md @@ -0,0 +1,1026 @@ +--- +title: Troubleshooting Guide +description: Comprehensive troubleshooting guide for Python lakeFS integrations +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "intermediate" +use_cases: ["debugging", "error-resolution", "performance"] +--- + +# Troubleshooting Guide + +This comprehensive guide covers common issues, error patterns, debugging techniques, and performance optimization tips for Python lakeFS integrations across all SDK options. + +## Common Error Types and Solutions + +### Authentication and Connection Errors + +#### Error: `Unauthorized (401)` + +**Symptom:** +``` +lakefs.exceptions.UnauthorizedException: Unauthorized +``` + +**Causes and Solutions:** + +1. **Invalid Credentials** + ```python + # Check your credentials + import lakefs + + # Verify credentials are set correctly + client = lakefs.Client( + host="http://localhost:8000", + access_key_id="your-access-key", + secret_access_key="your-secret-key" + ) + + # Test connection + try: + repos = list(client.repositories.list(amount=1)) + print("Authentication successful") + except lakefs.exceptions.UnauthorizedException: + print("Invalid credentials") + ``` + +2. **Environment Variables Not Set** + ```bash + # Check environment variables + echo $LAKEFS_ACCESS_KEY_ID + echo $LAKEFS_SECRET_ACCESS_KEY + echo $LAKEFS_HOST + + # Set if missing + export LAKEFS_ACCESS_KEY_ID="your-access-key" + export LAKEFS_SECRET_ACCESS_KEY="your-secret-key" + export LAKEFS_HOST="http://localhost:8000" + ``` + +3. **Token Expiration (for JWT tokens)** + ```python + # Refresh token if using JWT authentication + import lakefs + + def refresh_client_token(): + # Re-authenticate to get fresh token + client = lakefs.Client() + # Your token refresh logic here + return client + ``` + +#### Error: `Connection refused` or `Connection timeout` + +**Symptom:** +``` +requests.exceptions.ConnectionError: HTTPConnectionPool(host='localhost', port=8000): +Max retries exceeded with url: /api/v1/repositories +``` + +**Solutions:** + +1. **Verify lakeFS Server is Running** + ```bash + # Check if lakeFS is running + curl http://localhost:8000/api/v1/healthcheck + + # Check Docker container status + docker ps | grep lakefs + + # Check service status + systemctl status lakefs + ``` + +2. 
**Check Network Configuration** + ```python + import requests + + # Test basic connectivity + try: + response = requests.get("http://localhost:8000/api/v1/healthcheck", timeout=5) + print(f"Server responding: {response.status_code}") + except requests.exceptions.ConnectionError: + print("Cannot connect to lakeFS server") + except requests.exceptions.Timeout: + print("Connection timeout - server may be slow") + ``` + +3. **Proxy Configuration Issues** + ```python + import lakefs + + # Configure proxy settings + client = lakefs.Client( + host="http://localhost:8000", + access_key_id="your-key", + secret_access_key="your-secret", + # Proxy configuration + proxies={ + 'http': 'http://proxy.company.com:8080', + 'https': 'https://proxy.company.com:8080' + } + ) + ``` + +#### Error: SSL Certificate Verification Failed + +**Symptom:** +``` +requests.exceptions.SSLError: HTTPSConnectionPool(host='lakefs.example.com', port=443): +Max retries exceeded with url: /api/v1/repositories (Caused by SSLError(SSLCertVerificationError)) +``` + +**Solutions:** + +1. **Configure SSL Verification** + ```python + import lakefs + + # Option 1: Disable SSL verification (NOT recommended for production) + client = lakefs.Client( + host="https://lakefs.example.com", + access_key_id="your-key", + secret_access_key="your-secret", + verify_ssl=False + ) + + # Option 2: Provide custom CA certificate + client = lakefs.Client( + host="https://lakefs.example.com", + access_key_id="your-key", + secret_access_key="your-secret", + ssl_ca_cert="/path/to/ca-bundle.crt" + ) + + # Option 3: Use system CA bundle + import certifi + client = lakefs.Client( + host="https://lakefs.example.com", + access_key_id="your-key", + secret_access_key="your-secret", + ssl_ca_cert=certifi.where() + ) + ``` + +### Repository and Branch Errors + +#### Error: `Repository not found (404)` + +**Symptom:** +``` +lakefs.exceptions.NotFoundException: Repository 'my-repo' not found +``` + +**Solutions:** + +1. **Verify Repository Exists** + ```python + import lakefs + + client = lakefs.Client() + + # List all repositories + repos = list(client.repositories.list()) + print("Available repositories:") + for repo in repos: + print(f" - {repo.id}") + + # Check specific repository + try: + repo = client.repository("my-repo") + print(f"Repository found: {repo.id}") + except lakefs.exceptions.NotFoundException: + print("Repository does not exist") + ``` + +2. **Create Repository if Missing** + ```python + def ensure_repository_exists(client, repo_name, storage_namespace): + try: + return client.repository(repo_name) + except lakefs.exceptions.NotFoundException: + print(f"Creating repository: {repo_name}") + return client.repositories.create( + name=repo_name, + storage_namespace=storage_namespace + ) + ``` + +#### Error: `Branch not found (404)` + +**Symptom:** +``` +lakefs.exceptions.NotFoundException: Branch 'feature-branch' not found in repository 'my-repo' +``` + +**Solutions:** + +1. **List Available Branches** + ```python + repo = client.repository("my-repo") + + # List all branches + branches = list(repo.branches.list()) + print("Available branches:") + for branch in branches: + print(f" - {branch.id}") + ``` + +2. 
**Create Branch if Missing** + ```python + def ensure_branch_exists(repo, branch_name, source_branch="main"): + try: + return repo.branch(branch_name) + except lakefs.exceptions.NotFoundException: + print(f"Creating branch: {branch_name}") + return repo.branches.create( + name=branch_name, + source_ref=source_branch + ) + ``` + +### Object Operation Errors + +#### Error: `Object not found (404)` + +**Symptom:** +``` +lakefs.exceptions.NotFoundException: Object 'path/to/file.txt' not found +``` + +**Solutions:** + +1. **Verify Object Path and Branch** + ```python + repo = client.repository("my-repo") + branch = repo.branch("main") + + # List objects to verify path + objects = list(branch.objects.list(prefix="path/to/")) + print("Objects in path:") + for obj in objects: + print(f" - {obj.path}") + + # Check if object exists + try: + obj = branch.object("path/to/file.txt") + stat = obj.stat() + print(f"Object found: {stat.path}, size: {stat.size_bytes}") + except lakefs.exceptions.NotFoundException: + print("Object does not exist") + ``` + +2. **Handle Missing Objects Gracefully** + ```python + def safe_download(branch, object_path, default_content=None): + try: + obj = branch.object(object_path) + return obj.reader().read() + except lakefs.exceptions.NotFoundException: + if default_content is not None: + return default_content + raise + ``` + +#### Error: `Conflict (409)` during Upload + +**Symptom:** +``` +lakefs.exceptions.ConflictException: Object 'path/to/file.txt' was modified +``` + +**Solutions:** + +1. **Handle Concurrent Modifications** + ```python + import time + import random + + def upload_with_retry(branch, path, data, max_retries=3): + for attempt in range(max_retries): + try: + return branch.object(path).upload(data) + except lakefs.exceptions.ConflictException: + if attempt == max_retries - 1: + raise + # Wait with exponential backoff + wait_time = (2 ** attempt) + random.uniform(0, 1) + time.sleep(wait_time) + ``` + +2. **Use Transactions for Atomic Operations** + ```python + # High-Level SDK transaction + with repo.branch("main").transaction() as tx: + tx.upload("file1.txt", data1) + tx.upload("file2.txt", data2) + # All uploads succeed or all fail + ``` + +### Performance Issues + +#### Slow Upload/Download Performance + +**Symptoms:** +- Uploads taking much longer than expected +- High memory usage during file operations +- Timeouts on large files + +**Solutions:** + +1. **Use Streaming for Large Files** + ```python + # Instead of loading entire file into memory + # BAD: + with open("large_file.dat", "rb") as f: + data = f.read() # Loads entire file into memory + branch.object("large_file.dat").upload(data) + + # GOOD: + with open("large_file.dat", "rb") as f: + branch.object("large_file.dat").upload(f, mode='rb') + ``` + +2. **Configure Connection Pooling** + ```python + import lakefs + + # Configure larger connection pool + client = lakefs.Client( + host="http://localhost:8000", + access_key_id="your-key", + secret_access_key="your-secret", + pool_connections=30, + pool_maxsize=30 + ) + ``` + +3. 
**Use Batch Operations** + ```python + from concurrent.futures import ThreadPoolExecutor, as_completed + + def upload_files_concurrently(branch, files, max_workers=10): + def upload_single_file(file_info): + path, data = file_info + return branch.object(path).upload(data) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [executor.submit(upload_single_file, file_info) + for file_info in files] + + results = [] + for future in as_completed(futures): + try: + result = future.result() + results.append(result) + except Exception as e: + print(f"Upload failed: {e}") + + return results + ``` + +#### Memory Usage Issues + +**Symptoms:** +- High memory consumption +- Out of memory errors +- Memory leaks in long-running processes + +**Solutions:** + +1. **Use Streaming Readers** + ```python + # Instead of reading entire object into memory + # BAD: + obj = branch.object("large_file.dat") + data = obj.reader().read() # Loads entire file + + # GOOD: + obj = branch.object("large_file.dat") + with obj.reader() as reader: + for chunk in reader.iter_chunks(chunk_size=8192): + process_chunk(chunk) + ``` + +2. **Implement Proper Resource Management** + ```python + import gc + from contextlib import contextmanager + + @contextmanager + def memory_managed_processing(): + try: + yield + finally: + gc.collect() # Force garbage collection + + # Usage + with memory_managed_processing(): + # Process large amounts of data + for file_path in large_file_list: + process_file(file_path) + ``` + +### SDK-Specific Issues + +#### High-Level SDK Issues + +**Error: Transaction Rollback** +```python +# Handle transaction failures gracefully +try: + with repo.branch("main").transaction() as tx: + tx.upload("file1.txt", data1) + tx.upload("file2.txt", data2) + # Some operation fails here + raise Exception("Simulated failure") +except Exception as e: + print(f"Transaction rolled back: {e}") + # Handle rollback scenario +``` + +#### Generated SDK Issues + +**Error: API Response Parsing** +```python +from lakefs_sdk import ApiException +import json + +try: + # Generated SDK API call + response = api_client.list_repositories() +except ApiException as e: + print(f"API Error: {e.status}") + print(f"Reason: {e.reason}") + + # Parse error details + try: + error_details = json.loads(e.body) + print(f"Error message: {error_details.get('message')}") + except json.JSONDecodeError: + print(f"Raw error: {e.body}") +``` + +#### lakefs-spec Issues + +**Error: fsspec Configuration** +```python +import fsspec +import lakefs_spec + +# Debug fsspec configuration +def debug_lakefs_spec(): + try: + fs = fsspec.filesystem( + 'lakefs', + host='http://localhost:8000', + access_key_id='your-key', + secret_access_key='your-secret' + ) + + # Test basic operation + files = fs.ls('repo/branch/') + print(f"Found {len(files)} files") + + except Exception as e: + print(f"lakefs-spec error: {e}") + + # Check if lakefs-spec is properly installed + try: + import lakefs_spec + print(f"lakefs-spec version: {lakefs_spec.__version__}") + except ImportError: + print("lakefs-spec not installed") +``` + +#### Boto3 Issues + +**Error: S3 Compatibility Issues** +```python +import boto3 +from botocore.exceptions import ClientError + +# Debug Boto3 with lakeFS +def debug_boto3_lakefs(): + s3 = boto3.client( + 's3', + endpoint_url='http://localhost:8000', + aws_access_key_id='your-key', + aws_secret_access_key='your-secret' + ) + + try: + # Test basic operation + response = s3.list_objects_v2(Bucket='repo', Prefix='branch/') + print(f"Found 
{response.get('KeyCount', 0)} objects") + + except ClientError as e: + error_code = e.response['Error']['Code'] + error_message = e.response['Error']['Message'] + print(f"S3 Error {error_code}: {error_message}") + + # Common S3 compatibility issues + if error_code == 'NoSuchBucket': + print("Repository may not exist or incorrect format") + elif error_code == 'AccessDenied': + print("Check credentials and permissions") +``` + +## Debugging Techniques + +### Enable Debug Logging + +```python +import logging +import lakefs + +# Enable debug logging for lakeFS +logging.basicConfig(level=logging.DEBUG) +lakefs_logger = logging.getLogger('lakefs') +lakefs_logger.setLevel(logging.DEBUG) + +# Enable debug logging for requests +requests_logger = logging.getLogger('urllib3') +requests_logger.setLevel(logging.DEBUG) + +# Enable debug logging for specific SDK +import lakefs_sdk +lakefs_sdk_logger = logging.getLogger('lakefs_sdk') +lakefs_sdk_logger.setLevel(logging.DEBUG) +``` + +### Network Traffic Inspection + +```python +import requests +import lakefs + +# Enable request/response logging +import http.client as http_client +http_client.HTTPConnection.debuglevel = 1 + +# Create client with debug session +session = requests.Session() +session.hooks['response'].append(lambda r, *args, **kwargs: print(f"Response: {r.status_code} {r.url}")) + +client = lakefs.Client(session=session) +``` + +### Performance Profiling + +```python +import cProfile +import pstats +import io + +def profile_operation(): + profiler = cProfile.Profile() + profiler.enable() + + # Your lakeFS operations here + client = lakefs.Client() + repo = client.repository("my-repo") + branch = repo.branch("main") + + # Perform operations to profile + for i in range(100): + branch.object(f"test_{i}.txt").upload(f"data_{i}") + + profiler.disable() + + # Analyze results + s = io.StringIO() + ps = pstats.Stats(profiler, stream=s) + ps.sort_stats('cumulative') + ps.print_stats(20) + + print(s.getvalue()) + +profile_operation() +``` + +### Memory Usage Analysis + +```python +import tracemalloc +import lakefs + +def analyze_memory_usage(): + # Start tracing + tracemalloc.start() + + # Your operations + client = lakefs.Client() + repo = client.repository("my-repo") + branch = repo.branch("main") + + # Take snapshot before operations + snapshot1 = tracemalloc.take_snapshot() + + # Perform memory-intensive operations + large_data = "x" * (10 * 1024 * 1024) # 10MB string + for i in range(10): + branch.object(f"large_{i}.txt").upload(large_data) + + # Take snapshot after operations + snapshot2 = tracemalloc.take_snapshot() + + # Compare snapshots + top_stats = snapshot2.compare_to(snapshot1, 'lineno') + + print("Top 10 memory allocations:") + for stat in top_stats[:10]: + print(stat) + +analyze_memory_usage() +``` + +## Performance Optimization Tips + +### Connection Optimization + +```python +import lakefs +from urllib3.util.retry import Retry +from requests.adapters import HTTPAdapter + +# Create optimized session +session = requests.Session() + +# Configure retry strategy +retry_strategy = Retry( + total=3, + backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504], +) + +# Configure adapter with retry strategy +adapter = HTTPAdapter( + pool_connections=20, + pool_maxsize=20, + max_retries=retry_strategy +) + +session.mount("http://", adapter) +session.mount("https://", adapter) + +# Use optimized session with client +client = lakefs.Client(session=session) +``` + +### Batch Processing Optimization + +```python +from concurrent.futures import 
ThreadPoolExecutor, as_completed +import time + +class OptimizedBatchProcessor: + def __init__(self, client, max_workers=10, batch_size=100): + self.client = client + self.max_workers = max_workers + self.batch_size = batch_size + + def process_files_in_batches(self, repo_name, branch_name, files): + repo = self.client.repository(repo_name) + branch = repo.branch(branch_name) + + # Process files in batches + for i in range(0, len(files), self.batch_size): + batch = files[i:i + self.batch_size] + self._process_batch(branch, batch) + + # Small delay between batches to avoid overwhelming server + time.sleep(0.1) + + def _process_batch(self, branch, batch): + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = [] + + for file_info in batch: + future = executor.submit(self._process_single_file, branch, file_info) + futures.append(future) + + # Wait for all files in batch to complete + for future in as_completed(futures): + try: + result = future.result() + print(f"Processed: {result}") + except Exception as e: + print(f"Error processing file: {e}") + + def _process_single_file(self, branch, file_info): + path, data = file_info + return branch.object(path).upload(data) +``` + +### Caching Strategies + +```python +import functools +import time +from typing import Dict, Any + +class CachedLakeFSClient: + def __init__(self, client, cache_ttl=300): # 5 minutes + self.client = client + self.cache_ttl = cache_ttl + self._cache = {} + + def _is_cache_valid(self, key): + if key not in self._cache: + return False + return time.time() - self._cache[key]['timestamp'] < self.cache_ttl + + def get_repository_cached(self, repo_name): + cache_key = f"repo:{repo_name}" + + if self._is_cache_valid(cache_key): + return self._cache[cache_key]['data'] + + repo = self.client.repository(repo_name) + self._cache[cache_key] = { + 'data': repo, + 'timestamp': time.time() + } + + return repo + + @functools.lru_cache(maxsize=128) + def get_branch_metadata(self, repo_name, branch_name): + """Cache branch metadata using LRU cache""" + repo = self.get_repository_cached(repo_name) + branch = repo.branch(branch_name) + + return { + 'id': branch.id, + 'commit_id': branch.head.id if branch.head else None + } +``` + +## Diagnostic Tools and Scripts + +### Health Check Script + +```python +#!/usr/bin/env python3 +""" +lakeFS Python SDK Health Check Script +""" + +import sys +import time +import lakefs +from typing import Dict, Any + +def check_connectivity(client: lakefs.Client) -> Dict[str, Any]: + """Check basic connectivity to lakeFS server""" + try: + start_time = time.time() + repos = list(client.repositories.list(amount=1)) + response_time = time.time() - start_time + + return { + 'status': 'pass', + 'response_time': response_time, + 'repository_count': len(repos) + } + except Exception as e: + return { + 'status': 'fail', + 'error': str(e) + } + +def check_authentication(client: lakefs.Client) -> Dict[str, Any]: + """Check authentication status""" + try: + # Try to perform an authenticated operation + repos = list(client.repositories.list(amount=1)) + return {'status': 'pass'} + except lakefs.exceptions.UnauthorizedException: + return {'status': 'fail', 'error': 'Invalid credentials'} + except Exception as e: + return {'status': 'fail', 'error': str(e)} + +def check_repository_operations(client: lakefs.Client, repo_name: str) -> Dict[str, Any]: + """Check repository operations""" + try: + repo = client.repository(repo_name) + branches = list(repo.branches.list(amount=1)) + + return { + 'status': 
'pass', + 'branch_count': len(branches) + } + except lakefs.exceptions.NotFoundException: + return {'status': 'fail', 'error': f'Repository {repo_name} not found'} + except Exception as e: + return {'status': 'fail', 'error': str(e)} + +def main(): + """Run comprehensive health check""" + print("lakeFS Python SDK Health Check") + print("=" * 40) + + try: + client = lakefs.Client() + except Exception as e: + print(f"Failed to create client: {e}") + sys.exit(1) + + # Run checks + checks = { + 'Connectivity': check_connectivity(client), + 'Authentication': check_authentication(client), + } + + # Add repository check if specified + if len(sys.argv) > 1: + repo_name = sys.argv[1] + checks[f'Repository ({repo_name})'] = check_repository_operations(client, repo_name) + + # Print results + all_passed = True + for check_name, result in checks.items(): + status = "✓ PASS" if result['status'] == 'pass' else "✗ FAIL" + print(f"{check_name}: {status}") + + if result['status'] == 'fail': + all_passed = False + print(f" Error: {result.get('error', 'Unknown error')}") + elif 'response_time' in result: + print(f" Response time: {result['response_time']:.3f}s") + + print("\n" + "=" * 40) + if all_passed: + print("All checks passed!") + sys.exit(0) + else: + print("Some checks failed!") + sys.exit(1) + +if __name__ == "__main__": + main() +``` + +### Configuration Validator + +```python +#!/usr/bin/env python3 +""" +lakeFS Configuration Validator +""" + +import os +import sys +import requests +import lakefs +from urllib.parse import urlparse + +def validate_environment_variables(): + """Validate required environment variables""" + required_vars = [ + 'LAKEFS_HOST', + 'LAKEFS_ACCESS_KEY_ID', + 'LAKEFS_SECRET_ACCESS_KEY' + ] + + missing_vars = [] + for var in required_vars: + if not os.environ.get(var): + missing_vars.append(var) + + if missing_vars: + print(f"Missing environment variables: {', '.join(missing_vars)}") + return False + + print("✓ All required environment variables are set") + return True + +def validate_host_format(host: str): + """Validate host URL format""" + try: + parsed = urlparse(host) + if not parsed.scheme or not parsed.netloc: + print(f"✗ Invalid host format: {host}") + return False + + print(f"✓ Host format is valid: {host}") + return True + except Exception as e: + print(f"✗ Error parsing host: {e}") + return False + +def validate_connectivity(host: str): + """Validate network connectivity""" + try: + response = requests.get(f"{host}/api/v1/healthcheck", timeout=10) + if response.status_code == 200: + print("✓ Server is reachable and healthy") + return True + else: + print(f"✗ Server returned status code: {response.status_code}") + return False + except requests.exceptions.ConnectionError: + print("✗ Cannot connect to server") + return False + except requests.exceptions.Timeout: + print("✗ Connection timeout") + return False + except Exception as e: + print(f"✗ Connection error: {e}") + return False + +def main(): + """Run configuration validation""" + print("lakeFS Configuration Validator") + print("=" * 40) + + all_valid = True + + # Check environment variables + if not validate_environment_variables(): + all_valid = False + + # Check host format + host = os.environ.get('LAKEFS_HOST', '') + if host and not validate_host_format(host): + all_valid = False + + # Check connectivity + if host and not validate_connectivity(host): + all_valid = False + + # Test client creation + try: + client = lakefs.Client() + print("✓ Client created successfully") + except Exception as e: + print(f"✗ 
Failed to create client: {e}") + all_valid = False + + print("\n" + "=" * 40) + if all_valid: + print("Configuration is valid!") + sys.exit(0) + else: + print("Configuration has issues!") + sys.exit(1) + +if __name__ == "__main__": + main() +``` + +## Getting Help + +### Community Resources + +- **GitHub Issues**: [lakeFS Python SDK Issues](https://github.com/treeverse/lakeFS/issues) +- **Slack Community**: [Join lakeFS Slack](https://lakefs.io/slack) +- **Documentation**: [Official lakeFS Documentation](https://docs.lakefs.io) + +### Reporting Issues + +When reporting issues, please include: + +1. **Environment Information**: + ```python + import sys + import lakefs + import platform + + print(f"Python version: {sys.version}") + print(f"lakeFS SDK version: {lakefs.__version__}") + print(f"Platform: {platform.platform()}") + ``` + +2. **Minimal Reproduction Case**: + ```python + # Provide minimal code that reproduces the issue + import lakefs + + client = lakefs.Client(host="...", access_key_id="...", secret_access_key="...") + # Steps that cause the issue + ``` + +3. **Error Messages**: Include full error messages and stack traces + +4. **Configuration**: Describe your lakeFS server setup and network configuration + +### Advanced Debugging + +For complex issues, consider: + +1. **Enable verbose logging** for all components +2. **Use network traffic inspection** tools like Wireshark +3. **Profile memory and CPU usage** during operations +4. **Test with minimal configurations** to isolate issues +5. **Compare behavior** across different SDK options + +## See Also + +- [API Comparison](api-comparison.md) - Choose the right SDK for your use case +- [Best Practices](best-practices.md) - Production deployment guidelines +- [Getting Started Guide](../getting-started.md) - Initial setup and configuration +- [High-Level SDK Documentation](../high-level-sdk/index.md) - Comprehensive SDK guide +- [Generated SDK Documentation](../generated-sdk/index.md) - Direct API access +- [lakefs-spec Documentation](../lakefs-spec/index.md) - Filesystem interface +- [Boto3 Integration](../boto3/index.md) - S3-compatible operations \ No newline at end of file diff --git a/docs/src/integrations/python/tutorials/data-science-workflow.md b/docs/src/integrations/python/tutorials/data-science-workflow.md new file mode 100644 index 00000000000..76a88f4d5e2 --- /dev/null +++ b/docs/src/integrations/python/tutorials/data-science-workflow.md @@ -0,0 +1,14 @@ +--- +title: Data Science Workflow Tutorial +description: End-to-end data science workflow using Python and lakeFS +sdk_types: ["lakefs-spec", "high-level"] +difficulty: "beginner" +use_cases: ["data-science", "tutorials", "pandas-integration", "jupyter-notebooks"] +topics: ["workflow", "data-analysis", "pandas", "jupyter"] +audience: ["data-scientists", "analysts", "python-developers"] +last_updated: "2024-01-15" +--- + +# Data Science Workflow Tutorial + +Placeholder content for data science workflow tutorial. 
\ No newline at end of file diff --git a/docs/src/integrations/python/tutorials/etl-pipeline.md b/docs/src/integrations/python/tutorials/etl-pipeline.md new file mode 100644 index 00000000000..1ad845145ba --- /dev/null +++ b/docs/src/integrations/python/tutorials/etl-pipeline.md @@ -0,0 +1,3097 @@ +--- +title: ETL Pipeline Tutorial +description: Build production-ready ETL pipelines with lakeFS transactions and error handling +sdk_types: ["high-level", "generated"] +difficulty: "advanced" +use_cases: ["etl", "data-engineering", "production", "pipelines"] +topics: ["etl", "transactions", "error-handling", "batch-processing", "data-validation"] +audience: ["data-engineers", "backend-developers", "devops"] +last_updated: "2024-01-15" +--- + +# ETL Pipeline Tutorial + +Learn how to build robust, production-ready ETL (Extract, Transform, Load) pipelines using lakeFS for data versioning, transactions for atomicity, and comprehensive error handling. This tutorial demonstrates enterprise-grade patterns for data processing with proper validation, monitoring, and recovery mechanisms. + +## What You'll Build + +By the end of this tutorial, you'll have: + +- **Transactional ETL Pipeline** - Atomic data processing with rollback capabilities +- **Error Handling System** - Comprehensive error detection and recovery +- **Data Validation Framework** - Quality checks and validation rules +- **Batch Processing Engine** - Scalable data processing patterns +- **Monitoring and Alerting** - Pipeline health monitoring and notifications +- **Production Deployment** - CI/CD ready pipeline with proper logging + +## Prerequisites + +### Knowledge Requirements +- Intermediate Python programming +- Understanding of ETL concepts +- Basic database and SQL knowledge +- Familiarity with data validation concepts + +### Environment Setup +- Python 3.8+ installed +- lakeFS server running (local or cloud) +- Database access (PostgreSQL/MySQL for examples) +- Required Python packages (we'll install these) + +### lakeFS Setup +```bash +# Start lakeFS locally (if not already running) +docker run --name lakefs --rm -p 8000:8000 treeverse/lakefs:latest run --local-settings +``` + +## Step 1: Environment Setup and Dependencies + +### Install Required Packages + +```bash +# Install lakeFS and data processing libraries +pip install lakefs pandas sqlalchemy psycopg2-binary +pip install pydantic jsonschema great-expectations +pip install schedule prometheus-client structlog +pip install retry tenacity +``` + +### Project Structure Setup + +```bash +# Create ETL project structure +mkdir lakefs-etl-pipeline +cd lakefs-etl-pipeline + +# Create directory structure +mkdir -p {config,src/{extractors,transformers,loaders,validators},tests,logs,monitoring} + +# Create configuration files +touch config/{database.yaml,pipeline.yaml,validation.yaml} +touch src/{__init__.py,pipeline.py,exceptions.py} +``` + +### Configuration Management + +```python +# config/settings.py +import os +from dataclasses import dataclass +from typing import Dict, List, Optional +import yaml + +@dataclass +class LakeFSConfig: + host: str + username: str + password: str + repository: str + +@dataclass +class DatabaseConfig: + host: str + port: int + database: str + username: str + password: str + +@dataclass +class PipelineConfig: + batch_size: int + max_retries: int + timeout_seconds: int + validation_enabled: bool + monitoring_enabled: bool + +class ConfigManager: + """Centralized configuration management""" + + def __init__(self, config_path: str = "config"): + 
self.config_path = config_path + self._load_configs() + + def _load_configs(self): + """Load all configuration files""" + # lakeFS configuration + self.lakefs = LakeFSConfig( + host=os.getenv('LAKEFS_HOST', 'http://localhost:8000'), + username=os.getenv('LAKEFS_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE'), + password=os.getenv('LAKEFS_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'), + repository=os.getenv('LAKEFS_REPOSITORY', 'etl-pipeline') + ) + + # Database configuration + self.database = DatabaseConfig( + host=os.getenv('DB_HOST', 'localhost'), + port=int(os.getenv('DB_PORT', '5432')), + database=os.getenv('DB_NAME', 'etl_source'), + username=os.getenv('DB_USER', 'postgres'), + password=os.getenv('DB_PASSWORD', 'password') + ) + + # Pipeline configuration + self.pipeline = PipelineConfig( + batch_size=int(os.getenv('BATCH_SIZE', '1000')), + max_retries=int(os.getenv('MAX_RETRIES', '3')), + timeout_seconds=int(os.getenv('TIMEOUT_SECONDS', '300')), + validation_enabled=os.getenv('VALIDATION_ENABLED', 'true').lower() == 'true', + monitoring_enabled=os.getenv('MONITORING_ENABLED', 'true').lower() == 'true' + ) + +# Global configuration instance +config = ConfigManager() +``` + +## Step 2: Repository Setup and Data Models + +### Create ETL Repository + +```python +# src/repository_setup.py +import lakefs +from lakefs.exceptions import RepositoryExistsError +from config.settings import config +import logging + +logger = logging.getLogger(__name__) + +def setup_etl_repository(): + """Initialize lakeFS repository for ETL pipeline""" + + client = lakefs.Client( + host=config.lakefs.host, + username=config.lakefs.username, + password=config.lakefs.password + ) + + # Create repository + try: + repo = client.repositories.create( + name=config.lakefs.repository, + storage_namespace='s3://my-bucket/etl-pipeline/', + default_branch='main' + ) + logger.info(f"Created repository: {repo.id}") + except RepositoryExistsError: + repo = client.repositories.get(config.lakefs.repository) + logger.info(f"Using existing repository: {repo.id}") + + # Create standard branches + branches_to_create = [ + ('staging', 'main'), + ('development', 'main'), + ('production', 'main') + ] + + for branch_name, source in branches_to_create: + try: + branch = repo.branches.create(branch_name, source_reference=source) + logger.info(f"Created branch: {branch_name}") + except Exception as e: + logger.info(f"Branch {branch_name} already exists or error: {e}") + + return repo + +def create_directory_structure(repo): + """Create standard directory structure in lakeFS""" + + main_branch = repo.branches.get('main') + + # Create directory structure with placeholder files + directories = [ + 'raw_data/', + 'processed_data/', + 'validated_data/', + 'failed_data/', + 'logs/', + 'schemas/', + 'checkpoints/' + ] + + for directory in directories: + try: + main_branch.objects.upload( + path=f"{directory}.gitkeep", + data=b"# Directory placeholder", + content_type='text/plain' + ) + except Exception as e: + logger.warning(f"Could not create {directory}: {e}") + + # Commit directory structure + commit = main_branch.commits.create( + message="Initialize ETL pipeline directory structure", + metadata={'setup': 'directory_structure', 'pipeline': 'etl'} + ) + + logger.info(f"Directory structure created: {commit.id}") + return commit + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + repo = setup_etl_repository() + create_directory_structure(repo) +``` + +### Data Models and Schemas + +```python +# src/models.py +from 
dataclasses import dataclass, field +from datetime import datetime +from typing import Dict, List, Optional, Any +from enum import Enum +import json + +class PipelineStatus(Enum): + PENDING = "pending" + RUNNING = "running" + SUCCESS = "success" + FAILED = "failed" + RETRYING = "retrying" + +class ValidationStatus(Enum): + PASSED = "passed" + FAILED = "failed" + WARNING = "warning" + +@dataclass +class PipelineRun: + """Represents a single ETL pipeline execution""" + run_id: str + pipeline_name: str + start_time: datetime + end_time: Optional[datetime] = None + status: PipelineStatus = PipelineStatus.PENDING + source_commit: Optional[str] = None + target_commit: Optional[str] = None + records_processed: int = 0 + records_failed: int = 0 + error_message: Optional[str] = None + metadata: Dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for serialization""" + return { + 'run_id': self.run_id, + 'pipeline_name': self.pipeline_name, + 'start_time': self.start_time.isoformat(), + 'end_time': self.end_time.isoformat() if self.end_time else None, + 'status': self.status.value, + 'source_commit': self.source_commit, + 'target_commit': self.target_commit, + 'records_processed': self.records_processed, + 'records_failed': self.records_failed, + 'error_message': self.error_message, + 'metadata': self.metadata + } + +@dataclass +class ValidationResult: + """Represents validation results for a dataset""" + dataset_name: str + validation_time: datetime + status: ValidationStatus + total_records: int + passed_records: int + failed_records: int + warnings: List[str] = field(default_factory=list) + errors: List[str] = field(default_factory=list) + details: Dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for serialization""" + return { + 'dataset_name': self.dataset_name, + 'validation_time': self.validation_time.isoformat(), + 'status': self.status.value, + 'total_records': self.total_records, + 'passed_records': self.passed_records, + 'failed_records': self.failed_records, + 'warnings': self.warnings, + 'errors': self.errors, + 'details': self.details + } + +@dataclass +class DataQualityMetrics: + """Data quality metrics for monitoring""" + dataset_name: str + timestamp: datetime + completeness_score: float # 0-1 + accuracy_score: float # 0-1 + consistency_score: float # 0-1 + timeliness_score: float # 0-1 + overall_score: float # 0-1 + record_count: int + null_percentage: float + duplicate_percentage: float + schema_violations: int + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for serialization""" + return { + 'dataset_name': self.dataset_name, + 'timestamp': self.timestamp.isoformat(), + 'completeness_score': self.completeness_score, + 'accuracy_score': self.accuracy_score, + 'consistency_score': self.consistency_score, + 'timeliness_score': self.timeliness_score, + 'overall_score': self.overall_score, + 'record_count': self.record_count, + 'null_percentage': self.null_percentage, + 'duplicate_percentage': self.duplicate_percentage, + 'schema_violations': self.schema_violations + } +``` + +## Step 3: Data Extraction Components + +### Database Extractor + +```python +# src/extractors/database_extractor.py +import pandas as pd +import sqlalchemy as sa +from sqlalchemy import create_engine, text +from typing import Dict, List, Optional, Iterator +import logging +from datetime import datetime, timedelta +from config.settings import config +from src.models 
import PipelineRun + +logger = logging.getLogger(__name__) + +class DatabaseExtractor: + """Extract data from relational databases with incremental loading support""" + + def __init__(self): + self.engine = self._create_engine() + + def _create_engine(self) -> sa.Engine: + """Create database engine with connection pooling""" + db_config = config.database + connection_string = ( + f"postgresql://{db_config.username}:{db_config.password}" + f"@{db_config.host}:{db_config.port}/{db_config.database}" + ) + + return create_engine( + connection_string, + pool_size=10, + max_overflow=20, + pool_pre_ping=True, + pool_recycle=3600 + ) + + def extract_full_table(self, table_name: str, chunk_size: int = None) -> Iterator[pd.DataFrame]: + """Extract complete table data in chunks""" + + chunk_size = chunk_size or config.pipeline.batch_size + + try: + # Get total record count + count_query = f"SELECT COUNT(*) FROM {table_name}" + with self.engine.connect() as conn: + total_records = conn.execute(text(count_query)).scalar() + + logger.info(f"Extracting {total_records} records from {table_name}") + + # Extract data in chunks + offset = 0 + while offset < total_records: + query = f""" + SELECT * FROM {table_name} + ORDER BY id + LIMIT {chunk_size} OFFSET {offset} + """ + + chunk_df = pd.read_sql(query, self.engine) + + if chunk_df.empty: + break + + logger.debug(f"Extracted chunk: {len(chunk_df)} records (offset: {offset})") + yield chunk_df + + offset += chunk_size + + except Exception as e: + logger.error(f"Error extracting from {table_name}: {e}") + raise + + def extract_incremental( + self, + table_name: str, + timestamp_column: str, + last_extracted: Optional[datetime] = None, + chunk_size: int = None + ) -> Iterator[pd.DataFrame]: + """Extract data incrementally based on timestamp""" + + chunk_size = chunk_size or config.pipeline.batch_size + + # Default to 24 hours ago if no last extraction time + if last_extracted is None: + last_extracted = datetime.now() - timedelta(hours=24) + + try: + # Build incremental query + base_query = f""" + SELECT * FROM {table_name} + WHERE {timestamp_column} > %(last_extracted)s + ORDER BY {timestamp_column} + """ + + # Get total incremental records + count_query = f""" + SELECT COUNT(*) FROM {table_name} + WHERE {timestamp_column} > %(last_extracted)s + """ + + with self.engine.connect() as conn: + total_records = conn.execute( + text(count_query), + {'last_extracted': last_extracted} + ).scalar() + + logger.info(f"Extracting {total_records} incremental records from {table_name}") + + if total_records == 0: + logger.info("No new records to extract") + return + + # Extract in chunks + offset = 0 + while offset < total_records: + query = f"{base_query} LIMIT {chunk_size} OFFSET {offset}" + + chunk_df = pd.read_sql( + query, + self.engine, + params={'last_extracted': last_extracted} + ) + + if chunk_df.empty: + break + + logger.debug(f"Extracted incremental chunk: {len(chunk_df)} records") + yield chunk_df + + offset += chunk_size + + except Exception as e: + logger.error(f"Error in incremental extraction from {table_name}: {e}") + raise + + def extract_custom_query(self, query: str, params: Dict = None) -> pd.DataFrame: + """Execute custom SQL query and return results""" + + try: + logger.info(f"Executing custom query: {query[:100]}...") + + df = pd.read_sql(query, self.engine, params=params or {}) + + logger.info(f"Custom query returned {len(df)} records") + return df + + except Exception as e: + logger.error(f"Error executing custom query: {e}") + raise + + def 
get_table_schema(self, table_name: str) -> Dict[str, str]: + """Get table schema information""" + + try: + query = """ + SELECT column_name, data_type, is_nullable + FROM information_schema.columns + WHERE table_name = %(table_name)s + ORDER BY ordinal_position + """ + + schema_df = pd.read_sql(query, self.engine, params={'table_name': table_name}) + + return { + row['column_name']: { + 'data_type': row['data_type'], + 'is_nullable': row['is_nullable'] + } + for _, row in schema_df.iterrows() + } + + except Exception as e: + logger.error(f"Error getting schema for {table_name}: {e}") + raise + + def close(self): + """Close database connections""" + if self.engine: + self.engine.dispose() + logger.info("Database connections closed") + +# Example usage and testing +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + extractor = DatabaseExtractor() + + # Test schema extraction + try: + schema = extractor.get_table_schema('users') + print("Table schema:", schema) + except Exception as e: + print(f"Schema extraction failed: {e}") + + # Test data extraction + try: + for chunk in extractor.extract_full_table('users', chunk_size=100): + print(f"Extracted chunk with {len(chunk)} records") + break # Just test first chunk + except Exception as e: + print(f"Data extraction failed: {e}") + + extractor.close() +```## S +tep 4: Data Transformation Engine + +### Core Transformation Framework + +```python +# src/transformers/base_transformer.py +from abc import ABC, abstractmethod +import pandas as pd +from typing import Dict, List, Any, Optional +import logging +from datetime import datetime +from src.models import ValidationResult, ValidationStatus + +logger = logging.getLogger(__name__) + +class BaseTransformer(ABC): + """Abstract base class for all data transformers""" + + def __init__(self, name: str): + self.name = name + self.logger = logging.getLogger(f"{__name__}.{name}") + + @abstractmethod + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """Apply transformation to DataFrame""" + pass + + def validate_input(self, df: pd.DataFrame) -> bool: + """Validate input data before transformation""" + if df.empty: + self.logger.warning("Input DataFrame is empty") + return False + return True + + def validate_output(self, df: pd.DataFrame) -> bool: + """Validate output data after transformation""" + if df.empty: + self.logger.warning("Output DataFrame is empty") + return False + return True + + def execute(self, df: pd.DataFrame) -> pd.DataFrame: + """Execute transformation with validation""" + self.logger.info(f"Starting transformation: {self.name}") + + if not self.validate_input(df): + raise ValueError(f"Input validation failed for {self.name}") + + input_count = len(df) + result_df = self.transform(df) + output_count = len(result_df) + + if not self.validate_output(result_df): + raise ValueError(f"Output validation failed for {self.name}") + + self.logger.info( + f"Transformation {self.name} complete: {input_count} -> {output_count} records" + ) + + return result_df + +class DataCleaningTransformer(BaseTransformer): + """Clean and standardize data""" + + def __init__(self, cleaning_rules: Dict[str, Any]): + super().__init__("DataCleaning") + self.cleaning_rules = cleaning_rules + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """Apply data cleaning rules""" + result_df = df.copy() + + # Remove duplicates + if self.cleaning_rules.get('remove_duplicates', False): + initial_count = len(result_df) + result_df = result_df.drop_duplicates() + removed_count = 
initial_count - len(result_df) + if removed_count > 0: + self.logger.info(f"Removed {removed_count} duplicate records") + + # Handle missing values + null_handling = self.cleaning_rules.get('null_handling', {}) + for column, strategy in null_handling.items(): + if column in result_df.columns: + if strategy == 'drop': + result_df = result_df.dropna(subset=[column]) + elif strategy == 'fill_zero': + result_df[column] = result_df[column].fillna(0) + elif strategy == 'fill_mean': + result_df[column] = result_df[column].fillna(result_df[column].mean()) + elif isinstance(strategy, (str, int, float)): + result_df[column] = result_df[column].fillna(strategy) + + # Data type conversions + type_conversions = self.cleaning_rules.get('type_conversions', {}) + for column, target_type in type_conversions.items(): + if column in result_df.columns: + try: + if target_type == 'datetime': + result_df[column] = pd.to_datetime(result_df[column]) + elif target_type == 'numeric': + result_df[column] = pd.to_numeric(result_df[column], errors='coerce') + else: + result_df[column] = result_df[column].astype(target_type) + except Exception as e: + self.logger.warning(f"Type conversion failed for {column}: {e}") + + # String cleaning + string_cleaning = self.cleaning_rules.get('string_cleaning', {}) + for column, operations in string_cleaning.items(): + if column in result_df.columns and result_df[column].dtype == 'object': + if 'strip' in operations: + result_df[column] = result_df[column].str.strip() + if 'lower' in operations: + result_df[column] = result_df[column].str.lower() + if 'upper' in operations: + result_df[column] = result_df[column].str.upper() + if 'remove_special_chars' in operations: + result_df[column] = result_df[column].str.replace(r'[^\w\s]', '', regex=True) + + return result_df + +class BusinessLogicTransformer(BaseTransformer): + """Apply business logic transformations""" + + def __init__(self, business_rules: Dict[str, Any]): + super().__init__("BusinessLogic") + self.business_rules = business_rules + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """Apply business logic rules""" + result_df = df.copy() + + # Calculate derived columns + derived_columns = self.business_rules.get('derived_columns', {}) + for new_column, formula in derived_columns.items(): + try: + # Simple formula evaluation (extend as needed) + if isinstance(formula, str): + result_df[new_column] = result_df.eval(formula) + elif callable(formula): + result_df[new_column] = result_df.apply(formula, axis=1) + except Exception as e: + self.logger.error(f"Error calculating {new_column}: {e}") + result_df[new_column] = None + + # Apply filters + filters = self.business_rules.get('filters', []) + for filter_condition in filters: + try: + initial_count = len(result_df) + result_df = result_df.query(filter_condition) + filtered_count = initial_count - len(result_df) + if filtered_count > 0: + self.logger.info(f"Filtered out {filtered_count} records with: {filter_condition}") + except Exception as e: + self.logger.error(f"Error applying filter '{filter_condition}': {e}") + + # Categorization + categorizations = self.business_rules.get('categorizations', {}) + for column, categories in categorizations.items(): + if column in result_df.columns: + def categorize_value(value): + for category, condition in categories.items(): + if callable(condition) and condition(value): + return category + elif isinstance(condition, (list, tuple)) and value in condition: + return category + elif isinstance(condition, dict): + if 
condition.get('min', float('-inf')) <= value <= condition.get('max', float('inf')): + return category + return 'Other' + + result_df[f"{column}_category"] = result_df[column].apply(categorize_value) + + return result_df + +class AggregationTransformer(BaseTransformer): + """Perform data aggregations""" + + def __init__(self, aggregation_rules: Dict[str, Any]): + super().__init__("Aggregation") + self.aggregation_rules = aggregation_rules + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """Apply aggregation rules""" + + group_by = self.aggregation_rules.get('group_by', []) + aggregations = self.aggregation_rules.get('aggregations', {}) + + if not group_by or not aggregations: + self.logger.warning("No grouping or aggregation rules specified") + return df + + try: + # Perform groupby aggregation + result_df = df.groupby(group_by).agg(aggregations).reset_index() + + # Flatten column names if multi-level + if isinstance(result_df.columns, pd.MultiIndex): + result_df.columns = ['_'.join(col).strip() if col[1] else col[0] + for col in result_df.columns.values] + + self.logger.info(f"Aggregated {len(df)} records into {len(result_df)} groups") + return result_df + + except Exception as e: + self.logger.error(f"Aggregation failed: {e}") + raise + +class TransformationPipeline: + """Chain multiple transformers together""" + + def __init__(self, transformers: List[BaseTransformer]): + self.transformers = transformers + self.logger = logging.getLogger(__name__) + + def execute(self, df: pd.DataFrame) -> pd.DataFrame: + """Execute all transformers in sequence""" + + result_df = df.copy() + + for transformer in self.transformers: + try: + result_df = transformer.execute(result_df) + except Exception as e: + self.logger.error(f"Transformation pipeline failed at {transformer.name}: {e}") + raise + + self.logger.info(f"Transformation pipeline complete: {len(df)} -> {len(result_df)} records") + return result_df + +# Example transformation configurations +SAMPLE_CLEANING_RULES = { + 'remove_duplicates': True, + 'null_handling': { + 'email': 'drop', + 'age': 'fill_zero', + 'income': 'fill_mean', + 'status': 'active' + }, + 'type_conversions': { + 'created_at': 'datetime', + 'age': 'int', + 'income': 'float' + }, + 'string_cleaning': { + 'email': ['strip', 'lower'], + 'name': ['strip'], + 'status': ['strip', 'lower'] + } +} + +SAMPLE_BUSINESS_RULES = { + 'derived_columns': { + 'age_group': lambda row: 'Young' if row['age'] < 30 else 'Middle' if row['age'] < 60 else 'Senior', + 'income_bracket': 'income // 10000 * 10000' # Round to nearest 10k + }, + 'filters': [ + 'age >= 18', + 'income > 0', + 'status == "active"' + ], + 'categorizations': { + 'income': { + 'Low': {'min': 0, 'max': 30000}, + 'Medium': {'min': 30001, 'max': 80000}, + 'High': {'min': 80001, 'max': float('inf')} + } + } +} + +SAMPLE_AGGREGATION_RULES = { + 'group_by': ['age_group', 'income_category'], + 'aggregations': { + 'income': ['mean', 'median', 'count'], + 'age': ['mean', 'min', 'max'] + } +} + +# Example usage +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # Create sample data + sample_data = pd.DataFrame({ + 'id': range(1, 101), + 'name': [f'User {i}' for i in range(1, 101)], + 'email': [f'user{i}@example.com' if i % 10 != 0 else None for i in range(1, 101)], + 'age': [20 + (i % 50) for i in range(1, 101)], + 'income': [30000 + (i * 1000) for i in range(1, 101)], + 'status': ['active' if i % 5 != 0 else 'inactive' for i in range(1, 101)] + }) + + # Create transformation pipeline + transformers = [ 
+ DataCleaningTransformer(SAMPLE_CLEANING_RULES), + BusinessLogicTransformer(SAMPLE_BUSINESS_RULES), + AggregationTransformer(SAMPLE_AGGREGATION_RULES) + ] + + pipeline = TransformationPipeline(transformers) + + # Execute pipeline + try: + result = pipeline.execute(sample_data) + print("Transformation successful!") + print(result.head()) + except Exception as e: + print(f"Transformation failed: {e}") +``` + +## Step 5: Data Validation Framework + +### Comprehensive Data Validation + +```python +# src/validators/data_validator.py +import pandas as pd +import numpy as np +from typing import Dict, List, Any, Optional, Callable +import logging +from datetime import datetime +import jsonschema +from src.models import ValidationResult, ValidationStatus + +logger = logging.getLogger(__name__) + +class ValidationRule: + """Individual validation rule""" + + def __init__(self, name: str, description: str, validator_func: Callable, severity: str = 'error'): + self.name = name + self.description = description + self.validator_func = validator_func + self.severity = severity # 'error', 'warning', 'info' + + def validate(self, df: pd.DataFrame) -> Dict[str, Any]: + """Execute validation rule""" + try: + result = self.validator_func(df) + return { + 'rule_name': self.name, + 'description': self.description, + 'severity': self.severity, + 'passed': result.get('passed', False), + 'message': result.get('message', ''), + 'details': result.get('details', {}), + 'affected_records': result.get('affected_records', 0) + } + except Exception as e: + return { + 'rule_name': self.name, + 'description': self.description, + 'severity': 'error', + 'passed': False, + 'message': f"Validation rule execution failed: {str(e)}", + 'details': {}, + 'affected_records': 0 + } + +class DataValidator: + """Comprehensive data validation framework""" + + def __init__(self): + self.rules: List[ValidationRule] = [] + self.logger = logging.getLogger(__name__) + + def add_rule(self, rule: ValidationRule): + """Add validation rule""" + self.rules.append(rule) + self.logger.debug(f"Added validation rule: {rule.name}") + + def add_schema_validation(self, schema: Dict[str, Any]): + """Add JSON schema validation""" + def schema_validator(df: pd.DataFrame) -> Dict[str, Any]: + violations = [] + + for column, column_schema in schema.get('properties', {}).items(): + if column not in df.columns: + if column in schema.get('required', []): + violations.append(f"Required column '{column}' is missing") + continue + + # Type validation + expected_type = column_schema.get('type') + if expected_type: + if expected_type == 'string' and df[column].dtype != 'object': + violations.append(f"Column '{column}' should be string type") + elif expected_type == 'number' and not pd.api.types.is_numeric_dtype(df[column]): + violations.append(f"Column '{column}' should be numeric type") + elif expected_type == 'integer' and df[column].dtype not in ['int64', 'int32']: + violations.append(f"Column '{column}' should be integer type") + + # Range validation + if 'minimum' in column_schema: + min_violations = (df[column] < column_schema['minimum']).sum() + if min_violations > 0: + violations.append(f"Column '{column}' has {min_violations} values below minimum {column_schema['minimum']}") + + if 'maximum' in column_schema: + max_violations = (df[column] > column_schema['maximum']).sum() + if max_violations > 0: + violations.append(f"Column '{column}' has {max_violations} values above maximum {column_schema['maximum']}") + + return { + 'passed': len(violations) == 0, + 
'message': '; '.join(violations) if violations else 'Schema validation passed', + 'details': {'violations': violations}, + 'affected_records': len(violations) + } + + rule = ValidationRule( + name="schema_validation", + description="Validate data against JSON schema", + validator_func=schema_validator, + severity='error' + ) + self.add_rule(rule) + + def add_completeness_check(self, required_columns: List[str], threshold: float = 0.95): + """Add data completeness validation""" + def completeness_validator(df: pd.DataFrame) -> Dict[str, Any]: + issues = [] + + for column in required_columns: + if column not in df.columns: + issues.append(f"Required column '{column}' is missing") + continue + + completeness = 1 - (df[column].isnull().sum() / len(df)) + if completeness < threshold: + issues.append(f"Column '{column}' completeness {completeness:.2%} below threshold {threshold:.2%}") + + return { + 'passed': len(issues) == 0, + 'message': '; '.join(issues) if issues else f'Completeness check passed for {len(required_columns)} columns', + 'details': {'threshold': threshold, 'issues': issues}, + 'affected_records': len(issues) + } + + rule = ValidationRule( + name="completeness_check", + description=f"Check data completeness for required columns (threshold: {threshold:.2%})", + validator_func=completeness_validator, + severity='error' + ) + self.add_rule(rule) + + def add_uniqueness_check(self, unique_columns: List[str]): + """Add uniqueness validation""" + def uniqueness_validator(df: pd.DataFrame) -> Dict[str, Any]: + issues = [] + + for column in unique_columns: + if column not in df.columns: + issues.append(f"Column '{column}' not found for uniqueness check") + continue + + duplicates = df[column].duplicated().sum() + if duplicates > 0: + issues.append(f"Column '{column}' has {duplicates} duplicate values") + + return { + 'passed': len(issues) == 0, + 'message': '; '.join(issues) if issues else f'Uniqueness check passed for {len(unique_columns)} columns', + 'details': {'issues': issues}, + 'affected_records': sum([int(issue.split()[3]) for issue in issues if 'duplicate' in issue]) + } + + rule = ValidationRule( + name="uniqueness_check", + description="Check for duplicate values in unique columns", + validator_func=uniqueness_validator, + severity='error' + ) + self.add_rule(rule) + + def add_referential_integrity_check(self, foreign_keys: Dict[str, Dict[str, Any]]): + """Add referential integrity validation""" + def referential_validator(df: pd.DataFrame) -> Dict[str, Any]: + issues = [] + + for fk_column, reference_info in foreign_keys.items(): + if fk_column not in df.columns: + issues.append(f"Foreign key column '{fk_column}' not found") + continue + + # For this example, we'll check against a list of valid values + # In practice, this would query the reference table + valid_values = reference_info.get('valid_values', []) + if valid_values: + invalid_count = (~df[fk_column].isin(valid_values)).sum() + if invalid_count > 0: + issues.append(f"Column '{fk_column}' has {invalid_count} invalid references") + + return { + 'passed': len(issues) == 0, + 'message': '; '.join(issues) if issues else 'Referential integrity check passed', + 'details': {'issues': issues}, + 'affected_records': sum([int(issue.split()[3]) for issue in issues if 'invalid' in issue]) + } + + rule = ValidationRule( + name="referential_integrity", + description="Check referential integrity constraints", + validator_func=referential_validator, + severity='error' + ) + self.add_rule(rule) + + def 
add_business_rule_validation(self, business_rules: List[Dict[str, Any]]): + """Add custom business rule validation""" + def business_rule_validator(df: pd.DataFrame) -> Dict[str, Any]: + issues = [] + + for rule in business_rules: + rule_name = rule.get('name', 'unnamed_rule') + condition = rule.get('condition', '') + + try: + violations = df.query(f"not ({condition})") + if len(violations) > 0: + issues.append(f"Business rule '{rule_name}' violated by {len(violations)} records") + except Exception as e: + issues.append(f"Business rule '{rule_name}' evaluation failed: {str(e)}") + + return { + 'passed': len(issues) == 0, + 'message': '; '.join(issues) if issues else f'Business rule validation passed for {len(business_rules)} rules', + 'details': {'issues': issues}, + 'affected_records': len(issues) + } + + rule = ValidationRule( + name="business_rules", + description="Validate custom business rules", + validator_func=business_rule_validator, + severity='warning' + ) + self.add_rule(rule) + + def validate(self, df: pd.DataFrame, dataset_name: str = "unknown") -> ValidationResult: + """Execute all validation rules""" + + self.logger.info(f"Starting validation for dataset: {dataset_name}") + + validation_results = [] + total_errors = 0 + total_warnings = 0 + + for rule in self.rules: + result = rule.validate(df) + validation_results.append(result) + + if not result['passed']: + if result['severity'] == 'error': + total_errors += 1 + elif result['severity'] == 'warning': + total_warnings += 1 + + # Determine overall status + if total_errors > 0: + status = ValidationStatus.FAILED + elif total_warnings > 0: + status = ValidationStatus.WARNING + else: + status = ValidationStatus.PASSED + + # Collect error and warning messages + errors = [r['message'] for r in validation_results if not r['passed'] and r['severity'] == 'error'] + warnings = [r['message'] for r in validation_results if not r['passed'] and r['severity'] == 'warning'] + + # Calculate passed/failed records (simplified) + failed_records = sum([r['affected_records'] for r in validation_results if not r['passed']]) + passed_records = len(df) - failed_records + + validation_result = ValidationResult( + dataset_name=dataset_name, + validation_time=datetime.now(), + status=status, + total_records=len(df), + passed_records=max(0, passed_records), + failed_records=failed_records, + warnings=warnings, + errors=errors, + details={'rule_results': validation_results} + ) + + self.logger.info(f"Validation complete for {dataset_name}: {status.value}") + return validation_result + +# Example validation configurations +SAMPLE_SCHEMA = { + "type": "object", + "properties": { + "id": {"type": "integer", "minimum": 1}, + "name": {"type": "string"}, + "email": {"type": "string"}, + "age": {"type": "integer", "minimum": 0, "maximum": 150}, + "income": {"type": "number", "minimum": 0}, + "status": {"type": "string"} + }, + "required": ["id", "name", "email"] +} + +SAMPLE_BUSINESS_RULES = [ + { + "name": "adult_users_only", + "condition": "age >= 18" + }, + { + "name": "valid_email_format", + "condition": "email.str.contains('@', na=False)" + }, + { + "name": "positive_income", + "condition": "income > 0" + } +] + +# Example usage +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # Create sample data with some issues + sample_data = pd.DataFrame({ + 'id': [1, 2, 3, 4, 5, 5], # Duplicate ID + 'name': ['Alice', 'Bob', None, 'David', 'Eve', 'Frank'], # Missing name + 'email': ['alice@example.com', 'bob@example.com', 'invalid-email', 
'david@example.com', 'eve@example.com', 'frank@example.com'], + 'age': [25, 30, 17, 35, -5, 40], # Underage and negative age + 'income': [50000, 60000, 0, 70000, 80000, -1000], # Zero and negative income + 'status': ['active', 'active', 'inactive', 'active', 'active', 'active'] + }) + + # Create validator + validator = DataValidator() + + # Add validation rules + validator.add_schema_validation(SAMPLE_SCHEMA) + validator.add_completeness_check(['id', 'name', 'email'], threshold=0.9) + validator.add_uniqueness_check(['id', 'email']) + validator.add_business_rule_validation(SAMPLE_BUSINESS_RULES) + + # Run validation + result = validator.validate(sample_data, "sample_dataset") + + print(f"Validation Status: {result.status.value}") + print(f"Total Records: {result.total_records}") + print(f"Passed Records: {result.passed_records}") + print(f"Failed Records: {result.failed_records}") + + if result.errors: + print("\nErrors:") + for error in result.errors: + print(f" - {error}") + + if result.warnings: + print("\nWarnings:") + for warning in result.warnings: + print(f" - {warning}") +```# +# Step 6: Transactional ETL Pipeline with lakeFS + +### Core ETL Pipeline with Transactions + +```python +# src/pipeline.py +import lakefs +from lakefs.exceptions import ConflictError, NotFoundException +import pandas as pd +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Any +import uuid +import json +import traceback +from contextlib import contextmanager + +from config.settings import config +from src.models import PipelineRun, PipelineStatus, ValidationResult +from src.extractors.database_extractor import DatabaseExtractor +from src.transformers.base_transformer import TransformationPipeline +from src.validators.data_validator import DataValidator +from src.exceptions import ETLPipelineError, DataValidationError, TransformationError + +logger = logging.getLogger(__name__) + +class TransactionalETLPipeline: + """Production-ready ETL pipeline with lakeFS transactions""" + + def __init__(self, pipeline_name: str): + self.pipeline_name = pipeline_name + self.client = lakefs.Client( + host=config.lakefs.host, + username=config.lakefs.username, + password=config.lakefs.password + ) + self.repo = self.client.repositories.get(config.lakefs.repository) + self.logger = logging.getLogger(f"{__name__}.{pipeline_name}") + + # Initialize components + self.extractor = DatabaseExtractor() + self.validator = DataValidator() + + # Pipeline state + self.current_run: Optional[PipelineRun] = None + self.working_branch: Optional[str] = None + + @contextmanager + def pipeline_transaction(self, source_branch: str = 'main'): + """Context manager for transactional pipeline execution""" + + run_id = str(uuid.uuid4()) + branch_name = f"etl-run-{run_id[:8]}" + + try: + # Create working branch + self.logger.info(f"Creating working branch: {branch_name}") + working_branch = self.repo.branches.create(branch_name, source_reference=source_branch) + self.working_branch = branch_name + + # Initialize pipeline run + self.current_run = PipelineRun( + run_id=run_id, + pipeline_name=self.pipeline_name, + start_time=datetime.now(), + status=PipelineStatus.RUNNING, + source_commit=self.repo.branches.get(source_branch).head.id + ) + + # Save run metadata + self._save_run_metadata(working_branch) + + yield working_branch + + # If we get here, the pipeline succeeded + self.current_run.status = PipelineStatus.SUCCESS + self.current_run.end_time = datetime.now() + self.current_run.target_commit = 
working_branch.head.id + + # Final commit with run summary + self._save_run_metadata(working_branch) + working_branch.commits.create( + message=f"ETL pipeline {self.pipeline_name} completed successfully", + metadata={ + 'pipeline_name': self.pipeline_name, + 'run_id': run_id, + 'status': 'success', + 'records_processed': str(self.current_run.records_processed), + 'duration_seconds': str((self.current_run.end_time - self.current_run.start_time).total_seconds()) + } + ) + + self.logger.info(f"Pipeline {self.pipeline_name} completed successfully") + + except Exception as e: + # Pipeline failed, update status + if self.current_run: + self.current_run.status = PipelineStatus.FAILED + self.current_run.end_time = datetime.now() + self.current_run.error_message = str(e) + + # Save failure metadata + try: + if self.working_branch: + branch = self.repo.branches.get(self.working_branch) + self._save_run_metadata(branch) + branch.commits.create( + message=f"ETL pipeline {self.pipeline_name} failed: {str(e)[:100]}", + metadata={ + 'pipeline_name': self.pipeline_name, + 'run_id': run_id, + 'status': 'failed', + 'error': str(e)[:500] + } + ) + except Exception as save_error: + self.logger.error(f"Failed to save error metadata: {save_error}") + + self.logger.error(f"Pipeline {self.pipeline_name} failed: {e}") + self.logger.debug(traceback.format_exc()) + raise + + finally: + # Cleanup + self.working_branch = None + + def _save_run_metadata(self, branch): + """Save pipeline run metadata to lakeFS""" + if self.current_run: + metadata_json = json.dumps(self.current_run.to_dict(), indent=2) + branch.objects.upload( + path=f"logs/pipeline_runs/{self.current_run.run_id}.json", + data=metadata_json.encode(), + content_type='application/json' + ) + + def extract_data(self, branch, extraction_config: Dict[str, Any]) -> pd.DataFrame: + """Extract data from configured sources""" + + self.logger.info("Starting data extraction") + + extraction_type = extraction_config.get('type', 'full') + table_name = extraction_config.get('table_name') + + if not table_name: + raise ETLPipelineError("Table name not specified in extraction config") + + try: + if extraction_type == 'incremental': + # Get last extraction timestamp + last_extracted = self._get_last_extraction_time(table_name) + timestamp_column = extraction_config.get('timestamp_column', 'updated_at') + + data_chunks = list(self.extractor.extract_incremental( + table_name=table_name, + timestamp_column=timestamp_column, + last_extracted=last_extracted, + chunk_size=config.pipeline.batch_size + )) + else: + # Full extraction + data_chunks = list(self.extractor.extract_full_table( + table_name=table_name, + chunk_size=config.pipeline.batch_size + )) + + if not data_chunks: + self.logger.warning("No data extracted") + return pd.DataFrame() + + # Combine chunks + combined_df = pd.concat(data_chunks, ignore_index=True) + + # Save raw data to lakeFS + raw_data_path = f"raw_data/{table_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.parquet" + self._save_dataframe_to_lakefs(branch, combined_df, raw_data_path) + + self.logger.info(f"Extracted {len(combined_df)} records from {table_name}") + + if self.current_run: + self.current_run.records_processed += len(combined_df) + + return combined_df + + except Exception as e: + raise ETLPipelineError(f"Data extraction failed: {str(e)}") from e + + def transform_data(self, branch, df: pd.DataFrame, transformation_pipeline: TransformationPipeline) -> pd.DataFrame: + """Apply transformations to extracted data""" + + 
self.logger.info("Starting data transformation") + + try: + transformed_df = transformation_pipeline.execute(df) + + # Save transformed data + transformed_data_path = f"processed_data/transformed_{datetime.now().strftime('%Y%m%d_%H%M%S')}.parquet" + self._save_dataframe_to_lakefs(branch, transformed_df, transformed_data_path) + + self.logger.info(f"Transformed data: {len(df)} -> {len(transformed_df)} records") + return transformed_df + + except Exception as e: + raise TransformationError(f"Data transformation failed: {str(e)}") from e + + def validate_data(self, branch, df: pd.DataFrame, dataset_name: str) -> ValidationResult: + """Validate transformed data""" + + self.logger.info("Starting data validation") + + try: + validation_result = self.validator.validate(df, dataset_name) + + # Save validation results + validation_path = f"validation_results/{dataset_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + validation_json = json.dumps(validation_result.to_dict(), indent=2) + branch.objects.upload( + path=validation_path, + data=validation_json.encode(), + content_type='application/json' + ) + + # Handle validation failures + if validation_result.status.value == 'failed': + # Save failed records for analysis + if validation_result.failed_records > 0: + failed_data_path = f"failed_data/{dataset_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.parquet" + # In practice, you'd identify and save the actual failed records + self.logger.warning(f"Validation failed: {validation_result.failed_records} records failed validation") + + if not config.pipeline.validation_enabled: + self.logger.warning("Validation failed but continuing due to configuration") + else: + raise DataValidationError(f"Data validation failed: {'; '.join(validation_result.errors)}") + + self.logger.info(f"Data validation completed: {validation_result.status.value}") + + if self.current_run: + self.current_run.records_failed += validation_result.failed_records + + return validation_result + + except Exception as e: + if isinstance(e, DataValidationError): + raise + raise DataValidationError(f"Data validation process failed: {str(e)}") from e + + def load_data(self, branch, df: pd.DataFrame, load_config: Dict[str, Any]) -> bool: + """Load validated data to final destination""" + + self.logger.info("Starting data loading") + + try: + # Save to final location in lakeFS + final_data_path = load_config.get('path', f"validated_data/final_{datetime.now().strftime('%Y%m%d_%H%M%S')}.parquet") + self._save_dataframe_to_lakefs(branch, df, final_data_path) + + # Optional: Load to external systems (database, data warehouse, etc.) 
+ external_targets = load_config.get('external_targets', []) + for target in external_targets: + self._load_to_external_target(df, target) + + self.logger.info(f"Data loading completed: {len(df)} records") + return True + + except Exception as e: + raise ETLPipelineError(f"Data loading failed: {str(e)}") from e + + def _save_dataframe_to_lakefs(self, branch, df: pd.DataFrame, path: str): + """Save DataFrame to lakeFS as Parquet""" + import io + + buffer = io.BytesIO() + df.to_parquet(buffer, index=False) + + branch.objects.upload( + path=path, + data=buffer.getvalue(), + content_type='application/octet-stream' + ) + + self.logger.debug(f"Saved DataFrame to {path}: {len(df)} records") + + def _load_to_external_target(self, df: pd.DataFrame, target_config: Dict[str, Any]): + """Load data to external target (database, API, etc.)""" + target_type = target_config.get('type') + + if target_type == 'database': + # Example database loading + table_name = target_config.get('table_name') + if_exists = target_config.get('if_exists', 'append') + + df.to_sql( + table_name, + self.extractor.engine, + if_exists=if_exists, + index=False, + chunksize=config.pipeline.batch_size + ) + + self.logger.info(f"Loaded {len(df)} records to database table {table_name}") + + elif target_type == 'api': + # Example API loading + api_url = target_config.get('url') + # Implementation would depend on specific API requirements + self.logger.info(f"Would load {len(df)} records to API endpoint {api_url}") + + else: + self.logger.warning(f"Unknown target type: {target_type}") + + def _get_last_extraction_time(self, table_name: str) -> Optional[datetime]: + """Get timestamp of last successful extraction""" + try: + # Look for checkpoint files + main_branch = self.repo.branches.get('main') + checkpoint_path = f"checkpoints/{table_name}_last_extraction.json" + + checkpoint_obj = main_branch.objects.get(checkpoint_path) + checkpoint_data = json.loads(checkpoint_obj.reader().read().decode()) + + return datetime.fromisoformat(checkpoint_data['last_extraction_time']) + + except NotFoundException: + # No previous extraction found + self.logger.info(f"No previous extraction checkpoint found for {table_name}") + return None + except Exception as e: + self.logger.warning(f"Error reading extraction checkpoint: {e}") + return None + + def _save_extraction_checkpoint(self, branch, table_name: str, extraction_time: datetime): + """Save extraction checkpoint""" + checkpoint_data = { + 'table_name': table_name, + 'last_extraction_time': extraction_time.isoformat(), + 'pipeline_run_id': self.current_run.run_id if self.current_run else None + } + + checkpoint_path = f"checkpoints/{table_name}_last_extraction.json" + branch.objects.upload( + path=checkpoint_path, + data=json.dumps(checkpoint_data, indent=2).encode(), + content_type='application/json' + ) + + def run_pipeline( + self, + extraction_config: Dict[str, Any], + transformation_pipeline: TransformationPipeline, + load_config: Dict[str, Any], + source_branch: str = 'main' + ) -> PipelineRun: + """Execute complete ETL pipeline""" + + self.logger.info(f"Starting ETL pipeline: {self.pipeline_name}") + + with self.pipeline_transaction(source_branch) as working_branch: + + # Extract + extracted_df = self.extract_data(working_branch, extraction_config) + + if extracted_df.empty: + self.logger.info("No data to process, pipeline completed") + return self.current_run + + # Transform + transformed_df = self.transform_data(working_branch, extracted_df, transformation_pipeline) + + # Validate + 
validation_result = self.validate_data(working_branch, transformed_df, extraction_config.get('table_name', 'unknown')) + + # Load + self.load_data(working_branch, transformed_df, load_config) + + # Save extraction checkpoint + self._save_extraction_checkpoint(working_branch, extraction_config.get('table_name'), datetime.now()) + + self.logger.info(f"ETL pipeline {self.pipeline_name} completed successfully") + + return self.current_run + + def merge_to_target(self, target_branch: str = 'main', delete_working_branch: bool = True) -> str: + """Merge working branch to target branch""" + + if not self.current_run or not self.current_run.target_commit: + raise ETLPipelineError("No successful pipeline run to merge") + + working_branch_name = f"etl-run-{self.current_run.run_id[:8]}" + + try: + target_branch_obj = self.repo.branches.get(target_branch) + + merge_result = target_branch_obj.merge( + source_reference=working_branch_name, + message=f"Merge ETL pipeline {self.pipeline_name} results", + metadata={ + 'pipeline_name': self.pipeline_name, + 'run_id': self.current_run.run_id, + 'records_processed': str(self.current_run.records_processed) + } + ) + + self.logger.info(f"Merged {working_branch_name} to {target_branch}: {merge_result.id}") + + # Optionally delete working branch + if delete_working_branch: + try: + self.repo.branches.delete(working_branch_name) + self.logger.info(f"Deleted working branch: {working_branch_name}") + except Exception as e: + self.logger.warning(f"Failed to delete working branch: {e}") + + return merge_result.id + + except ConflictError as e: + self.logger.error(f"Merge conflict: {e}") + raise ETLPipelineError(f"Merge conflict when merging to {target_branch}") from e + + def cleanup(self): + """Cleanup resources""" + if self.extractor: + self.extractor.close() + +# Example pipeline configuration +SAMPLE_EXTRACTION_CONFIG = { + 'type': 'incremental', # or 'full' + 'table_name': 'users', + 'timestamp_column': 'updated_at' +} + +SAMPLE_LOAD_CONFIG = { + 'path': 'validated_data/users.parquet', + 'external_targets': [ + { + 'type': 'database', + 'table_name': 'processed_users', + 'if_exists': 'replace' + } + ] +} + +# Example usage +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + from src.transformers.base_transformer import ( + DataCleaningTransformer, + BusinessLogicTransformer, + TransformationPipeline, + SAMPLE_CLEANING_RULES, + SAMPLE_BUSINESS_RULES + ) + from src.validators.data_validator import DataValidator, SAMPLE_SCHEMA, SAMPLE_BUSINESS_RULES as VALIDATION_RULES + + # Create pipeline + pipeline = TransactionalETLPipeline("user_processing_pipeline") + + # Setup validation + pipeline.validator.add_schema_validation(SAMPLE_SCHEMA) + pipeline.validator.add_completeness_check(['id', 'name', 'email']) + pipeline.validator.add_uniqueness_check(['id', 'email']) + pipeline.validator.add_business_rule_validation(VALIDATION_RULES) + + # Setup transformation pipeline + transformers = [ + DataCleaningTransformer(SAMPLE_CLEANING_RULES), + BusinessLogicTransformer(SAMPLE_BUSINESS_RULES) + ] + transformation_pipeline = TransformationPipeline(transformers) + + try: + # Run pipeline + run_result = pipeline.run_pipeline( + extraction_config=SAMPLE_EXTRACTION_CONFIG, + transformation_pipeline=transformation_pipeline, + load_config=SAMPLE_LOAD_CONFIG + ) + + print(f"Pipeline completed: {run_result.status.value}") + print(f"Records processed: {run_result.records_processed}") + + # Merge to main if successful + if run_result.status == PipelineStatus.SUCCESS: + 
merge_commit = pipeline.merge_to_target('main') + print(f"Merged to main: {merge_commit}") + + except Exception as e: + print(f"Pipeline failed: {e}") + + finally: + pipeline.cleanup() +```## Step 7: +Error Handling and Recovery + +### Comprehensive Error Handling System + +```python +# src/exceptions.py +class ETLPipelineError(Exception): + """Base exception for ETL pipeline errors""" + pass + +class DataExtractionError(ETLPipelineError): + """Raised when data extraction fails""" + pass + +class TransformationError(ETLPipelineError): + """Raised when data transformation fails""" + pass + +class DataValidationError(ETLPipelineError): + """Raised when data validation fails""" + pass + +class DataLoadingError(ETLPipelineError): + """Raised when data loading fails""" + pass + +# src/error_handler.py +import logging +from typing import Dict, Any, Optional, Callable +from datetime import datetime +import traceback +import json +from enum import Enum + +class ErrorSeverity(Enum): + LOW = "low" + MEDIUM = "medium" + HIGH = "high" + CRITICAL = "critical" + +class ErrorHandler: + """Centralized error handling and recovery system""" + + def __init__(self, pipeline_name: str): + self.pipeline_name = pipeline_name + self.logger = logging.getLogger(f"{__name__}.{pipeline_name}") + self.error_log = [] + + def handle_error( + self, + error: Exception, + context: Dict[str, Any], + severity: ErrorSeverity = ErrorSeverity.HIGH, + recovery_action: Optional[Callable] = None + ) -> bool: + """Handle pipeline errors with optional recovery""" + + error_info = { + 'timestamp': datetime.now().isoformat(), + 'pipeline_name': self.pipeline_name, + 'error_type': type(error).__name__, + 'error_message': str(error), + 'severity': severity.value, + 'context': context, + 'traceback': traceback.format_exc() + } + + self.error_log.append(error_info) + + # Log error based on severity + if severity == ErrorSeverity.CRITICAL: + self.logger.critical(f"Critical error in {self.pipeline_name}: {error}") + elif severity == ErrorSeverity.HIGH: + self.logger.error(f"High severity error in {self.pipeline_name}: {error}") + elif severity == ErrorSeverity.MEDIUM: + self.logger.warning(f"Medium severity error in {self.pipeline_name}: {error}") + else: + self.logger.info(f"Low severity error in {self.pipeline_name}: {error}") + + # Attempt recovery if provided + if recovery_action: + try: + self.logger.info("Attempting error recovery...") + recovery_result = recovery_action() + if recovery_result: + self.logger.info("Error recovery successful") + return True + else: + self.logger.warning("Error recovery failed") + except Exception as recovery_error: + self.logger.error(f"Error recovery failed with exception: {recovery_error}") + + return False + + def save_error_log(self, branch, run_id: str): + """Save error log to lakeFS""" + if self.error_log: + error_log_path = f"logs/errors/{run_id}_errors.json" + error_log_json = json.dumps(self.error_log, indent=2) + + branch.objects.upload( + path=error_log_path, + data=error_log_json.encode(), + content_type='application/json' + ) + + self.logger.info(f"Error log saved to {error_log_path}") + +# src/retry_handler.py +import time +import random +from typing import Callable, Any, Optional +from functools import wraps + +class RetryHandler: + """Retry mechanism with exponential backoff""" + + def __init__(self, max_retries: int = 3, base_delay: float = 1.0, max_delay: float = 60.0): + self.max_retries = max_retries + self.base_delay = base_delay + self.max_delay = max_delay + self.logger = 
logging.getLogger(__name__) + + def retry_with_backoff( + self, + func: Callable, + *args, + retryable_exceptions: tuple = (Exception,), + **kwargs + ) -> Any: + """Execute function with retry and exponential backoff""" + + last_exception = None + + for attempt in range(self.max_retries + 1): + try: + return func(*args, **kwargs) + + except retryable_exceptions as e: + last_exception = e + + if attempt == self.max_retries: + self.logger.error(f"Function {func.__name__} failed after {self.max_retries} retries") + raise e + + # Calculate delay with jitter + delay = min(self.base_delay * (2 ** attempt), self.max_delay) + jitter = random.uniform(0, 0.1) * delay + total_delay = delay + jitter + + self.logger.warning( + f"Function {func.__name__} failed (attempt {attempt + 1}/{self.max_retries + 1}): {e}. " + f"Retrying in {total_delay:.2f} seconds..." + ) + + time.sleep(total_delay) + + # This should never be reached, but just in case + raise last_exception + +def retry_on_failure(max_retries: int = 3, base_delay: float = 1.0, retryable_exceptions: tuple = (Exception,)): + """Decorator for automatic retry with exponential backoff""" + + def decorator(func: Callable) -> Callable: + @wraps(func) + def wrapper(*args, **kwargs): + retry_handler = RetryHandler(max_retries, base_delay) + return retry_handler.retry_with_backoff( + func, *args, retryable_exceptions=retryable_exceptions, **kwargs + ) + return wrapper + return decorator + +# Enhanced pipeline with error handling +class RobustETLPipeline(TransactionalETLPipeline): + """ETL Pipeline with comprehensive error handling and recovery""" + + def __init__(self, pipeline_name: str): + super().__init__(pipeline_name) + self.error_handler = ErrorHandler(pipeline_name) + self.retry_handler = RetryHandler( + max_retries=config.pipeline.max_retries, + base_delay=1.0, + max_delay=30.0 + ) + + @retry_on_failure(max_retries=3, retryable_exceptions=(DataExtractionError,)) + def extract_data_with_retry(self, branch, extraction_config: Dict[str, Any]) -> pd.DataFrame: + """Extract data with automatic retry on failure""" + try: + return super().extract_data(branch, extraction_config) + except Exception as e: + # Convert to specific exception type for retry logic + raise DataExtractionError(f"Data extraction failed: {str(e)}") from e + + def transform_data_with_recovery(self, branch, df: pd.DataFrame, transformation_pipeline) -> pd.DataFrame: + """Transform data with error recovery""" + + def recovery_action(): + """Recovery action for transformation failures""" + self.logger.info("Attempting transformation recovery with simplified rules") + + # Create a simplified transformation pipeline + from src.transformers.base_transformer import DataCleaningTransformer + + simplified_rules = { + 'remove_duplicates': True, + 'null_handling': {'*': 'drop'}, # Drop any null values + 'type_conversions': {} # Skip type conversions + } + + simplified_transformer = DataCleaningTransformer(simplified_rules) + + try: + return simplified_transformer.execute(df) + except Exception: + return df # Return original data if even simplified transformation fails + + try: + return super().transform_data(branch, df, transformation_pipeline) + + except Exception as e: + context = { + 'step': 'transformation', + 'input_records': len(df), + 'transformation_pipeline': str(transformation_pipeline) + } + + recovery_successful = self.error_handler.handle_error( + error=e, + context=context, + severity=ErrorSeverity.HIGH, + recovery_action=recovery_action + ) + + if recovery_successful: + return 
recovery_action() + else: + raise TransformationError(f"Transformation failed and recovery unsuccessful: {str(e)}") from e + + def validate_data_with_fallback(self, branch, df: pd.DataFrame, dataset_name: str): + """Validate data with fallback to warnings-only mode""" + + def recovery_action(): + """Recovery action for validation failures""" + self.logger.info("Attempting validation recovery with relaxed rules") + + # Create a more lenient validator + lenient_validator = DataValidator() + + # Add only critical validations + lenient_validator.add_completeness_check(['id'], threshold=0.5) # Lower threshold + + try: + return lenient_validator.validate(df, f"{dataset_name}_lenient") + except Exception: + # If even lenient validation fails, create a minimal passing result + from src.models import ValidationResult, ValidationStatus + return ValidationResult( + dataset_name=dataset_name, + validation_time=datetime.now(), + status=ValidationStatus.WARNING, + total_records=len(df), + passed_records=len(df), + failed_records=0, + warnings=["Validation failed, using fallback mode"], + errors=[] + ) + + try: + return super().validate_data(branch, df, dataset_name) + + except DataValidationError as e: + context = { + 'step': 'validation', + 'dataset_name': dataset_name, + 'record_count': len(df) + } + + recovery_successful = self.error_handler.handle_error( + error=e, + context=context, + severity=ErrorSeverity.MEDIUM, + recovery_action=recovery_action + ) + + if recovery_successful: + return recovery_action() + else: + raise e + + def run_pipeline_with_recovery( + self, + extraction_config: Dict[str, Any], + transformation_pipeline, + load_config: Dict[str, Any], + source_branch: str = 'main' + ): + """Run pipeline with comprehensive error handling""" + + try: + with self.pipeline_transaction(source_branch) as working_branch: + + # Extract with retry + extracted_df = self.extract_data_with_retry(working_branch, extraction_config) + + if extracted_df.empty: + self.logger.info("No data to process, pipeline completed") + return self.current_run + + # Transform with recovery + transformed_df = self.transform_data_with_recovery(working_branch, extracted_df, transformation_pipeline) + + # Validate with fallback + validation_result = self.validate_data_with_fallback( + working_branch, + transformed_df, + extraction_config.get('table_name', 'unknown') + ) + + # Load with retry + self.retry_handler.retry_with_backoff( + self.load_data, + working_branch, + transformed_df, + load_config, + retryable_exceptions=(DataLoadingError,) + ) + + # Save extraction checkpoint + self._save_extraction_checkpoint( + working_branch, + extraction_config.get('table_name'), + datetime.now() + ) + + self.logger.info(f"ETL pipeline {self.pipeline_name} completed successfully") + + except Exception as e: + # Final error handling + context = { + 'step': 'pipeline_execution', + 'extraction_config': extraction_config, + 'load_config': load_config + } + + self.error_handler.handle_error( + error=e, + context=context, + severity=ErrorSeverity.CRITICAL + ) + + raise + + finally: + # Always save error log + if self.working_branch and self.current_run: + try: + branch = self.repo.branches.get(self.working_branch) + self.error_handler.save_error_log(branch, self.current_run.run_id) + except Exception as log_error: + self.logger.error(f"Failed to save error log: {log_error}") + + return self.current_run + +# Example usage with error handling +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # Create robust pipeline + 
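+    # RobustETLPipeline layers retries, simplified-transform recovery, and lenient-validation fallbacks on top of TransactionalETLPipeline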
pipeline = RobustETLPipeline("robust_user_processing") + + # Setup components (same as before) + from src.transformers.base_transformer import ( + DataCleaningTransformer, + BusinessLogicTransformer, + TransformationPipeline, + SAMPLE_CLEANING_RULES, + SAMPLE_BUSINESS_RULES + ) + + pipeline.validator.add_schema_validation(SAMPLE_SCHEMA) + pipeline.validator.add_completeness_check(['id', 'name', 'email']) + pipeline.validator.add_uniqueness_check(['id', 'email']) + + transformers = [ + DataCleaningTransformer(SAMPLE_CLEANING_RULES), + BusinessLogicTransformer(SAMPLE_BUSINESS_RULES) + ] + transformation_pipeline = TransformationPipeline(transformers) + + try: + # Run robust pipeline + run_result = pipeline.run_pipeline_with_recovery( + extraction_config=SAMPLE_EXTRACTION_CONFIG, + transformation_pipeline=transformation_pipeline, + load_config=SAMPLE_LOAD_CONFIG + ) + + print(f"Robust pipeline completed: {run_result.status.value}") + print(f"Records processed: {run_result.records_processed}") + print(f"Errors encountered: {len(pipeline.error_handler.error_log)}") + + except Exception as e: + print(f"Pipeline failed despite error handling: {e}") + + finally: + pipeline.cleanup() +``` + +## Step 8: Monitoring and Alerting + +### Pipeline Monitoring System + +```python +# src/monitoring.py +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +import json +from dataclasses import dataclass +from enum import Enum +import smtplib +from email.mime.text import MIMEText +from email.mime.multipart import MIMEMultipart + +class AlertLevel(Enum): + INFO = "info" + WARNING = "warning" + ERROR = "error" + CRITICAL = "critical" + +@dataclass +class PipelineMetrics: + """Pipeline execution metrics""" + pipeline_name: str + run_id: str + start_time: datetime + end_time: Optional[datetime] + duration_seconds: Optional[float] + records_processed: int + records_failed: int + success_rate: float + error_count: int + warning_count: int + + def to_dict(self) -> Dict[str, Any]: + return { + 'pipeline_name': self.pipeline_name, + 'run_id': self.run_id, + 'start_time': self.start_time.isoformat(), + 'end_time': self.end_time.isoformat() if self.end_time else None, + 'duration_seconds': self.duration_seconds, + 'records_processed': self.records_processed, + 'records_failed': self.records_failed, + 'success_rate': self.success_rate, + 'error_count': self.error_count, + 'warning_count': self.warning_count + } + +class PipelineMonitor: + """Monitor pipeline execution and send alerts""" + + def __init__(self, pipeline_name: str): + self.pipeline_name = pipeline_name + self.logger = logging.getLogger(f"{__name__}.{pipeline_name}") + self.metrics_history: List[PipelineMetrics] = [] + + def record_metrics(self, pipeline_run) -> PipelineMetrics: + """Record metrics from pipeline run""" + + duration = None + if pipeline_run.end_time and pipeline_run.start_time: + duration = (pipeline_run.end_time - pipeline_run.start_time).total_seconds() + + success_rate = 0.0 + if pipeline_run.records_processed > 0: + success_rate = (pipeline_run.records_processed - pipeline_run.records_failed) / pipeline_run.records_processed + + metrics = PipelineMetrics( + pipeline_name=self.pipeline_name, + run_id=pipeline_run.run_id, + start_time=pipeline_run.start_time, + end_time=pipeline_run.end_time, + duration_seconds=duration, + records_processed=pipeline_run.records_processed, + records_failed=pipeline_run.records_failed, + success_rate=success_rate, + error_count=1 if pipeline_run.status.value == 
'failed' else 0, + warning_count=0 # Would be populated from validation results + ) + + self.metrics_history.append(metrics) + return metrics + + def check_sla_violations(self, metrics: PipelineMetrics) -> List[Dict[str, Any]]: + """Check for SLA violations""" + violations = [] + + # Duration SLA (example: pipeline should complete within 30 minutes) + max_duration = 30 * 60 # 30 minutes in seconds + if metrics.duration_seconds and metrics.duration_seconds > max_duration: + violations.append({ + 'type': 'duration_sla', + 'message': f"Pipeline duration {metrics.duration_seconds:.0f}s exceeds SLA of {max_duration}s", + 'severity': AlertLevel.WARNING + }) + + # Success rate SLA (example: should process 95% of records successfully) + min_success_rate = 0.95 + if metrics.success_rate < min_success_rate: + violations.append({ + 'type': 'success_rate_sla', + 'message': f"Success rate {metrics.success_rate:.2%} below SLA of {min_success_rate:.2%}", + 'severity': AlertLevel.ERROR + }) + + # Error count SLA (example: no errors allowed) + if metrics.error_count > 0: + violations.append({ + 'type': 'error_count_sla', + 'message': f"Pipeline failed with {metrics.error_count} errors", + 'severity': AlertLevel.CRITICAL + }) + + return violations + + def generate_report(self, time_window_hours: int = 24) -> Dict[str, Any]: + """Generate pipeline performance report""" + + cutoff_time = datetime.now() - timedelta(hours=time_window_hours) + recent_metrics = [m for m in self.metrics_history if m.start_time >= cutoff_time] + + if not recent_metrics: + return { + 'pipeline_name': self.pipeline_name, + 'time_window_hours': time_window_hours, + 'total_runs': 0, + 'message': 'No pipeline runs in the specified time window' + } + + # Calculate aggregated metrics + total_runs = len(recent_metrics) + successful_runs = len([m for m in recent_metrics if m.error_count == 0]) + total_records_processed = sum(m.records_processed for m in recent_metrics) + total_records_failed = sum(m.records_failed for m in recent_metrics) + + avg_duration = None + completed_runs = [m for m in recent_metrics if m.duration_seconds is not None] + if completed_runs: + avg_duration = sum(m.duration_seconds for m in completed_runs) / len(completed_runs) + + overall_success_rate = successful_runs / total_runs if total_runs > 0 else 0 + + return { + 'pipeline_name': self.pipeline_name, + 'time_window_hours': time_window_hours, + 'report_generated': datetime.now().isoformat(), + 'total_runs': total_runs, + 'successful_runs': successful_runs, + 'failed_runs': total_runs - successful_runs, + 'overall_success_rate': overall_success_rate, + 'total_records_processed': total_records_processed, + 'total_records_failed': total_records_failed, + 'average_duration_seconds': avg_duration, + 'recent_runs': [m.to_dict() for m in recent_metrics[-5:]] # Last 5 runs + } + + def save_metrics(self, branch, metrics: PipelineMetrics): + """Save metrics to lakeFS""" + metrics_path = f"monitoring/metrics/{metrics.run_id}_metrics.json" + metrics_json = json.dumps(metrics.to_dict(), indent=2) + + branch.objects.upload( + path=metrics_path, + data=metrics_json.encode(), + content_type='application/json' + ) + + self.logger.info(f"Metrics saved to {metrics_path}") + +class AlertManager: + """Manage pipeline alerts and notifications""" + + def __init__(self, smtp_config: Optional[Dict[str, Any]] = None): + self.smtp_config = smtp_config or {} + self.logger = logging.getLogger(__name__) + + def send_alert(self, alert_level: AlertLevel, subject: str, message: str, recipients: 
List[str]): + """Send alert notification""" + + # Log alert + log_method = { + AlertLevel.INFO: self.logger.info, + AlertLevel.WARNING: self.logger.warning, + AlertLevel.ERROR: self.logger.error, + AlertLevel.CRITICAL: self.logger.critical + }.get(alert_level, self.logger.info) + + log_method(f"ALERT [{alert_level.value.upper()}]: {subject} - {message}") + + # Send email if configured + if self.smtp_config and recipients: + try: + self._send_email_alert(alert_level, subject, message, recipients) + except Exception as e: + self.logger.error(f"Failed to send email alert: {e}") + + def _send_email_alert(self, alert_level: AlertLevel, subject: str, message: str, recipients: List[str]): + """Send email alert""" + + smtp_server = self.smtp_config.get('server', 'localhost') + smtp_port = self.smtp_config.get('port', 587) + username = self.smtp_config.get('username') + password = self.smtp_config.get('password') + sender = self.smtp_config.get('sender', 'etl-pipeline@company.com') + + # Create message + msg = MIMEMultipart() + msg['From'] = sender + msg['To'] = ', '.join(recipients) + msg['Subject'] = f"[{alert_level.value.upper()}] {subject}" + + # Add body + body = f""" + Alert Level: {alert_level.value.upper()} + Subject: {subject} + + Message: + {message} + + Generated at: {datetime.now().isoformat()} + """ + + msg.attach(MIMEText(body, 'plain')) + + # Send email + with smtplib.SMTP(smtp_server, smtp_port) as server: + if username and password: + server.starttls() + server.login(username, password) + + server.send_message(msg) + + self.logger.info(f"Email alert sent to {len(recipients)} recipients") + +# Enhanced pipeline with monitoring +class MonitoredETLPipeline(RobustETLPipeline): + """ETL Pipeline with comprehensive monitoring and alerting""" + + def __init__(self, pipeline_name: str, alert_recipients: List[str] = None): + super().__init__(pipeline_name) + self.monitor = PipelineMonitor(pipeline_name) + self.alert_manager = AlertManager() + self.alert_recipients = alert_recipients or [] + + def run_monitored_pipeline( + self, + extraction_config: Dict[str, Any], + transformation_pipeline, + load_config: Dict[str, Any], + source_branch: str = 'main' + ): + """Run pipeline with monitoring and alerting""" + + try: + # Run pipeline + run_result = self.run_pipeline_with_recovery( + extraction_config, transformation_pipeline, load_config, source_branch + ) + + # Record metrics + metrics = self.monitor.record_metrics(run_result) + + # Save metrics to lakeFS + if self.working_branch: + branch = self.repo.branches.get(self.working_branch) + self.monitor.save_metrics(branch, metrics) + + # Check for SLA violations + violations = self.monitor.check_sla_violations(metrics) + + # Send alerts for violations + for violation in violations: + self.alert_manager.send_alert( + alert_level=violation['severity'], + subject=f"Pipeline SLA Violation: {self.pipeline_name}", + message=violation['message'], + recipients=self.alert_recipients + ) + + # Send success notification for critical pipelines + if run_result.status.value == 'success' and not violations: + self.alert_manager.send_alert( + alert_level=AlertLevel.INFO, + subject=f"Pipeline Success: {self.pipeline_name}", + message=f"Pipeline completed successfully. 
Processed {metrics.records_processed} records in {metrics.duration_seconds:.0f} seconds.", + recipients=self.alert_recipients + ) + + return run_result + + except Exception as e: + # Send critical failure alert + self.alert_manager.send_alert( + alert_level=AlertLevel.CRITICAL, + subject=f"Pipeline Failure: {self.pipeline_name}", + message=f"Pipeline failed with error: {str(e)}", + recipients=self.alert_recipients + ) + raise + + def generate_daily_report(self) -> Dict[str, Any]: + """Generate and send daily pipeline report""" + + report = self.monitor.generate_report(time_window_hours=24) + + # Send report via email + if self.alert_recipients: + report_message = f""" + Daily Pipeline Report: {self.pipeline_name} + + Summary: + - Total Runs: {report['total_runs']} + - Successful Runs: {report['successful_runs']} + - Failed Runs: {report['failed_runs']} + - Overall Success Rate: {report['overall_success_rate']:.2%} + - Total Records Processed: {report['total_records_processed']:,} + - Total Records Failed: {report['total_records_failed']:,} + - Average Duration: {report['average_duration_seconds']:.0f} seconds + + For detailed metrics, check the pipeline monitoring dashboard. + """ + + self.alert_manager.send_alert( + alert_level=AlertLevel.INFO, + subject=f"Daily Report: {self.pipeline_name}", + message=report_message, + recipients=self.alert_recipients + ) + + return report + +# Example usage with monitoring +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # Create monitored pipeline + pipeline = MonitoredETLPipeline( + "monitored_user_processing", + alert_recipients=["data-team@company.com", "ops-team@company.com"] + ) + + # Setup components + from src.transformers.base_transformer import ( + DataCleaningTransformer, + BusinessLogicTransformer, + TransformationPipeline, + SAMPLE_CLEANING_RULES, + SAMPLE_BUSINESS_RULES + ) + + pipeline.validator.add_schema_validation(SAMPLE_SCHEMA) + pipeline.validator.add_completeness_check(['id', 'name', 'email']) + pipeline.validator.add_uniqueness_check(['id', 'email']) + + transformers = [ + DataCleaningTransformer(SAMPLE_CLEANING_RULES), + BusinessLogicTransformer(SAMPLE_BUSINESS_RULES) + ] + transformation_pipeline = TransformationPipeline(transformers) + + try: + # Run monitored pipeline + run_result = pipeline.run_monitored_pipeline( + extraction_config=SAMPLE_EXTRACTION_CONFIG, + transformation_pipeline=transformation_pipeline, + load_config=SAMPLE_LOAD_CONFIG + ) + + print(f"Monitored pipeline completed: {run_result.status.value}") + + # Generate daily report + daily_report = pipeline.generate_daily_report() + print("Daily report generated and sent") + + except Exception as e: + print(f"Monitored pipeline failed: {e}") + + finally: + pipeline.cleanup() +``` + +## Step 9: Production Deployment and CI/CD + +### Production Deployment Configuration + +```dockerfile +# deployment/docker/Dockerfile +FROM python:3.9-slim + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + g++ \ + libpq-dev \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements and install Python dependencies +COPY requirements.txt . 
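+# Installing requirements before copying the source keeps this layer cached when only application code changes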
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY src/ ./src/ +COPY config/ ./config/ +COPY deployment/scripts/ ./scripts/ + +# Create non-root user +RUN useradd -m -u 1000 etluser && chown -R etluser:etluser /app +USER etluser + +# Set environment variables +ENV PYTHONPATH=/app +ENV PYTHONUNBUFFERED=1 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python scripts/health_check.py + +# Default command +CMD ["python", "scripts/run_pipeline.py"] +``` + +```yaml +# deployment/kubernetes/etl-pipeline.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: etl-pipeline + labels: + app: etl-pipeline +spec: + replicas: 1 + selector: + matchLabels: + app: etl-pipeline + template: + metadata: + labels: + app: etl-pipeline + spec: + containers: + - name: etl-pipeline + image: your-registry/etl-pipeline:latest + env: + - name: LAKEFS_HOST + valueFrom: + secretKeyRef: + name: lakefs-credentials + key: host + - name: LAKEFS_ACCESS_KEY + valueFrom: + secretKeyRef: + name: lakefs-credentials + key: access-key + - name: LAKEFS_SECRET_KEY + valueFrom: + secretKeyRef: + name: lakefs-credentials + key: secret-key + - name: DB_HOST + valueFrom: + secretKeyRef: + name: database-credentials + key: host + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: database-credentials + key: password + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + volumeMounts: + - name: config-volume + mountPath: /app/config + volumes: + - name: config-volume + configMap: + name: etl-pipeline-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: etl-pipeline-config +data: + pipeline.yaml: | + batch_size: 1000 + max_retries: 3 + timeout_seconds: 300 + validation_enabled: true + monitoring_enabled: true +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: etl-pipeline-schedule +spec: + schedule: "0 2 * * *" # Run daily at 2 AM + jobTemplate: + spec: + template: + spec: + containers: + - name: etl-pipeline + image: your-registry/etl-pipeline:latest + command: ["python", "scripts/run_scheduled_pipeline.py"] + env: + - name: PIPELINE_NAME + value: "daily_user_processing" + restartPolicy: OnFailure +``` + +### CI/CD Pipeline Configuration + +```yaml +# .github/workflows/etl-pipeline.yml +name: ETL Pipeline CI/CD + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main ] + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }}/etl-pipeline + +jobs: + test: + runs-on: ubuntu-latest + + services: + postgres: + image: postgres:13 + env: + POSTGRES_PASSWORD: postgres + POSTGRES_DB: test_db + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + lakefs: + image: treeverse/lakefs:latest + env: + LAKEFS_AUTH_ENCRYPT_SECRET_KEY: some-secret + LAKEFS_DATABASE_TYPE: local + LAKEFS_BLOCKSTORE_TYPE: local + ports: + - 8000:8000 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-test.txt + + - name: Wait for services + run: | + sleep 30 # Wait for services to be ready + + - name: Run unit tests + run: | + python -m pytest tests/unit/ -v --cov=src --cov-report=xml + env: + LAKEFS_HOST: http://localhost:8000 + LAKEFS_ACCESS_KEY: AKIAIOSFODNN7EXAMPLE + 
LAKEFS_SECRET_KEY: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + DB_HOST: localhost + DB_PASSWORD: postgres + + - name: Run integration tests + run: | + python -m pytest tests/integration/ -v + env: + LAKEFS_HOST: http://localhost:8000 + LAKEFS_ACCESS_KEY: AKIAIOSFODNN7EXAMPLE + LAKEFS_SECRET_KEY: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + DB_HOST: localhost + DB_PASSWORD: postgres + + - name: Upload coverage reports + uses: codecov/codecov-action@v3 + with: + file: ./coverage.xml + + build: + needs: test + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Log in to Container Registry + uses: docker/login-action@v2 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=sha + + - name: Build and push Docker image + uses: docker/build-push-action@v4 + with: + context: . + file: deployment/docker/Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + deploy-staging: + needs: build + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/develop' + + steps: + - uses: actions/checkout@v3 + + - name: Deploy to staging + run: | + echo "Deploying to staging environment" + # Add your staging deployment commands here + # kubectl apply -f deployment/kubernetes/staging/ + + deploy-production: + needs: build + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' + environment: production + + steps: + - uses: actions/checkout@v3 + + - name: Deploy to production + run: | + echo "Deploying to production environment" + # Add your production deployment commands here + # kubectl apply -f deployment/kubernetes/production/ +``` + +### Production Scripts + +```python +# scripts/run_pipeline.py +#!/usr/bin/env python3 +""" +Production pipeline runner script +""" + +import os +import sys +import logging +import argparse +from datetime import datetime + +# Add src to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + +from src.pipeline import MonitoredETLPipeline +from src.transformers.base_transformer import ( + DataCleaningTransformer, + BusinessLogicTransformer, + TransformationPipeline +) +from config.settings import config + +def setup_logging(): + """Setup production logging""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(sys.stdout), + logging.FileHandler('/app/logs/pipeline.log') + ] + ) + +def load_pipeline_config(config_file: str) -> dict: + """Load pipeline configuration from file""" + import yaml + + with open(config_file, 'r') as f: + return yaml.safe_load(f) + +def create_transformation_pipeline(transform_config: dict) -> TransformationPipeline: + """Create transformation pipeline from configuration""" + + transformers = [] + + # Data cleaning transformer + if 'cleaning_rules' in transform_config: + transformers.append(DataCleaningTransformer(transform_config['cleaning_rules'])) + + # Business logic transformer + if 'business_rules' in transform_config: + transformers.append(BusinessLogicTransformer(transform_config['business_rules'])) + + return TransformationPipeline(transformers) + +def main(): + parser = argparse.ArgumentParser(description='Run ETL Pipeline') + parser.add_argument('--pipeline-name', required=True, help='Name of the pipeline 
to run') + parser.add_argument('--config-file', required=True, help='Pipeline configuration file') + parser.add_argument('--source-branch', default='main', help='Source branch for pipeline') + parser.add_argument('--merge-to-main', action='store_true', help='Merge results to main branch') + + args = parser.parse_args() + + setup_logging() + logger = logging.getLogger(__name__) + + try: + # Load configuration + pipeline_config = load_pipeline_config(args.config_file) + + # Create pipeline + pipeline = MonitoredETLPipeline( + pipeline_name=args.pipeline_name, + alert_recipients=pipeline_config.get('alert_recipients', []) + ) + + # Setup validation rules + validation_config = pipeline_config.get('validation', {}) + if 'schema' in validation_config: + pipeline.validator.add_schema_validation(validation_config['schema']) + if 'completeness_check' in validation_config: + pipeline.validator.add_completeness_check(**validation_config['completeness_check']) + if 'uniqueness_check' in validation_config: + pipeline.validator.add_uniqueness_check(validation_config['uniqueness_check']) + + # Create transformation pipeline + transformation_pipeline = create_transformation_pipeline( + pipeline_config.get('transformations', {}) + ) + + # Run pipeline + logger.info(f"Starting pipeline: {args.pipeline_name}") + + run_result = pipeline.run_monitored_pipeline( + extraction_config=pipeline_config['extraction'], + transformation_pipeline=transformation_pipeline, + load_config=pipeline_config['load'], + source_branch=args.source_branch + ) + + logger.info(f"Pipeline completed: {run_result.status.value}") + + # Merge to main if requested and successful + if args.merge_to_main and run_result.status.value == 'success': + merge_commit = pipeline.merge_to_target('main') + logger.info(f"Merged to main: {merge_commit}") + + # Exit with appropriate code + sys.exit(0 if run_result.status.value == 'success' else 1) + + except Exception as e: + logger.error(f"Pipeline failed: {e}") + sys.exit(1) + + finally: + if 'pipeline' in locals(): + pipeline.cleanup() + +if __name__ == '__main__': + main() +``` + +```python +# scripts/health_check.py +#!/usr/bin/env python3 +""" +Health check script for ETL pipeline +""" + +import sys +import os +import logging + +# Add src to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + +from config.settings import config +import lakefs + +def check_lakefs_connection(): + """Check lakeFS connectivity""" + try: + client = lakefs.Client( + host=config.lakefs.host, + username=config.lakefs.username, + password=config.lakefs.password + ) + + # Try to list repositories + repos = client.repositories.list() + return True, f"lakeFS connection OK, found {len(repos)} repositories" + + except Exception as e: + return False, f"lakeFS connection failed: {str(e)}" + +def check_database_connection(): + """Check database connectivity""" + try: + from src.extractors.database_extractor import DatabaseExtractor + + extractor = DatabaseExtractor() + + # Try a simple query + with extractor.engine.connect() as conn: + result = conn.execute("SELECT 1") + result.fetchone() + + extractor.close() + return True, "Database connection OK" + + except Exception as e: + return False, f"Database connection failed: {str(e)}" + +def main(): + """Run health checks""" + + logging.basicConfig(level=logging.WARNING) # Reduce noise + + checks = [ + ("lakeFS", check_lakefs_connection), + ("Database", check_database_connection) + ] + + all_healthy = True + + for check_name, check_func in checks: + try: + 
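+            # Each check returns a (healthy, message) tuple; unexpected exceptions are caught below so one failing check does not stop the others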
healthy, message = check_func() + status = "PASS" if healthy else "FAIL" + print(f"{check_name}: {status} - {message}") + + if not healthy: + all_healthy = False + + except Exception as e: + print(f"{check_name}: FAIL - Unexpected error: {str(e)}") + all_healthy = False + + if all_healthy: + print("Overall health: HEALTHY") + sys.exit(0) + else: + print("Overall health: UNHEALTHY") + sys.exit(1) + +if __name__ == '__main__': + main() +``` + +## Best Practices and Production Considerations + +### ETL Pipeline Best Practices + +1. **Idempotency** + - Ensure pipelines can be run multiple times safely + - Use upsert operations instead of inserts where possible + - Implement proper checkpointing + +2. **Data Quality** + - Validate data at every stage + - Implement comprehensive error handling + - Monitor data quality metrics + +3. **Performance Optimization** + - Process data in batches + - Use appropriate data formats (Parquet for analytics) + - Implement connection pooling + +4. **Security** + - Use environment variables for credentials + - Implement proper access controls + - Encrypt sensitive data + +5. **Monitoring and Alerting** + - Monitor pipeline execution metrics + - Set up alerts for failures and SLA violations + - Generate regular reports + +### Troubleshooting Common Issues + +1. **Memory Issues** +```python +# Process data in smaller chunks +def process_large_dataset(df, chunk_size=1000): + for i in range(0, len(df), chunk_size): + chunk = df.iloc[i:i+chunk_size] + yield process_chunk(chunk) +``` + +2. **Connection Timeouts** +```python +# Implement retry logic with exponential backoff +@retry_on_failure(max_retries=3, base_delay=2.0) +def robust_database_operation(): + # Database operation here + pass +``` + +3. **Data Validation Failures** +```python +# Implement graceful degradation +try: + validate_data_strict(df) +except ValidationError: + logger.warning("Strict validation failed, using lenient validation") + validate_data_lenient(df) +``` + +## Next Steps + +### Advanced Topics to Explore + +1. **[ML Experiment Tracking](ml-experiment-tracking.md)** - Version models and experiments +2. **[Data Science Workflow](data-science-workflow.md)** - Interactive analysis patterns +3. **[Advanced Features](../high-level-sdk/advanced.md)** - Performance optimization +4. **[Best Practices](../reference/best-practices.md)** - Production deployment + +### Integration Opportunities + +1. **Apache Airflow** - Schedule and orchestrate ETL pipelines +2. **dbt** - Transform data with SQL-based transformations +3. **Great Expectations** - Advanced data validation and profiling +4. 
**Prometheus/Grafana** - Advanced monitoring and visualization + +## See Also + +**Prerequisites and Setup:** +- **[Python SDK Overview](../index.md)** - Compare all Python SDK options +- **[Getting Started Guide](../getting-started.md)** - Installation and authentication setup +- **[High-Level SDK Transactions](../high-level-sdk/transactions.md)** - Transaction patterns + +**Related Tutorials:** +- **[Data Science Workflow](data-science-workflow.md)** - Interactive analysis patterns +- **[ML Experiment Tracking](ml-experiment-tracking.md)** - Model versioning workflows + +**Advanced Features:** +- **[Error Handling](../reference/troubleshooting.md)** - Comprehensive error handling +- **[Best Practices](../reference/best-practices.md)** - Production deployment guidance +- **[API Comparison](../reference/api-comparison.md)** - SDK feature comparison + +**External Resources:** +- **[Apache Airflow Documentation](https://airflow.apache.org/docs/){:target="_blank"}** - Workflow orchestration +- **[Great Expectations](https://greatexpectations.io/){:target="_blank"}** - Data validation framework +- **[Docker Best Practices](https://docs.docker.com/develop/dev-best-practices/){:target="_blank"}** - Container deployment \ No newline at end of file diff --git a/docs/src/integrations/python/tutorials/index.md b/docs/src/integrations/python/tutorials/index.md new file mode 100644 index 00000000000..66df9ca9ab9 --- /dev/null +++ b/docs/src/integrations/python/tutorials/index.md @@ -0,0 +1,70 @@ +--- +title: Python Tutorials +description: Real-world examples and tutorials for Python with lakeFS +sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +difficulty: "beginner" +use_cases: ["tutorials", "learning", "examples", "workflows"] +topics: ["tutorials", "examples", "workflows", "real-world"] +audience: ["data-engineers", "data-scientists", "developers"] +last_updated: "2024-01-15" +--- + +# Python Tutorials + +Comprehensive tutorials and real-world examples for using Python with lakeFS. + +## Available Tutorials + +- **[Data Science Workflow](data-science-workflow.md)** - End-to-end data science pipeline +- **[ETL Pipeline](etl-pipeline.md)** - Extract, Transform, Load operations +- **[ML Experiment Tracking](ml-experiment-tracking.md)** - Machine learning workflows + +## Tutorial Structure + +Each tutorial includes: +- **Overview** - What you'll learn and build +- **Prerequisites** - Required knowledge and setup +- **Step-by-step Guide** - Detailed implementation +- **Complete Code** - Full working examples +- **Next Steps** - Further exploration + +## Getting Started + +Choose a tutorial based on your use case: +- New to lakeFS? Start with [Data Science Workflow](data-science-workflow.md) +- Building data pipelines? Try [ETL Pipeline](etl-pipeline.md) +- Working with ML? 
Explore [ML Experiment Tracking](ml-experiment-tracking.md) + +## See Also + +**Prerequisites and Setup:** +- **[Python SDK Overview](../index.md)** - Compare all Python SDK options +- **[Getting Started Guide](../getting-started.md)** - Installation and authentication setup +- **[SDK Selection Guide](../index.md#sdk-selection-decision-matrix)** - Choose the right SDK for your tutorial + +**SDK-Specific Documentation:** +- **[High-Level SDK Quickstart](../high-level-sdk/quickstart.md)** - Basic operations for tutorial prerequisites +- **[lakefs-spec Integration](../lakefs-spec/integrations.md)** - Data science library integrations +- **[Boto3 S3 Operations](../boto3/s3-operations.md)** - S3-compatible patterns for migration scenarios + +**Tutorial-Related Features:** +- **[Transaction Patterns](../high-level-sdk/transactions.md)** - Atomic operations used in tutorials +- **[Object I/O Operations](../high-level-sdk/objects-and-io.md)** - File handling patterns +- **[Import/Export Operations](../high-level-sdk/imports-and-exports.md)** - Bulk data operations +- **[Filesystem Operations](../lakefs-spec/filesystem-api.md)** - File-like operations for data science + +**Learning Path:** +- **[Quickstart Guide](../high-level-sdk/quickstart.md)** - Learn basic operations first +- **[API Comparison](../reference/api-comparison.md)** - Understand SDK differences +- **[Best Practices](../reference/best-practices.md)** - Production deployment guidance +- **[Troubleshooting](../reference/troubleshooting.md)** - Common issues and solutions + +**Real-World Applications:** +- **[Data Science Workflow](data-science-workflow.md)** - pandas, Jupyter, and analysis workflows +- **[ETL Pipeline Tutorial](etl-pipeline.md)** - Production data pipeline patterns +- **[ML Experiment Tracking](ml-experiment-tracking.md)** - Model versioning and experiment management + +**External Resources:** +- **[lakeFS Use Cases](https://docs.lakefs.io/use_cases/){:target="_blank"}** - Real-world lakeFS applications +- **[lakeFS Concepts](https://docs.lakefs.io/understand/){:target="_blank"}** - Core lakeFS concepts for tutorials +- **[Jupyter Integration Examples](https://github.com/treeverse/lakeFS-samples){:target="_blank"}** - Community examples and notebooks \ No newline at end of file diff --git a/docs/src/integrations/python/tutorials/ml-experiment-tracking.md b/docs/src/integrations/python/tutorials/ml-experiment-tracking.md new file mode 100644 index 00000000000..9746591a3ad --- /dev/null +++ b/docs/src/integrations/python/tutorials/ml-experiment-tracking.md @@ -0,0 +1,3018 @@ +--- +title: ML Experiment Tracking Tutorial +description: Build machine learning workflows with model versioning and experiment tracking using lakeFS +sdk_types: ["high-level", "lakefs-spec"] +difficulty: "advanced" +use_cases: ["ml", "experiment-tracking", "model-versioning", "data-science"] +topics: ["machine-learning", "experiments", "models", "versioning", "mlops"] +audience: ["data-scientists", "ml-engineers", "researchers"] +last_updated: "2024-01-15" +--- + +# ML Experiment Tracking Tutorial + +Learn how to build comprehensive machine learning workflows with experiment tracking, model versioning, and dataset management using lakeFS. This tutorial demonstrates MLOps best practices for reproducible machine learning experiments, model lifecycle management, and collaborative ML development. 
+ +## What You'll Build + +By the end of this tutorial, you'll have: + +- **Experiment Management System** - Track ML experiments with full reproducibility +- **Model Versioning Pipeline** - Version models with metadata and lineage +- **Dataset Management** - Version datasets and track data lineage +- **Model Registry** - Centralized model storage and deployment tracking +- **A/B Testing Framework** - Compare model performance across experiments +- **MLOps Pipeline** - Production-ready ML deployment workflow + +## Prerequisites + +### Knowledge Requirements +- Intermediate Python and machine learning concepts +- Familiarity with scikit-learn, pandas, and numpy +- Understanding of ML model lifecycle +- Basic knowledge of model evaluation metrics + +### Environment Setup +- Python 3.8+ installed +- Jupyter notebook environment +- lakeFS server running (local or cloud) +- Required ML libraries (we'll install these) + +### lakeFS Setup +```bash +# Start lakeFS locally (if not already running) +docker run --name lakefs --rm -p 8000:8000 treeverse/lakefs:latest run --local-settings +``` + +## Step 1: Environment Setup and ML Dependencies + +### Install Required Packages + +```bash +# Install lakeFS and ML libraries +pip install lakefs lakefs-spec pandas numpy +pip install scikit-learn xgboost lightgbm +pip install matplotlib seaborn plotly +pip install mlflow optuna hyperopt +pip install joblib pickle-mixin +pip install jupyter ipywidgets +``` + +### Project Structure Setup + +```bash +# Create ML project structure +mkdir lakefs-ml-experiments +cd lakefs-ml-experiments + +# Create directory structure +mkdir -p {data,models,experiments,notebooks,scripts,configs,reports} + +# Create configuration files +touch configs/{model_config.yaml,experiment_config.yaml} +``` + +### ML Experiment Configuration + +```python +# configs/ml_config.py +import os +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Any +from datetime import datetime +import yaml + +@dataclass +class LakeFSConfig: + host: str + username: str + password: str + repository: str + +@dataclass +class ExperimentConfig: + experiment_name: str + description: str + tags: List[str] + parameters: Dict[str, Any] + metrics_to_track: List[str] + artifacts_to_save: List[str] + +@dataclass +class ModelConfig: + model_type: str + hyperparameters: Dict[str, Any] + feature_columns: List[str] + target_column: str + validation_split: float + random_state: int + +class MLConfigManager: + """Configuration management for ML experiments""" + + def __init__(self, config_path: str = "configs"): + self.config_path = config_path + self._load_configs() + + def _load_configs(self): + """Load all configuration files""" + # lakeFS configuration + self.lakefs = LakeFSConfig( + host=os.getenv('LAKEFS_HOST', 'http://localhost:8000'), + username=os.getenv('LAKEFS_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE'), + password=os.getenv('LAKEFS_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'), + repository=os.getenv('LAKEFS_REPOSITORY', 'ml-experiments') + ) + + def create_experiment_config( + self, + experiment_name: str, + description: str, + model_type: str, + hyperparameters: Dict[str, Any], + tags: List[str] = None + ) -> ExperimentConfig: + """Create experiment configuration""" + + return ExperimentConfig( + experiment_name=experiment_name, + description=description, + tags=tags or [], + parameters=hyperparameters, + metrics_to_track=['accuracy', 'precision', 'recall', 'f1_score', 'auc_roc'], + artifacts_to_save=['model', 
'feature_importance', 'confusion_matrix', 'roc_curve'] + ) + + def create_model_config( + self, + model_type: str, + hyperparameters: Dict[str, Any], + feature_columns: List[str], + target_column: str + ) -> ModelConfig: + """Create model configuration""" + + return ModelConfig( + model_type=model_type, + hyperparameters=hyperparameters, + feature_columns=feature_columns, + target_column=target_column, + validation_split=0.2, + random_state=42 + ) + +# Global configuration instance +ml_config = MLConfigManager() +``` + +## Step 2: ML Repository Setup and Data Management + +### Create ML Repository + +```python +# scripts/setup_ml_repository.py +import lakefs +from lakefs.exceptions import RepositoryExistsError +import pandas as pd +import numpy as np +from sklearn.datasets import make_classification, make_regression +from datetime import datetime, timedelta +import json +import logging + +logger = logging.getLogger(__name__) + +def setup_ml_repository(): + """Initialize lakeFS repository for ML experiments""" + + client = lakefs.Client( + host='http://localhost:8000', + username='AKIAIOSFODNN7EXAMPLE', + password='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + ) + + # Create repository + try: + repo = client.repositories.create( + name='ml-experiments', + storage_namespace='s3://my-bucket/ml-experiments/', + default_branch='main' + ) + logger.info(f"Created repository: {repo.id}") + except RepositoryExistsError: + repo = client.repositories.get('ml-experiments') + logger.info(f"Using existing repository: {repo.id}") + + # Create standard branches for ML workflow + branches_to_create = [ + ('development', 'main'), + ('staging', 'main'), + ('production', 'main'), + ('experiments', 'main') + ] + + for branch_name, source in branches_to_create: + try: + branch = repo.branches.create(branch_name, source_reference=source) + logger.info(f"Created branch: {branch_name}") + except Exception as e: + logger.info(f"Branch {branch_name} already exists or error: {e}") + + return repo + +def create_ml_directory_structure(repo): + """Create ML-specific directory structure in lakeFS""" + + main_branch = repo.branches.get('main') + + # Create directory structure with placeholder files + directories = [ + 'datasets/raw/', + 'datasets/processed/', + 'datasets/features/', + 'models/trained/', + 'models/registered/', + 'models/deployed/', + 'experiments/runs/', + 'experiments/results/', + 'experiments/artifacts/', + 'notebooks/', + 'reports/', + 'configs/', + 'logs/' + ] + + for directory in directories: + try: + main_branch.objects.upload( + path=f"{directory}.gitkeep", + data=b"# ML directory placeholder", + content_type='text/plain' + ) + except Exception as e: + logger.warning(f"Could not create {directory}: {e}") + + # Commit directory structure + commit = main_branch.commits.create( + message="Initialize ML experiment directory structure", + metadata={'setup': 'ml_directory_structure', 'type': 'ml_setup'} + ) + + logger.info(f"ML directory structure created: {commit.id}") + return commit + +def generate_sample_datasets(repo): + """Generate sample datasets for ML experiments""" + + main_branch = repo.branches.get('main') + + # Generate classification dataset + X_class, y_class = make_classification( + n_samples=10000, + n_features=20, + n_informative=15, + n_redundant=5, + n_classes=2, + random_state=42 + ) + + # Create feature names + feature_names = [f'feature_{i}' for i in range(X_class.shape[1])] + + # Create classification DataFrame + classification_df = pd.DataFrame(X_class, columns=feature_names) + 
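+    # Attach the label plus a few business-style columns below so the synthetic data resembles a real churn table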
classification_df['target'] = y_class + classification_df['customer_id'] = range(len(classification_df)) + classification_df['timestamp'] = pd.date_range('2023-01-01', periods=len(classification_df), freq='H') + + # Add some realistic business context + classification_df['age'] = np.random.randint(18, 80, len(classification_df)) + classification_df['income'] = np.random.exponential(50000, len(classification_df)) + classification_df['region'] = np.random.choice(['North', 'South', 'East', 'West'], len(classification_df)) + + # Save classification dataset + classification_csv = classification_df.to_csv(index=False) + main_branch.objects.upload( + path='datasets/raw/customer_churn_dataset.csv', + data=classification_csv.encode(), + content_type='text/csv' + ) + + # Generate regression dataset + X_reg, y_reg = make_regression( + n_samples=8000, + n_features=15, + n_informative=10, + noise=0.1, + random_state=42 + ) + + # Create regression DataFrame + regression_feature_names = [f'feature_{i}' for i in range(X_reg.shape[1])] + regression_df = pd.DataFrame(X_reg, columns=regression_feature_names) + regression_df['target'] = y_reg + regression_df['property_id'] = range(len(regression_df)) + regression_df['timestamp'] = pd.date_range('2023-01-01', periods=len(regression_df), freq='2H') + + # Add realistic property features + regression_df['square_feet'] = np.random.randint(500, 5000, len(regression_df)) + regression_df['bedrooms'] = np.random.randint(1, 6, len(regression_df)) + regression_df['bathrooms'] = np.random.randint(1, 4, len(regression_df)) + regression_df['location_score'] = np.random.uniform(1, 10, len(regression_df)) + + # Save regression dataset + regression_csv = regression_df.to_csv(index=False) + main_branch.objects.upload( + path='datasets/raw/house_price_dataset.csv', + data=regression_csv.encode(), + content_type='text/csv' + ) + + # Create dataset metadata + datasets_metadata = { + 'customer_churn_dataset': { + 'type': 'classification', + 'target_column': 'target', + 'n_samples': len(classification_df), + 'n_features': len(feature_names), + 'classes': [0, 1], + 'description': 'Customer churn prediction dataset', + 'created_at': datetime.now().isoformat() + }, + 'house_price_dataset': { + 'type': 'regression', + 'target_column': 'target', + 'n_samples': len(regression_df), + 'n_features': len(regression_feature_names), + 'description': 'House price prediction dataset', + 'created_at': datetime.now().isoformat() + } + } + + main_branch.objects.upload( + path='datasets/metadata.json', + data=json.dumps(datasets_metadata, indent=2).encode(), + content_type='application/json' + ) + + # Commit datasets + commit = main_branch.commits.create( + message="Add sample ML datasets: customer churn and house prices", + metadata={ + 'datasets_added': 2, + 'classification_samples': len(classification_df), + 'regression_samples': len(regression_df) + } + ) + + logger.info(f"Sample datasets created: {commit.id}") + return commit + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + repo = setup_ml_repository() + create_ml_directory_structure(repo) + generate_sample_datasets(repo) + + print("ML repository setup complete!") +``` + +## Step 3: Experiment Tracking Framework + +### Core Experiment Tracking System + +```python +# src/experiment_tracker.py +import lakefs +import lakefs_spec +import pandas as pd +import numpy as np +import json +import pickle +import joblib +from datetime import datetime +from typing import Dict, List, Any, Optional, Union +import logging +import 
uuid +from dataclasses import dataclass, field, asdict +from pathlib import Path +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.metrics import ( + accuracy_score, precision_score, recall_score, f1_score, + roc_auc_score, mean_squared_error, mean_absolute_error, r2_score, + confusion_matrix, roc_curve, precision_recall_curve +) +import io + +logger = logging.getLogger(__name__) + +@dataclass +class ExperimentRun: + """Represents a single ML experiment run""" + run_id: str + experiment_name: str + model_type: str + start_time: datetime + end_time: Optional[datetime] = None + status: str = "running" # running, completed, failed + parameters: Dict[str, Any] = field(default_factory=dict) + metrics: Dict[str, float] = field(default_factory=dict) + artifacts: Dict[str, str] = field(default_factory=dict) # artifact_name -> path + dataset_version: Optional[str] = None + model_version: Optional[str] = None + notes: str = "" + tags: List[str] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for serialization""" + return { + 'run_id': self.run_id, + 'experiment_name': self.experiment_name, + 'model_type': self.model_type, + 'start_time': self.start_time.isoformat(), + 'end_time': self.end_time.isoformat() if self.end_time else None, + 'status': self.status, + 'parameters': self.parameters, + 'metrics': self.metrics, + 'artifacts': self.artifacts, + 'dataset_version': self.dataset_version, + 'model_version': self.model_version, + 'notes': self.notes, + 'tags': self.tags + } + +class MLExperimentTracker: + """Comprehensive ML experiment tracking with lakeFS""" + + def __init__(self, repository_name: str = 'ml-experiments'): + self.client = lakefs.Client( + host='http://localhost:8000', + username='AKIAIOSFODNN7EXAMPLE', + password='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + ) + self.repo = self.client.repositories.get(repository_name) + self.fs = lakefs_spec.LakeFSFileSystem( + host='http://localhost:8000', + username='AKIAIOSFODNN7EXAMPLE', + password='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + ) + self.logger = logging.getLogger(__name__) + + # Current experiment state + self.current_run: Optional[ExperimentRun] = None + self.current_branch: Optional[str] = None + + def start_experiment( + self, + experiment_name: str, + model_type: str, + parameters: Dict[str, Any], + dataset_version: Optional[str] = None, + notes: str = "", + tags: List[str] = None, + branch_name: Optional[str] = None + ) -> ExperimentRun: + """Start a new experiment run""" + + run_id = str(uuid.uuid4()) + + # Create experiment branch if not provided + if branch_name is None: + branch_name = f"experiment/{experiment_name}/{run_id[:8]}" + + try: + # Create experiment branch + experiment_branch = self.repo.branches.create( + branch_name, + source_reference='experiments' + ) + self.current_branch = branch_name + self.logger.info(f"Created experiment branch: {branch_name}") + except Exception as e: + # Branch might already exist + experiment_branch = self.repo.branches.get(branch_name) + self.current_branch = branch_name + self.logger.info(f"Using existing experiment branch: {branch_name}") + + # Initialize experiment run + self.current_run = ExperimentRun( + run_id=run_id, + experiment_name=experiment_name, + model_type=model_type, + start_time=datetime.now(), + parameters=parameters, + dataset_version=dataset_version, + notes=notes, + tags=tags or [] + ) + + # Save initial experiment metadata + self._save_experiment_metadata(experiment_branch) + + 
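+        # Writing metadata at the start means even a run that crashes later leaves a record under experiments/runs/<run_id>/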
self.logger.info(f"Started experiment: {experiment_name} (run_id: {run_id})") + return self.current_run + + def log_parameter(self, key: str, value: Any): + """Log a parameter for the current experiment""" + if self.current_run: + self.current_run.parameters[key] = value + self.logger.debug(f"Logged parameter: {key} = {value}") + + def log_metric(self, key: str, value: float): + """Log a metric for the current experiment""" + if self.current_run: + self.current_run.metrics[key] = value + self.logger.debug(f"Logged metric: {key} = {value}") + + def log_metrics(self, metrics: Dict[str, float]): + """Log multiple metrics at once""" + for key, value in metrics.items(): + self.log_metric(key, value) + + def log_artifact(self, artifact_name: str, artifact_path: str, artifact_data: Any = None): + """Log an artifact (model, plot, etc.) for the current experiment""" + if not self.current_run or not self.current_branch: + raise ValueError("No active experiment run") + + branch = self.repo.branches.get(self.current_branch) + + # Determine storage path + storage_path = f"experiments/runs/{self.current_run.run_id}/artifacts/{artifact_name}" + + # Save artifact data if provided + if artifact_data is not None: + if isinstance(artifact_data, plt.Figure): + # Save matplotlib figure + buffer = io.BytesIO() + artifact_data.savefig(buffer, format='png', dpi=300, bbox_inches='tight') + buffer.seek(0) + + branch.objects.upload( + path=f"{storage_path}.png", + data=buffer.getvalue(), + content_type='image/png' + ) + storage_path = f"{storage_path}.png" + + elif hasattr(artifact_data, 'save'): + # Save scikit-learn model or similar + buffer = io.BytesIO() + joblib.dump(artifact_data, buffer) + + branch.objects.upload( + path=f"{storage_path}.joblib", + data=buffer.getvalue(), + content_type='application/octet-stream' + ) + storage_path = f"{storage_path}.joblib" + + elif isinstance(artifact_data, (dict, list)): + # Save JSON data + json_data = json.dumps(artifact_data, indent=2, default=str) + + branch.objects.upload( + path=f"{storage_path}.json", + data=json_data.encode(), + content_type='application/json' + ) + storage_path = f"{storage_path}.json" + + elif isinstance(artifact_data, pd.DataFrame): + # Save DataFrame as CSV + csv_data = artifact_data.to_csv(index=False) + + branch.objects.upload( + path=f"{storage_path}.csv", + data=csv_data.encode(), + content_type='text/csv' + ) + storage_path = f"{storage_path}.csv" + + # Record artifact path + self.current_run.artifacts[artifact_name] = storage_path + self.logger.info(f"Logged artifact: {artifact_name} -> {storage_path}") + + def log_model(self, model, model_name: str = "model", metadata: Dict[str, Any] = None): + """Log a trained model with metadata""" + if not self.current_run or not self.current_branch: + raise ValueError("No active experiment run") + + branch = self.repo.branches.get(self.current_branch) + + # Save model + model_path = f"experiments/runs/{self.current_run.run_id}/models/{model_name}.joblib" + + buffer = io.BytesIO() + joblib.dump(model, buffer) + + branch.objects.upload( + path=model_path, + data=buffer.getvalue(), + content_type='application/octet-stream' + ) + + # Save model metadata + model_metadata = { + 'model_name': model_name, + 'model_type': self.current_run.model_type, + 'run_id': self.current_run.run_id, + 'experiment_name': self.current_run.experiment_name, + 'created_at': datetime.now().isoformat(), + 'parameters': self.current_run.parameters, + 'metrics': self.current_run.metrics, + 'custom_metadata': metadata or {} + } + + 
metadata_path = f"experiments/runs/{self.current_run.run_id}/models/{model_name}_metadata.json" + branch.objects.upload( + path=metadata_path, + data=json.dumps(model_metadata, indent=2).encode(), + content_type='application/json' + ) + + # Update run artifacts + self.current_run.artifacts[f"{model_name}_model"] = model_path + self.current_run.artifacts[f"{model_name}_metadata"] = metadata_path + self.current_run.model_version = model_path + + self.logger.info(f"Logged model: {model_name} -> {model_path}") + + def evaluate_model(self, model, X_test, y_test, task_type: str = 'classification') -> Dict[str, float]: + """Evaluate model and log metrics automatically""" + + y_pred = model.predict(X_test) + metrics = {} + + if task_type == 'classification': + # Classification metrics + metrics['accuracy'] = accuracy_score(y_test, y_pred) + metrics['precision'] = precision_score(y_test, y_pred, average='weighted') + metrics['recall'] = recall_score(y_test, y_pred, average='weighted') + metrics['f1_score'] = f1_score(y_test, y_pred, average='weighted') + + # ROC AUC for binary classification + if len(np.unique(y_test)) == 2: + if hasattr(model, 'predict_proba'): + y_pred_proba = model.predict_proba(X_test)[:, 1] + metrics['roc_auc'] = roc_auc_score(y_test, y_pred_proba) + elif hasattr(model, 'decision_function'): + y_pred_scores = model.decision_function(X_test) + metrics['roc_auc'] = roc_auc_score(y_test, y_pred_scores) + + # Log confusion matrix + cm = confusion_matrix(y_test, y_pred) + self.log_artifact('confusion_matrix', 'confusion_matrix', cm.tolist()) + + # Create and log confusion matrix plot + plt.figure(figsize=(8, 6)) + sns.heatmap(cm, annot=True, fmt='d', cmap='Blues') + plt.title('Confusion Matrix') + plt.ylabel('True Label') + plt.xlabel('Predicted Label') + self.log_artifact('confusion_matrix_plot', 'confusion_matrix_plot', plt.gcf()) + plt.close() + + elif task_type == 'regression': + # Regression metrics + metrics['mse'] = mean_squared_error(y_test, y_pred) + metrics['rmse'] = np.sqrt(metrics['mse']) + metrics['mae'] = mean_absolute_error(y_test, y_pred) + metrics['r2_score'] = r2_score(y_test, y_pred) + + # Create and log residuals plot + residuals = y_test - y_pred + plt.figure(figsize=(10, 6)) + + plt.subplot(1, 2, 1) + plt.scatter(y_pred, residuals, alpha=0.6) + plt.axhline(y=0, color='r', linestyle='--') + plt.xlabel('Predicted Values') + plt.ylabel('Residuals') + plt.title('Residuals vs Predicted') + + plt.subplot(1, 2, 2) + plt.hist(residuals, bins=30, alpha=0.7) + plt.xlabel('Residuals') + plt.ylabel('Frequency') + plt.title('Residuals Distribution') + + plt.tight_layout() + self.log_artifact('residuals_plot', 'residuals_plot', plt.gcf()) + plt.close() + + # Log all metrics + self.log_metrics(metrics) + + return metrics + + def end_experiment(self, status: str = "completed", notes: str = ""): + """End the current experiment run""" + if not self.current_run or not self.current_branch: + raise ValueError("No active experiment run") + + self.current_run.end_time = datetime.now() + self.current_run.status = status + if notes: + self.current_run.notes += f"\n{notes}" + + # Save final experiment metadata + branch = self.repo.branches.get(self.current_branch) + self._save_experiment_metadata(branch) + + # Commit experiment results + duration = (self.current_run.end_time - self.current_run.start_time).total_seconds() + + commit = branch.commits.create( + message=f"Complete experiment: {self.current_run.experiment_name}", + metadata={ + 'experiment_name': 
self.current_run.experiment_name, + 'run_id': self.current_run.run_id, + 'status': status, + 'duration_seconds': str(duration), + 'metrics': json.dumps(self.current_run.metrics) + } + ) + + self.logger.info(f"Experiment completed: {self.current_run.experiment_name} (duration: {duration:.2f}s)") + + # Reset current state + completed_run = self.current_run + self.current_run = None + self.current_branch = None + + return completed_run, commit.id + + def _save_experiment_metadata(self, branch): + """Save experiment metadata to lakeFS""" + if self.current_run: + metadata_path = f"experiments/runs/{self.current_run.run_id}/metadata.json" + metadata_json = json.dumps(self.current_run.to_dict(), indent=2) + + branch.objects.upload( + path=metadata_path, + data=metadata_json.encode(), + content_type='application/json' + ) + + def list_experiments(self, experiment_name: Optional[str] = None) -> List[Dict[str, Any]]: + """List all experiments or experiments by name""" + + experiments = [] + + try: + # List all experiment run directories + experiments_branch = self.repo.branches.get('experiments') + + # Get all run directories + run_objects = experiments_branch.objects.list(prefix='experiments/runs/') + + run_ids = set() + for obj in run_objects: + path_parts = obj.path.split('/') + if len(path_parts) >= 3 and path_parts[2] not in run_ids: + run_ids.add(path_parts[2]) + + # Load metadata for each run + for run_id in run_ids: + try: + metadata_obj = experiments_branch.objects.get(f'experiments/runs/{run_id}/metadata.json') + metadata = json.loads(metadata_obj.reader().read().decode()) + + if experiment_name is None or metadata.get('experiment_name') == experiment_name: + experiments.append(metadata) + + except Exception as e: + self.logger.warning(f"Could not load metadata for run {run_id}: {e}") + + # Sort by start time + experiments.sort(key=lambda x: x.get('start_time', ''), reverse=True) + + except Exception as e: + self.logger.error(f"Error listing experiments: {e}") + + return experiments + + def load_model(self, run_id: str, model_name: str = "model"): + """Load a model from a specific experiment run""" + + try: + experiments_branch = self.repo.branches.get('experiments') + model_path = f"experiments/runs/{run_id}/models/{model_name}.joblib" + + model_obj = experiments_branch.objects.get(model_path) + model_data = model_obj.reader().read() + + # Load model from bytes + model = joblib.load(io.BytesIO(model_data)) + + self.logger.info(f"Loaded model: {model_name} from run {run_id}") + return model + + except Exception as e: + self.logger.error(f"Error loading model: {e}") + raise + + def compare_experiments(self, run_ids: List[str]) -> pd.DataFrame: + """Compare multiple experiment runs""" + + comparison_data = [] + + for run_id in run_ids: + try: + experiments_branch = self.repo.branches.get('experiments') + metadata_obj = experiments_branch.objects.get(f'experiments/runs/{run_id}/metadata.json') + metadata = json.loads(metadata_obj.reader().read().decode()) + + # Flatten data for comparison + row = { + 'run_id': run_id, + 'experiment_name': metadata.get('experiment_name'), + 'model_type': metadata.get('model_type'), + 'status': metadata.get('status'), + 'start_time': metadata.get('start_time') + } + + # Add parameters + for key, value in metadata.get('parameters', {}).items(): + row[f'param_{key}'] = value + + # Add metrics + for key, value in metadata.get('metrics', {}).items(): + row[f'metric_{key}'] = value + + comparison_data.append(row) + + except Exception as e: + 
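+                # A run with missing or unreadable metadata should not abort the whole
+                # comparison; record a warning and continue with the next run_id.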
self.logger.warning(f"Could not load data for run {run_id}: {e}") + + return pd.DataFrame(comparison_data) + +# Example usage +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # Create experiment tracker + tracker = MLExperimentTracker() + + # Start an experiment + run = tracker.start_experiment( + experiment_name="customer_churn_prediction", + model_type="random_forest", + parameters={ + 'n_estimators': 100, + 'max_depth': 10, + 'random_state': 42 + }, + notes="Initial baseline model", + tags=['baseline', 'random_forest'] + ) + + print(f"Started experiment: {run.run_id}") + + # Log additional parameters and metrics + tracker.log_parameter('feature_selection', 'all') + tracker.log_metric('training_time', 45.2) + + # End experiment + completed_run, commit_id = tracker.end_experiment("completed") + print(f"Experiment completed: {commit_id}") +```# +# Step 4: Complete ML Workflow Implementation + +### End-to-End ML Pipeline + +```python +# src/ml_pipeline.py +import lakefs +import lakefs_spec +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.svm import SVC, SVR +from sklearn.preprocessing import StandardScaler, LabelEncoder +from sklearn.pipeline import Pipeline +from sklearn.metrics import classification_report, regression_report +import xgboost as xgb +import lightgbm as lgb +from datetime import datetime +import logging +from typing import Dict, List, Any, Optional, Tuple +import json + +from src.experiment_tracker import MLExperimentTracker + +logger = logging.getLogger(__name__) + +class MLPipeline: + """Complete ML pipeline with experiment tracking""" + + def __init__(self, repository_name: str = 'ml-experiments'): + self.tracker = MLExperimentTracker(repository_name) + self.repo = self.tracker.repo + self.fs = self.tracker.fs + self.logger = logging.getLogger(__name__) + + # Available models + self.models = { + 'random_forest_classifier': RandomForestClassifier, + 'random_forest_regressor': RandomForestRegressor, + 'logistic_regression': LogisticRegression, + 'linear_regression': LinearRegression, + 'svc': SVC, + 'svr': SVR, + 'xgboost_classifier': xgb.XGBClassifier, + 'xgboost_regressor': xgb.XGBRegressor, + 'lightgbm_classifier': lgb.LGBMClassifier, + 'lightgbm_regressor': lgb.LGBMRegressor + } + + def load_dataset(self, dataset_path: str, branch: str = 'main') -> pd.DataFrame: + """Load dataset from lakeFS""" + + full_path = f"lakefs://ml-experiments/{branch}/{dataset_path}" + + try: + if dataset_path.endswith('.csv'): + df = pd.read_csv(full_path, filesystem=self.fs) + elif dataset_path.endswith('.parquet'): + df = pd.read_parquet(full_path, filesystem=self.fs) + else: + raise ValueError(f"Unsupported file format: {dataset_path}") + + self.logger.info(f"Loaded dataset: {dataset_path} ({len(df)} rows, {len(df.columns)} columns)") + return df + + except Exception as e: + self.logger.error(f"Error loading dataset {dataset_path}: {e}") + raise + + def preprocess_data( + self, + df: pd.DataFrame, + target_column: str, + feature_columns: Optional[List[str]] = None, + test_size: float = 0.2, + random_state: int = 42 + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: + """Preprocess data for ML training""" + + # Select features + if feature_columns is None: + feature_columns = [col for col in df.columns if col != 
target_column] + + X = df[feature_columns].copy() + y = df[target_column].copy() + + # Handle missing values + X = X.fillna(X.mean() if X.select_dtypes(include=[np.number]).shape[1] > 0 else X.mode().iloc[0]) + + # Encode categorical variables + categorical_columns = X.select_dtypes(include=['object']).columns + for col in categorical_columns: + le = LabelEncoder() + X[col] = le.fit_transform(X[col].astype(str)) + + # Split data + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=random_state, stratify=y if len(np.unique(y)) < 20 else None + ) + + self.logger.info(f"Data preprocessing complete: {len(X_train)} train, {len(X_test)} test samples") + + return X_train, X_test, y_train, y_test + + def run_experiment( + self, + experiment_name: str, + dataset_path: str, + target_column: str, + model_type: str, + hyperparameters: Dict[str, Any], + feature_columns: Optional[List[str]] = None, + task_type: str = 'classification', + cross_validation: bool = True, + hyperparameter_tuning: bool = False, + tuning_params: Optional[Dict[str, List]] = None, + notes: str = "", + tags: List[str] = None + ) -> Tuple[Any, Dict[str, float]]: + """Run a complete ML experiment""" + + # Start experiment tracking + run = self.tracker.start_experiment( + experiment_name=experiment_name, + model_type=model_type, + parameters=hyperparameters, + notes=notes, + tags=tags or [] + ) + + try: + # Load and preprocess data + df = self.load_dataset(dataset_path) + X_train, X_test, y_train, y_test = self.preprocess_data( + df, target_column, feature_columns + ) + + # Log dataset information + self.tracker.log_parameter('dataset_path', dataset_path) + self.tracker.log_parameter('target_column', target_column) + self.tracker.log_parameter('n_features', len(X_train.columns)) + self.tracker.log_parameter('n_train_samples', len(X_train)) + self.tracker.log_parameter('n_test_samples', len(X_test)) + self.tracker.log_parameter('task_type', task_type) + + # Create model + if model_type not in self.models: + raise ValueError(f"Unknown model type: {model_type}") + + model_class = self.models[model_type] + + # Hyperparameter tuning + if hyperparameter_tuning and tuning_params: + self.logger.info("Starting hyperparameter tuning...") + + # Create base model + base_model = model_class(**hyperparameters) + + # Grid search + grid_search = GridSearchCV( + base_model, + tuning_params, + cv=5, + scoring='accuracy' if task_type == 'classification' else 'r2', + n_jobs=-1, + verbose=1 + ) + + grid_search.fit(X_train, y_train) + + # Use best parameters + best_params = grid_search.best_params_ + hyperparameters.update(best_params) + + # Log tuning results + self.tracker.log_parameter('hyperparameter_tuning', True) + self.tracker.log_parameter('best_params', best_params) + self.tracker.log_metric('best_cv_score', grid_search.best_score_) + + model = grid_search.best_estimator_ + + else: + # Create model with given hyperparameters + model = model_class(**hyperparameters) + model.fit(X_train, y_train) + + # Cross-validation + if cross_validation: + cv_scores = cross_val_score( + model, X_train, y_train, cv=5, + scoring='accuracy' if task_type == 'classification' else 'r2' + ) + + self.tracker.log_metric('cv_mean', cv_scores.mean()) + self.tracker.log_metric('cv_std', cv_scores.std()) + self.tracker.log_artifact('cv_scores', 'cv_scores', cv_scores.tolist()) + + # Evaluate model + metrics = self.tracker.evaluate_model(model, X_test, y_test, task_type) + + # Feature importance (if available) + if hasattr(model, 
'feature_importances_'): + feature_importance = pd.DataFrame({ + 'feature': X_train.columns, + 'importance': model.feature_importances_ + }).sort_values('importance', ascending=False) + + self.tracker.log_artifact('feature_importance', 'feature_importance', feature_importance) + + # Create feature importance plot + import matplotlib.pyplot as plt + plt.figure(figsize=(10, 8)) + top_features = feature_importance.head(20) + plt.barh(range(len(top_features)), top_features['importance']) + plt.yticks(range(len(top_features)), top_features['feature']) + plt.xlabel('Feature Importance') + plt.title('Top 20 Feature Importances') + plt.gca().invert_yaxis() + plt.tight_layout() + + self.tracker.log_artifact('feature_importance_plot', 'feature_importance_plot', plt.gcf()) + plt.close() + + # Log model + self.tracker.log_model(model, 'trained_model', { + 'feature_columns': list(X_train.columns), + 'target_column': target_column, + 'model_class': model_class.__name__ + }) + + # Log training data sample + train_sample = X_train.head(100).copy() + train_sample[target_column] = y_train.head(100) + self.tracker.log_artifact('training_data_sample', 'training_data_sample', train_sample) + + # End experiment + completed_run, commit_id = self.tracker.end_experiment("completed") + + self.logger.info(f"Experiment completed successfully: {experiment_name}") + + return model, metrics + + except Exception as e: + self.logger.error(f"Experiment failed: {e}") + self.tracker.end_experiment("failed", f"Error: {str(e)}") + raise + + def run_hyperparameter_optimization( + self, + experiment_name: str, + dataset_path: str, + target_column: str, + model_type: str, + param_space: Dict[str, List], + n_trials: int = 50, + task_type: str = 'classification' + ): + """Run hyperparameter optimization using Optuna""" + + try: + import optuna + except ImportError: + raise ImportError("Optuna is required for hyperparameter optimization. Install with: pip install optuna") + + # Load data + df = self.load_dataset(dataset_path) + X_train, X_test, y_train, y_test = self.preprocess_data(df, target_column) + + def objective(trial): + # Sample hyperparameters + params = {} + for param_name, param_values in param_space.items(): + if isinstance(param_values[0], int): + params[param_name] = trial.suggest_int(param_name, min(param_values), max(param_values)) + elif isinstance(param_values[0], float): + params[param_name] = trial.suggest_float(param_name, min(param_values), max(param_values)) + else: + params[param_name] = trial.suggest_categorical(param_name, param_values) + + # Train model + model_class = self.models[model_type] + model = model_class(**params) + + # Cross-validation score + cv_scores = cross_val_score( + model, X_train, y_train, cv=3, + scoring='accuracy' if task_type == 'classification' else 'r2' + ) + + return cv_scores.mean() + + # Run optimization + study = optuna.create_study(direction='maximize') + study.optimize(objective, n_trials=n_trials) + + # Run experiment with best parameters + best_params = study.best_params + + model, metrics = self.run_experiment( + experiment_name=f"{experiment_name}_optimized", + dataset_path=dataset_path, + target_column=target_column, + model_type=model_type, + hyperparameters=best_params, + task_type=task_type, + notes=f"Hyperparameter optimization with {n_trials} trials. 
Best score: {study.best_value:.4f}", + tags=['optimized', 'optuna'] + ) + + return model, metrics, study + + def compare_models( + self, + experiment_name: str, + dataset_path: str, + target_column: str, + models_to_compare: List[Dict[str, Any]], + task_type: str = 'classification' + ) -> pd.DataFrame: + """Compare multiple models on the same dataset""" + + results = [] + + for i, model_config in enumerate(models_to_compare): + model_type = model_config['model_type'] + hyperparameters = model_config.get('hyperparameters', {}) + + try: + model, metrics = self.run_experiment( + experiment_name=f"{experiment_name}_comparison_{i+1}", + dataset_path=dataset_path, + target_column=target_column, + model_type=model_type, + hyperparameters=hyperparameters, + task_type=task_type, + notes=f"Model comparison experiment {i+1}/{len(models_to_compare)}", + tags=['comparison', model_type] + ) + + result = { + 'model_type': model_type, + 'experiment_name': f"{experiment_name}_comparison_{i+1}", + **hyperparameters, + **metrics + } + results.append(result) + + except Exception as e: + self.logger.error(f"Failed to train {model_type}: {e}") + result = { + 'model_type': model_type, + 'experiment_name': f"{experiment_name}_comparison_{i+1}", + 'error': str(e) + } + results.append(result) + + comparison_df = pd.DataFrame(results) + + # Save comparison results + experiments_branch = self.repo.branches.get('experiments') + comparison_path = f"experiments/comparisons/{experiment_name}_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" + + comparison_csv = comparison_df.to_csv(index=False) + experiments_branch.objects.upload( + path=comparison_path, + data=comparison_csv.encode(), + content_type='text/csv' + ) + + self.logger.info(f"Model comparison complete. Results saved to {comparison_path}") + + return comparison_df + +# Example usage and complete workflow +def run_complete_ml_workflow(): + """Demonstrate complete ML workflow with experiment tracking""" + + logging.basicConfig(level=logging.INFO) + + # Initialize ML pipeline + ml_pipeline = MLPipeline() + + # 1. Classification experiment + print("=== Running Classification Experiments ===") + + # Baseline Random Forest + rf_model, rf_metrics = ml_pipeline.run_experiment( + experiment_name="customer_churn_baseline", + dataset_path="datasets/raw/customer_churn_dataset.csv", + target_column="target", + model_type="random_forest_classifier", + hyperparameters={ + 'n_estimators': 100, + 'max_depth': 10, + 'random_state': 42 + }, + task_type='classification', + notes="Baseline Random Forest model for customer churn prediction", + tags=['baseline', 'random_forest', 'churn'] + ) + + print(f"Random Forest Metrics: {rf_metrics}") + + # XGBoost with hyperparameter tuning + xgb_model, xgb_metrics = ml_pipeline.run_experiment( + experiment_name="customer_churn_xgboost", + dataset_path="datasets/raw/customer_churn_dataset.csv", + target_column="target", + model_type="xgboost_classifier", + hyperparameters={ + 'n_estimators': 100, + 'max_depth': 6, + 'learning_rate': 0.1, + 'random_state': 42 + }, + task_type='classification', + hyperparameter_tuning=True, + tuning_params={ + 'n_estimators': [50, 100, 200], + 'max_depth': [3, 6, 10], + 'learning_rate': [0.01, 0.1, 0.2] + }, + notes="XGBoost with hyperparameter tuning", + tags=['tuned', 'xgboost', 'churn'] + ) + + print(f"XGBoost Metrics: {xgb_metrics}") + + # 2. 
Model comparison + print("\n=== Running Model Comparison ===") + + models_to_compare = [ + { + 'model_type': 'random_forest_classifier', + 'hyperparameters': {'n_estimators': 100, 'random_state': 42} + }, + { + 'model_type': 'logistic_regression', + 'hyperparameters': {'random_state': 42, 'max_iter': 1000} + }, + { + 'model_type': 'xgboost_classifier', + 'hyperparameters': {'n_estimators': 100, 'random_state': 42} + } + ] + + comparison_results = ml_pipeline.compare_models( + experiment_name="churn_model_comparison", + dataset_path="datasets/raw/customer_churn_dataset.csv", + target_column="target", + models_to_compare=models_to_compare, + task_type='classification' + ) + + print("Model Comparison Results:") + print(comparison_results[['model_type', 'accuracy', 'f1_score', 'roc_auc']].to_string()) + + # 3. Regression experiment + print("\n=== Running Regression Experiments ===") + + # House price prediction + reg_model, reg_metrics = ml_pipeline.run_experiment( + experiment_name="house_price_prediction", + dataset_path="datasets/raw/house_price_dataset.csv", + target_column="target", + model_type="random_forest_regressor", + hyperparameters={ + 'n_estimators': 100, + 'max_depth': 15, + 'random_state': 42 + }, + task_type='regression', + notes="House price prediction using Random Forest", + tags=['regression', 'house_prices', 'random_forest'] + ) + + print(f"Regression Metrics: {reg_metrics}") + + # 4. List all experiments + print("\n=== Experiment Summary ===") + + all_experiments = ml_pipeline.tracker.list_experiments() + + print(f"Total experiments run: {len(all_experiments)}") + for exp in all_experiments[:5]: # Show last 5 experiments + print(f"- {exp['experiment_name']} ({exp['model_type']}) - Status: {exp['status']}") + if exp.get('metrics'): + main_metric = list(exp['metrics'].keys())[0] if exp['metrics'] else 'N/A' + main_value = exp['metrics'].get(main_metric, 'N/A') + print(f" {main_metric}: {main_value}") + + # 5. 
Compare specific experiments + print("\n=== Experiment Comparison ===") + + # Get run IDs for comparison + churn_experiments = [exp for exp in all_experiments if 'churn' in exp['experiment_name']] + if len(churn_experiments) >= 2: + run_ids = [exp['run_id'] for exp in churn_experiments[:3]] + comparison_df = ml_pipeline.tracker.compare_experiments(run_ids) + + print("Churn Prediction Experiments Comparison:") + metric_columns = [col for col in comparison_df.columns if col.startswith('metric_')] + display_columns = ['experiment_name', 'model_type'] + metric_columns + print(comparison_df[display_columns].to_string()) + +if __name__ == "__main__": + run_complete_ml_workflow() +``` + +## Step 5: Model Registry and Deployment + +### Model Registry System + +```python +# src/model_registry.py +import lakefs +import lakefs_spec +import pandas as pd +import numpy as np +import json +import joblib +from datetime import datetime +from typing import Dict, List, Any, Optional +import logging +from dataclasses import dataclass, field, asdict +from enum import Enum +import io +import uuid + +logger = logging.getLogger(__name__) + +class ModelStage(Enum): + STAGING = "staging" + PRODUCTION = "production" + ARCHIVED = "archived" + +@dataclass +class RegisteredModel: + """Represents a registered model in the model registry""" + model_name: str + model_version: str + model_stage: ModelStage + model_path: str + metadata_path: str + experiment_run_id: str + created_at: datetime + created_by: str + description: str + tags: List[str] = field(default_factory=list) + metrics: Dict[str, float] = field(default_factory=dict) + parameters: Dict[str, Any] = field(default_factory=dict) + model_signature: Optional[Dict[str, Any]] = None + deployment_info: Dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for serialization""" + return { + 'model_name': self.model_name, + 'model_version': self.model_version, + 'model_stage': self.model_stage.value, + 'model_path': self.model_path, + 'metadata_path': self.metadata_path, + 'experiment_run_id': self.experiment_run_id, + 'created_at': self.created_at.isoformat(), + 'created_by': self.created_by, + 'description': self.description, + 'tags': self.tags, + 'metrics': self.metrics, + 'parameters': self.parameters, + 'model_signature': self.model_signature, + 'deployment_info': self.deployment_info + } + +class ModelRegistry: + """Centralized model registry using lakeFS""" + + def __init__(self, repository_name: str = 'ml-experiments'): + self.client = lakefs.Client( + host='http://localhost:8000', + username='AKIAIOSFODNN7EXAMPLE', + password='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + ) + self.repo = self.client.repositories.get(repository_name) + self.fs = lakefs_spec.LakeFSFileSystem( + host='http://localhost:8000', + username='AKIAIOSFODNN7EXAMPLE', + password='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + ) + self.logger = logging.getLogger(__name__) + + def register_model( + self, + model_name: str, + experiment_run_id: str, + model_artifact_name: str = "trained_model", + description: str = "", + tags: List[str] = None, + stage: ModelStage = ModelStage.STAGING, + created_by: str = "unknown" + ) -> RegisteredModel: + """Register a model from an experiment run""" + + # Load experiment metadata + experiments_branch = self.repo.branches.get('experiments') + + try: + # Get experiment metadata + exp_metadata_obj = experiments_branch.objects.get(f'experiments/runs/{experiment_run_id}/metadata.json') + exp_metadata = 
json.loads(exp_metadata_obj.reader().read().decode()) + + # Get model metadata + model_metadata_obj = experiments_branch.objects.get(f'experiments/runs/{experiment_run_id}/models/{model_artifact_name}_metadata.json') + model_metadata = json.loads(model_metadata_obj.reader().read().decode()) + + except Exception as e: + raise ValueError(f"Could not load experiment or model metadata: {e}") + + # Generate model version + model_version = f"v{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + # Copy model to registry + registry_branch = self._get_or_create_registry_branch() + + # Source paths + source_model_path = f'experiments/runs/{experiment_run_id}/models/{model_artifact_name}.joblib' + source_metadata_path = f'experiments/runs/{experiment_run_id}/models/{model_artifact_name}_metadata.json' + + # Registry paths + registry_model_path = f'models/registered/{model_name}/{model_version}/model.joblib' + registry_metadata_path = f'models/registered/{model_name}/{model_version}/metadata.json' + + # Copy model file + model_obj = experiments_branch.objects.get(source_model_path) + model_data = model_obj.reader().read() + + registry_branch.objects.upload( + path=registry_model_path, + data=model_data, + content_type='application/octet-stream' + ) + + # Copy model metadata + registry_branch.objects.upload( + path=registry_metadata_path, + data=json.dumps(model_metadata, indent=2).encode(), + content_type='application/json' + ) + + # Create registered model entry + registered_model = RegisteredModel( + model_name=model_name, + model_version=model_version, + model_stage=stage, + model_path=registry_model_path, + metadata_path=registry_metadata_path, + experiment_run_id=experiment_run_id, + created_at=datetime.now(), + created_by=created_by, + description=description, + tags=tags or [], + metrics=exp_metadata.get('metrics', {}), + parameters=exp_metadata.get('parameters', {}), + model_signature=self._extract_model_signature(model_metadata) + ) + + # Save registry entry + registry_entry_path = f'models/registered/{model_name}/{model_version}/registry_entry.json' + registry_branch.objects.upload( + path=registry_entry_path, + data=json.dumps(registered_model.to_dict(), indent=2).encode(), + content_type='application/json' + ) + + # Update model index + self._update_model_index(registry_branch, registered_model) + + # Commit registration + commit = registry_branch.commits.create( + message=f"Register model: {model_name} version {model_version}", + metadata={ + 'model_name': model_name, + 'model_version': model_version, + 'stage': stage.value, + 'experiment_run_id': experiment_run_id + } + ) + + self.logger.info(f"Registered model: {model_name} version {model_version}") + + return registered_model + + def list_models(self, model_name: Optional[str] = None, stage: Optional[ModelStage] = None) -> List[RegisteredModel]: + """List registered models""" + + models = [] + + try: + registry_branch = self.repo.branches.get('production') + + # Get model index + index_obj = registry_branch.objects.get('models/model_index.json') + model_index = json.loads(index_obj.reader().read().decode()) + + for model_entry in model_index.get('models', []): + # Filter by model name + if model_name and model_entry['model_name'] != model_name: + continue + + # Filter by stage + if stage and ModelStage(model_entry['model_stage']) != stage: + continue + + # Convert to RegisteredModel object + model_entry['created_at'] = datetime.fromisoformat(model_entry['created_at']) + model_entry['model_stage'] = ModelStage(model_entry['model_stage']) + + 
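+                # Rehydrate the plain JSON index entry back into a RegisteredModel
+                # dataclass so callers work with typed objects instead of raw dicts.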
registered_model = RegisteredModel(**model_entry) + models.append(registered_model) + + except Exception as e: + self.logger.error(f"Error listing models: {e}") + + return models + + def get_model(self, model_name: str, model_version: Optional[str] = None, stage: Optional[ModelStage] = None): + """Get a specific model""" + + models = self.list_models(model_name=model_name, stage=stage) + + if not models: + raise ValueError(f"No models found for {model_name}") + + if model_version: + # Find specific version + for model in models: + if model.model_version == model_version: + return model + raise ValueError(f"Model version {model_version} not found for {model_name}") + else: + # Return latest version + models.sort(key=lambda x: x.created_at, reverse=True) + return models[0] + + def load_model(self, model_name: str, model_version: Optional[str] = None, stage: Optional[ModelStage] = None): + """Load a model from the registry""" + + registered_model = self.get_model(model_name, model_version, stage) + + try: + registry_branch = self.repo.branches.get('production') + model_obj = registry_branch.objects.get(registered_model.model_path) + model_data = model_obj.reader().read() + + # Load model from bytes + model = joblib.load(io.BytesIO(model_data)) + + self.logger.info(f"Loaded model: {model_name} version {registered_model.model_version}") + return model, registered_model + + except Exception as e: + self.logger.error(f"Error loading model: {e}") + raise + + def promote_model(self, model_name: str, model_version: str, target_stage: ModelStage) -> RegisteredModel: + """Promote a model to a different stage""" + + # Get current model + current_model = self.get_model(model_name, model_version) + + # Update stage + current_model.model_stage = target_stage + + # Update registry + registry_branch = self._get_or_create_registry_branch() + + # Update registry entry + registry_entry_path = f'models/registered/{model_name}/{model_version}/registry_entry.json' + registry_branch.objects.upload( + path=registry_entry_path, + data=json.dumps(current_model.to_dict(), indent=2).encode(), + content_type='application/json' + ) + + # Update model index + self._update_model_index(registry_branch, current_model) + + # Commit promotion + commit = registry_branch.commits.create( + message=f"Promote model: {model_name} version {model_version} to {target_stage.value}", + metadata={ + 'model_name': model_name, + 'model_version': model_version, + 'new_stage': target_stage.value, + 'action': 'promotion' + } + ) + + # If promoting to production, merge to production branch + if target_stage == ModelStage.PRODUCTION: + try: + production_branch = self.repo.branches.get('production') + production_branch.merge( + source_reference=registry_branch.id, + message=f"Deploy model to production: {model_name} version {model_version}" + ) + self.logger.info(f"Model deployed to production: {model_name} version {model_version}") + except Exception as e: + self.logger.warning(f"Could not merge to production branch: {e}") + + self.logger.info(f"Promoted model: {model_name} version {model_version} to {target_stage.value}") + + return current_model + + def archive_model(self, model_name: str, model_version: str) -> RegisteredModel: + """Archive a model""" + return self.promote_model(model_name, model_version, ModelStage.ARCHIVED) + + def delete_model(self, model_name: str, model_version: str): + """Delete a model from the registry""" + + registry_branch = self._get_or_create_registry_branch() + + # Delete model files + model_dir = 
f'models/registered/{model_name}/{model_version}/' + + try: + # List all objects in the model directory + objects = registry_branch.objects.list(prefix=model_dir) + + for obj in objects: + registry_branch.objects.delete(obj.path) + + # Update model index + self._remove_from_model_index(registry_branch, model_name, model_version) + + # Commit deletion + commit = registry_branch.commits.create( + message=f"Delete model: {model_name} version {model_version}", + metadata={ + 'model_name': model_name, + 'model_version': model_version, + 'action': 'deletion' + } + ) + + self.logger.info(f"Deleted model: {model_name} version {model_version}") + + except Exception as e: + self.logger.error(f"Error deleting model: {e}") + raise + + def _get_or_create_registry_branch(self): + """Get or create the model registry branch""" + try: + return self.repo.branches.get('model-registry') + except: + return self.repo.branches.create('model-registry', source_reference='production') + + def _extract_model_signature(self, model_metadata: Dict[str, Any]) -> Dict[str, Any]: + """Extract model signature from metadata""" + return { + 'feature_columns': model_metadata.get('custom_metadata', {}).get('feature_columns', []), + 'target_column': model_metadata.get('custom_metadata', {}).get('target_column'), + 'model_class': model_metadata.get('custom_metadata', {}).get('model_class') + } + + def _update_model_index(self, branch, registered_model: RegisteredModel): + """Update the model index with new/updated model""" + + try: + # Load existing index + index_obj = branch.objects.get('models/model_index.json') + model_index = json.loads(index_obj.reader().read().decode()) + except: + # Create new index + model_index = {'models': [], 'last_updated': datetime.now().isoformat()} + + # Remove existing entry for this model version + model_index['models'] = [ + m for m in model_index['models'] + if not (m['model_name'] == registered_model.model_name and m['model_version'] == registered_model.model_version) + ] + + # Add new entry + model_index['models'].append(registered_model.to_dict()) + model_index['last_updated'] = datetime.now().isoformat() + + # Save updated index + branch.objects.upload( + path='models/model_index.json', + data=json.dumps(model_index, indent=2).encode(), + content_type='application/json' + ) + + def _remove_from_model_index(self, branch, model_name: str, model_version: str): + """Remove model from index""" + + try: + # Load existing index + index_obj = branch.objects.get('models/model_index.json') + model_index = json.loads(index_obj.reader().read().decode()) + + # Remove entry + model_index['models'] = [ + m for m in model_index['models'] + if not (m['model_name'] == model_name and m['model_version'] == model_version) + ] + model_index['last_updated'] = datetime.now().isoformat() + + # Save updated index + branch.objects.upload( + path='models/model_index.json', + data=json.dumps(model_index, indent=2).encode(), + content_type='application/json' + ) + + except Exception as e: + self.logger.warning(f"Could not update model index: {e}") + +# Example usage +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # Create model registry + registry = ModelRegistry() + + # Register a model (assuming we have an experiment run) + # registered_model = registry.register_model( + # model_name="customer_churn_predictor", + # experiment_run_id="some-run-id", + # description="Random Forest model for customer churn prediction", + # tags=['churn', 'random_forest', 'production_ready'] + # ) + + # List all 
models
+    models = registry.list_models()
+    print(f"Found {len(models)} registered models")
+
+    for model in models:
+        print(f"- {model.model_name} {model.model_version} ({model.model_stage.value})")
+```
+
+## Step 6: A/B Testing and Model Comparison
+
+### A/B Testing Framework
+
+```python
+# src/ab_testing.py
+import lakefs
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+from typing import Dict, List, Any, Optional, Tuple
+import logging
+from dataclasses import dataclass, field
+from scipy import stats
+import json
+import uuid
+
+from src.model_registry import ModelRegistry, ModelStage
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class ABTestConfig:
+    """Configuration for A/B test"""
+    test_name: str
+    description: str
+    model_a_name: str
+    model_a_version: str
+    model_b_name: str
+    model_b_version: str
+    traffic_split: float = 0.5  # Fraction of traffic routed to model B
+    success_metric: str = 'accuracy'
+    minimum_sample_size: int = 1000
+    significance_level: float = 0.05
+    test_duration_days: int = 7
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'test_name': self.test_name,
+            'description': self.description,
+            'model_a_name': self.model_a_name,
+            'model_a_version': self.model_a_version,
+            'model_b_name': self.model_b_name,
+            'model_b_version': self.model_b_version,
+            'traffic_split': self.traffic_split,
+            'success_metric': self.success_metric,
+            'minimum_sample_size': self.minimum_sample_size,
+            'significance_level': self.significance_level,
+            'test_duration_days': self.test_duration_days
+        }
+
+@dataclass
+class ABTestResult:
+    """Results of an A/B test"""
+    test_name: str
+    start_date: datetime
+    end_date: datetime
+    model_a_metrics: Dict[str, float]
+    model_b_metrics: Dict[str, float]
+    sample_size_a: int
+    sample_size_b: int
+    statistical_significance: bool
+    p_value: float
+    confidence_interval: Tuple[float, float]
+    winner: str  # 'model_a', 'model_b', or 'inconclusive'
+    recommendation: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'test_name': self.test_name,
+            'start_date': self.start_date.isoformat(),
+            'end_date': self.end_date.isoformat(),
+            'model_a_metrics': self.model_a_metrics,
+            'model_b_metrics': self.model_b_metrics,
+            'sample_size_a': self.sample_size_a,
+            'sample_size_b': self.sample_size_b,
+            'statistical_significance': self.statistical_significance,
+            'p_value': self.p_value,
+            'confidence_interval': list(self.confidence_interval),
+            'winner': self.winner,
+            'recommendation': self.recommendation
+        }
+
+class ABTestingFramework:
+    """A/B testing framework for ML models"""
+
+    def __init__(self, repository_name: str = 'ml-experiments'):
+        self.model_registry = ModelRegistry(repository_name)
+        self.repo = self.model_registry.repo
+        self.logger = logging.getLogger(__name__)
+
+    def create_ab_test(self, config: ABTestConfig) -> str:
+        """Create a new A/B test"""
+
+        # Validate models exist
+        try:
+            model_a = self.model_registry.get_model(config.model_a_name, config.model_a_version)
+            model_b = self.model_registry.get_model(config.model_b_name, config.model_b_version)
+        except Exception as e:
+            raise ValueError(f"Could not find specified models: {e}")
+
+        # Create test branch
+        test_id = str(uuid.uuid4())
+        test_branch_name = f"ab-test/{config.test_name}/{test_id[:8]}"
+
+        try:
+            test_branch = self.repo.branches.create(test_branch_name, source_reference='production')
+        except Exception as e:
+            raise ValueError(f"Could not create test branch: {e}")
+
+        # Save test configuration
+        test_config_path = 
f"ab_tests/{config.test_name}/config.json" + test_branch.objects.upload( + path=test_config_path, + data=json.dumps(config.to_dict(), indent=2).encode(), + content_type='application/json' + ) + + # Create test metadata + test_metadata = { + 'test_id': test_id, + 'test_name': config.test_name, + 'status': 'created', + 'created_at': datetime.now().isoformat(), + 'branch_name': test_branch_name, + 'config': config.to_dict() + } + + test_metadata_path = f"ab_tests/{config.test_name}/metadata.json" + test_branch.objects.upload( + path=test_metadata_path, + data=json.dumps(test_metadata, indent=2).encode(), + content_type='application/json' + ) + + # Commit test creation + commit = test_branch.commits.create( + message=f"Create A/B test: {config.test_name}", + metadata={ + 'test_name': config.test_name, + 'test_id': test_id, + 'model_a': f"{config.model_a_name}:{config.model_a_version}", + 'model_b': f"{config.model_b_name}:{config.model_b_version}", + 'action': 'create_ab_test' + } + ) + + self.logger.info(f"Created A/B test: {config.test_name} (ID: {test_id})") + + return test_id + + def simulate_ab_test( + self, + test_name: str, + test_dataset: pd.DataFrame, + target_column: str, + feature_columns: List[str] + ) -> ABTestResult: + """Simulate an A/B test using historical data""" + + # Load test configuration + try: + production_branch = self.repo.branches.get('production') + config_obj = production_branch.objects.get(f"ab_tests/{test_name}/config.json") + config_data = json.loads(config_obj.reader().read().decode()) + config = ABTestConfig(**config_data) + except Exception as e: + raise ValueError(f"Could not load test configuration: {e}") + + # Load models + model_a, reg_model_a = self.model_registry.load_model(config.model_a_name, config.model_a_version) + model_b, reg_model_b = self.model_registry.load_model(config.model_b_name, config.model_b_version) + + # Prepare data + X = test_dataset[feature_columns] + y_true = test_dataset[target_column] + + # Split data according to traffic split + n_samples = len(test_dataset) + n_b = int(n_samples * config.traffic_split) + n_a = n_samples - n_b + + # Random assignment (in practice, this would be based on user ID hash) + np.random.seed(42) # For reproducibility + assignment = np.random.choice(['A', 'B'], size=n_samples, p=[1-config.traffic_split, config.traffic_split]) + + # Get predictions + y_pred_a = model_a.predict(X) + y_pred_b = model_b.predict(X) + + # Calculate metrics for each group + a_indices = assignment == 'A' + b_indices = assignment == 'B' + + metrics_a = self._calculate_metrics(y_true[a_indices], y_pred_a[a_indices]) + metrics_b = self._calculate_metrics(y_true[b_indices], y_pred_b[b_indices]) + + # Statistical significance test + success_metric = config.success_metric + + if success_metric in metrics_a and success_metric in metrics_b: + # For accuracy/precision/recall, we can use proportion test + if success_metric in ['accuracy', 'precision', 'recall', 'f1_score']: + # Convert to success/failure counts + successes_a = int(metrics_a[success_metric] * sum(a_indices)) + successes_b = int(metrics_b[success_metric] * sum(b_indices)) + + # Two-proportion z-test + stat, p_value = self._two_proportion_test( + successes_a, sum(a_indices), + successes_b, sum(b_indices) + ) + else: + # For continuous metrics, use t-test + # This is simplified - in practice you'd need the raw values + stat, p_value = stats.ttest_ind( + np.random.normal(metrics_a[success_metric], 0.1, sum(a_indices)), + np.random.normal(metrics_b[success_metric], 0.1, 
sum(b_indices)) + ) + else: + p_value = 1.0 + stat = 0.0 + + # Determine winner + is_significant = p_value < config.significance_level + + if is_significant: + if metrics_b[success_metric] > metrics_a[success_metric]: + winner = 'model_b' + recommendation = f"Model B ({config.model_b_name}) performs significantly better" + else: + winner = 'model_a' + recommendation = f"Model A ({config.model_a_name}) performs significantly better" + else: + winner = 'inconclusive' + recommendation = "No statistically significant difference found" + + # Calculate confidence interval (simplified) + diff = metrics_b[success_metric] - metrics_a[success_metric] + se = np.sqrt(metrics_a[success_metric] * (1 - metrics_a[success_metric]) / sum(a_indices) + + metrics_b[success_metric] * (1 - metrics_b[success_metric]) / sum(b_indices)) + ci_lower = diff - 1.96 * se + ci_upper = diff + 1.96 * se + + # Create result + result = ABTestResult( + test_name=test_name, + start_date=datetime.now() - timedelta(days=config.test_duration_days), + end_date=datetime.now(), + model_a_metrics=metrics_a, + model_b_metrics=metrics_b, + sample_size_a=sum(a_indices), + sample_size_b=sum(b_indices), + statistical_significance=is_significant, + p_value=p_value, + confidence_interval=(ci_lower, ci_upper), + winner=winner, + recommendation=recommendation + ) + + # Save test results + self._save_test_results(test_name, result) + + return result + + def _calculate_metrics(self, y_true, y_pred) -> Dict[str, float]: + """Calculate standard ML metrics""" + from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score + + try: + metrics = { + 'accuracy': accuracy_score(y_true, y_pred), + 'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0), + 'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0), + 'f1_score': f1_score(y_true, y_pred, average='weighted', zero_division=0) + } + except Exception as e: + self.logger.warning(f"Error calculating metrics: {e}") + metrics = {'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0} + + return metrics + + def _two_proportion_test(self, x1, n1, x2, n2): + """Two-proportion z-test""" + p1 = x1 / n1 + p2 = x2 / n2 + p_pool = (x1 + x2) / (n1 + n2) + + se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2)) + z = (p2 - p1) / se + p_value = 2 * (1 - stats.norm.cdf(abs(z))) + + return z, p_value + + def _save_test_results(self, test_name: str, result: ABTestResult): + """Save A/B test results""" + + try: + production_branch = self.repo.branches.get('production') + + results_path = f"ab_tests/{test_name}/results.json" + production_branch.objects.upload( + path=results_path, + data=json.dumps(result.to_dict(), indent=2).encode(), + content_type='application/json' + ) + + # Update test metadata + metadata_obj = production_branch.objects.get(f"ab_tests/{test_name}/metadata.json") + metadata = json.loads(metadata_obj.reader().read().decode()) + metadata['status'] = 'completed' + metadata['completed_at'] = datetime.now().isoformat() + metadata['winner'] = result.winner + + production_branch.objects.upload( + path=f"ab_tests/{test_name}/metadata.json", + data=json.dumps(metadata, indent=2).encode(), + content_type='application/json' + ) + + # Commit results + commit = production_branch.commits.create( + message=f"A/B test results: {test_name}", + metadata={ + 'test_name': test_name, + 'winner': result.winner, + 'p_value': str(result.p_value), + 'action': 'ab_test_results' + } + ) + + self.logger.info(f"Saved A/B test results: 
{test_name}") + + except Exception as e: + self.logger.error(f"Error saving test results: {e}") + + def list_ab_tests(self) -> List[Dict[str, Any]]: + """List all A/B tests""" + + tests = [] + + try: + production_branch = self.repo.branches.get('production') + + # List all test directories + test_objects = production_branch.objects.list(prefix='ab_tests/') + + test_names = set() + for obj in test_objects: + path_parts = obj.path.split('/') + if len(path_parts) >= 2: + test_names.add(path_parts[1]) + + # Load metadata for each test + for test_name in test_names: + try: + metadata_obj = production_branch.objects.get(f'ab_tests/{test_name}/metadata.json') + metadata = json.loads(metadata_obj.reader().read().decode()) + tests.append(metadata) + except Exception as e: + self.logger.warning(f"Could not load metadata for test {test_name}: {e}") + + except Exception as e: + self.logger.error(f"Error listing A/B tests: {e}") + + return tests + + def get_test_results(self, test_name: str) -> Optional[ABTestResult]: + """Get results for a specific A/B test""" + + try: + production_branch = self.repo.branches.get('production') + results_obj = production_branch.objects.get(f'ab_tests/{test_name}/results.json') + results_data = json.loads(results_obj.reader().read().decode()) + + # Convert datetime strings back to datetime objects + results_data['start_date'] = datetime.fromisoformat(results_data['start_date']) + results_data['end_date'] = datetime.fromisoformat(results_data['end_date']) + results_data['confidence_interval'] = tuple(results_data['confidence_interval']) + + return ABTestResult(**results_data) + + except Exception as e: + self.logger.error(f"Error loading test results: {e}") + return None + +# Example usage and complete A/B testing workflow +def run_ab_testing_example(): + """Demonstrate A/B testing workflow""" + + logging.basicConfig(level=logging.INFO) + + # Initialize A/B testing framework + ab_framework = ABTestingFramework() + + # Create A/B test configuration + test_config = ABTestConfig( + test_name="churn_model_comparison", + description="Compare Random Forest vs XGBoost for customer churn prediction", + model_a_name="customer_churn_predictor", + model_a_version="v20240115_120000", # Assuming these models exist + model_b_name="customer_churn_xgboost", + model_b_version="v20240115_130000", + traffic_split=0.5, + success_metric='accuracy', + minimum_sample_size=1000, + significance_level=0.05, + test_duration_days=7 + ) + + # Create A/B test + try: + test_id = ab_framework.create_ab_test(test_config) + print(f"Created A/B test: {test_id}") + except Exception as e: + print(f"Could not create A/B test: {e}") + return + + # Load test dataset (in practice, this would be production data) + try: + test_dataset = ab_framework.model_registry.fs + # For demo, we'll create synthetic data + np.random.seed(42) + n_samples = 2000 + + test_data = pd.DataFrame({ + 'feature_0': np.random.randn(n_samples), + 'feature_1': np.random.randn(n_samples), + 'feature_2': np.random.randn(n_samples), + 'target': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]) + }) + + feature_columns = ['feature_0', 'feature_1', 'feature_2'] + + # Simulate A/B test + # result = ab_framework.simulate_ab_test( + # test_name="churn_model_comparison", + # test_dataset=test_data, + # target_column='target', + # feature_columns=feature_columns + # ) + + # print(f"A/B Test Results:") + # print(f"Winner: {result.winner}") + # print(f"Statistical Significance: {result.statistical_significance}") + # print(f"P-value: 
{result.p_value:.4f}") + # print(f"Model A Accuracy: {result.model_a_metrics['accuracy']:.4f}") + # print(f"Model B Accuracy: {result.model_b_metrics['accuracy']:.4f}") + # print(f"Recommendation: {result.recommendation}") + + except Exception as e: + print(f"Could not run A/B test simulation: {e}") + + # List all A/B tests + all_tests = ab_framework.list_ab_tests() + print(f"\nAll A/B Tests ({len(all_tests)}):") + for test in all_tests: + print(f"- {test['test_name']} (Status: {test.get('status', 'unknown')})") + +if __name__ == "__main__": + run_ab_testing_example() +``` + +## Step 7: Production MLOps Pipeline + +### Complete MLOps Workflow + +```python +# src/mlops_pipeline.py +import lakefs +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +import logging +import json +import schedule +import time +from dataclasses import dataclass +import smtplib +from email.mime.text import MIMEText + +from src.experiment_tracker import MLExperimentTracker +from src.model_registry import ModelRegistry, ModelStage +from src.ab_testing import ABTestingFramework +from src.ml_pipeline import MLPipeline + +logger = logging.getLogger(__name__) + +@dataclass +class MLOpsConfig: + """Configuration for MLOps pipeline""" + model_name: str + dataset_path: str + target_column: str + feature_columns: List[str] + model_type: str + hyperparameters: Dict[str, Any] + retraining_schedule: str # cron-like schedule + performance_threshold: float + data_drift_threshold: float + alert_recipients: List[str] + auto_deploy: bool = False + +class MLOpsPipeline: + """Complete MLOps pipeline with automated retraining and deployment""" + + def __init__(self, repository_name: str = 'ml-experiments'): + self.ml_pipeline = MLPipeline(repository_name) + self.model_registry = ModelRegistry(repository_name) + self.ab_framework = ABTestingFramework(repository_name) + self.repo = self.ml_pipeline.repo + self.logger = logging.getLogger(__name__) + + # Pipeline state + self.configs: Dict[str, MLOpsConfig] = {} + self.monitoring_data: Dict[str, List[Dict[str, Any]]] = {} + + def register_model_pipeline(self, config: MLOpsConfig): + """Register a model for automated MLOps pipeline""" + + self.configs[config.model_name] = config + self.monitoring_data[config.model_name] = [] + + # Schedule retraining + if config.retraining_schedule: + schedule.every().day.at(config.retraining_schedule).do( + self._retrain_model, config.model_name + ) + + self.logger.info(f"Registered MLOps pipeline for model: {config.model_name}") + + def monitor_model_performance(self, model_name: str, predictions: pd.DataFrame, actuals: pd.DataFrame): + """Monitor model performance in production""" + + if model_name not in self.configs: + raise ValueError(f"Model {model_name} not registered in MLOps pipeline") + + config = self.configs[model_name] + + # Calculate performance metrics + from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score + + metrics = { + 'timestamp': datetime.now().isoformat(), + 'accuracy': accuracy_score(actuals, predictions), + 'precision': precision_score(actuals, predictions, average='weighted', zero_division=0), + 'recall': recall_score(actuals, predictions, average='weighted', zero_division=0), + 'f1_score': f1_score(actuals, predictions, average='weighted', zero_division=0), + 'sample_size': len(predictions) + } + + # Store monitoring data + self.monitoring_data[model_name].append(metrics) + + # Check for performance degradation + if 
metrics['accuracy'] < config.performance_threshold: + self._trigger_performance_alert(model_name, metrics) + + if config.auto_deploy: + self._trigger_retraining(model_name) + + # Save monitoring data + self._save_monitoring_data(model_name) + + self.logger.info(f"Monitored performance for {model_name}: accuracy={metrics['accuracy']:.4f}") + + def detect_data_drift(self, model_name: str, new_data: pd.DataFrame) -> bool: + """Detect data drift in production data""" + + if model_name not in self.configs: + raise ValueError(f"Model {model_name} not registered in MLOps pipeline") + + config = self.configs[model_name] + + try: + # Load training data for comparison + training_data = self.ml_pipeline.load_dataset(config.dataset_path) + training_features = training_data[config.feature_columns] + new_features = new_data[config.feature_columns] + + # Simple drift detection using statistical tests + drift_detected = False + drift_scores = {} + + for column in config.feature_columns: + if column in training_features.columns and column in new_features.columns: + # Kolmogorov-Smirnov test for distribution comparison + from scipy.stats import ks_2samp + + statistic, p_value = ks_2samp( + training_features[column].dropna(), + new_features[column].dropna() + ) + + drift_scores[column] = { + 'ks_statistic': statistic, + 'p_value': p_value, + 'drift_detected': p_value < 0.05 # Significant difference + } + + if p_value < 0.05: + drift_detected = True + + # Log drift detection results + drift_result = { + 'timestamp': datetime.now().isoformat(), + 'model_name': model_name, + 'drift_detected': drift_detected, + 'drift_scores': drift_scores + } + + self._save_drift_detection_results(model_name, drift_result) + + if drift_detected: + self._trigger_drift_alert(model_name, drift_result) + + if config.auto_deploy: + self._trigger_retraining(model_name) + + return drift_detected + + except Exception as e: + self.logger.error(f"Error detecting data drift for {model_name}: {e}") + return False + + def _retrain_model(self, model_name: str): + """Retrain a model automatically""" + + if model_name not in self.configs: + self.logger.error(f"Model {model_name} not found in configurations") + return + + config = self.configs[model_name] + + try: + self.logger.info(f"Starting automatic retraining for {model_name}") + + # Run experiment with updated data + model, metrics = self.ml_pipeline.run_experiment( + experiment_name=f"{model_name}_retrain_{datetime.now().strftime('%Y%m%d_%H%M%S')}", + dataset_path=config.dataset_path, + target_column=config.target_column, + model_type=config.model_type, + hyperparameters=config.hyperparameters, + feature_columns=config.feature_columns, + notes=f"Automatic retraining triggered for {model_name}", + tags=['retrain', 'automated', model_name] + ) + + # Get the experiment run ID (this would need to be returned from run_experiment) + # For now, we'll simulate this + experiment_run_id = "simulated-run-id" + + # Register new model version + registered_model = self.model_registry.register_model( + model_name=f"{model_name}_retrained", + experiment_run_id=experiment_run_id, + description=f"Automatically retrained version of {model_name}", + tags=['retrained', 'automated'], + stage=ModelStage.STAGING, + created_by='mlops_pipeline' + ) + + # Compare with current production model + current_production_models = self.model_registry.list_models( + model_name=model_name, + stage=ModelStage.PRODUCTION + ) + + if current_production_models: + # Set up A/B test + from src.ab_testing import ABTestConfig + + 
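+                # Stage the retrained model as a canary: compare it against the current
+                # production model on a small traffic share before any promotion.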
ab_config = ABTestConfig( + test_name=f"{model_name}_retrain_test", + description=f"A/B test for retrained {model_name}", + model_a_name=current_production_models[0].model_name, + model_a_version=current_production_models[0].model_version, + model_b_name=registered_model.model_name, + model_b_version=registered_model.model_version, + traffic_split=0.1, # Start with 10% traffic to new model + success_metric='accuracy' + ) + + test_id = self.ab_framework.create_ab_test(ab_config) + + self.logger.info(f"Created A/B test for retrained model: {test_id}") + + # Send notification + self._send_notification( + subject=f"Model Retrained: {model_name}", + message=f""" + Model {model_name} has been automatically retrained. + + New model metrics: + {json.dumps(metrics, indent=2)} + + A/B test created: {test_id} + + The new model is staged for testing with 10% traffic. + """, + recipients=config.alert_recipients + ) + + except Exception as e: + self.logger.error(f"Error retraining model {model_name}: {e}") + self._send_notification( + subject=f"Model Retraining Failed: {model_name}", + message=f"Automatic retraining failed for {model_name}: {str(e)}", + recipients=config.alert_recipients + ) + + def _trigger_performance_alert(self, model_name: str, metrics: Dict[str, Any]): + """Trigger alert for performance degradation""" + + config = self.configs[model_name] + + message = f""" + Performance Alert: {model_name} + + Current accuracy: {metrics['accuracy']:.4f} + Threshold: {config.performance_threshold:.4f} + + Recent performance metrics: + {json.dumps(metrics, indent=2)} + + Consider retraining the model or investigating data quality issues. + """ + + self._send_notification( + subject=f"Performance Alert: {model_name}", + message=message, + recipients=config.alert_recipients + ) + + def _trigger_drift_alert(self, model_name: str, drift_result: Dict[str, Any]): + """Trigger alert for data drift""" + + config = self.configs[model_name] + + message = f""" + Data Drift Alert: {model_name} + + Data drift detected in production data. + + Drift detection results: + {json.dumps(drift_result['drift_scores'], indent=2)} + + Consider retraining the model with recent data. 
+ """ + + self._send_notification( + subject=f"Data Drift Alert: {model_name}", + message=message, + recipients=config.alert_recipients + ) + + def _trigger_retraining(self, model_name: str): + """Trigger model retraining""" + + self.logger.info(f"Triggering retraining for {model_name}") + self._retrain_model(model_name) + + def _save_monitoring_data(self, model_name: str): + """Save monitoring data to lakeFS""" + + try: + production_branch = self.repo.branches.get('production') + + monitoring_path = f"monitoring/{model_name}/performance_metrics.json" + monitoring_data = { + 'model_name': model_name, + 'metrics': self.monitoring_data[model_name], + 'last_updated': datetime.now().isoformat() + } + + production_branch.objects.upload( + path=monitoring_path, + data=json.dumps(monitoring_data, indent=2).encode(), + content_type='application/json' + ) + + except Exception as e: + self.logger.error(f"Error saving monitoring data: {e}") + + def _save_drift_detection_results(self, model_name: str, drift_result: Dict[str, Any]): + """Save drift detection results to lakeFS""" + + try: + production_branch = self.repo.branches.get('production') + + drift_path = f"monitoring/{model_name}/drift_detection.json" + + # Load existing drift data + try: + drift_obj = production_branch.objects.get(drift_path) + existing_data = json.loads(drift_obj.reader().read().decode()) + existing_data['results'].append(drift_result) + except: + existing_data = { + 'model_name': model_name, + 'results': [drift_result], + 'last_updated': datetime.now().isoformat() + } + + production_branch.objects.upload( + path=drift_path, + data=json.dumps(existing_data, indent=2).encode(), + content_type='application/json' + ) + + except Exception as e: + self.logger.error(f"Error saving drift detection results: {e}") + + def _send_notification(self, subject: str, message: str, recipients: List[str]): + """Send email notification""" + + try: + # This is a simplified email notification + # In practice, you'd configure SMTP settings + self.logger.info(f"NOTIFICATION: {subject}") + self.logger.info(f"Recipients: {', '.join(recipients)}") + self.logger.info(f"Message: {message}") + + # Actual email sending would go here + # msg = MIMEText(message) + # msg['Subject'] = subject + # msg['From'] = 'mlops@company.com' + # msg['To'] = ', '.join(recipients) + # + # with smtplib.SMTP('localhost') as server: + # server.send_message(msg) + + except Exception as e: + self.logger.error(f"Error sending notification: {e}") + + def run_monitoring_loop(self): + """Run the continuous monitoring loop""" + + self.logger.info("Starting MLOps monitoring loop") + + while True: + try: + # Run scheduled tasks + schedule.run_pending() + + # Sleep for a minute + time.sleep(60) + + except KeyboardInterrupt: + self.logger.info("Stopping MLOps monitoring loop") + break + except Exception as e: + self.logger.error(f"Error in monitoring loop: {e}") + time.sleep(60) # Continue after error + + def generate_mlops_report(self, model_name: str, days: int = 30) -> Dict[str, Any]: + """Generate MLOps report for a model""" + + if model_name not in self.configs: + raise ValueError(f"Model {model_name} not registered in MLOps pipeline") + + # Get monitoring data + recent_metrics = [] + cutoff_date = datetime.now() - timedelta(days=days) + + for metric in self.monitoring_data.get(model_name, []): + metric_date = datetime.fromisoformat(metric['timestamp']) + if metric_date >= cutoff_date: + recent_metrics.append(metric) + + # Calculate summary statistics + if recent_metrics: + 
accuracies = [m['accuracy'] for m in recent_metrics]
+            avg_accuracy = np.mean(accuracies)
+            min_accuracy = np.min(accuracies)
+            max_accuracy = np.max(accuracies)
+            accuracy_trend = 'improving' if accuracies[-1] > accuracies[0] else 'declining'
+        else:
+            avg_accuracy = min_accuracy = max_accuracy = 0.0
+            accuracy_trend = 'no_data'
+
+        # Get model information
+        config = self.configs[model_name]
+
+        report = {
+            'model_name': model_name,
+            'report_period_days': days,
+            'generated_at': datetime.now().isoformat(),
+            'config': config.__dict__,
+            'performance_summary': {
+                'avg_accuracy': avg_accuracy,
+                'min_accuracy': min_accuracy,
+                'max_accuracy': max_accuracy,
+                'accuracy_trend': accuracy_trend,
+                'total_predictions': sum(m['sample_size'] for m in recent_metrics),
+                'monitoring_points': len(recent_metrics)
+            },
+            'recent_metrics': recent_metrics[-10:], # Last 10 data points
+            'alerts_triggered': self._count_recent_alerts(model_name, days),
+            'retraining_history': self._get_retraining_history(model_name, days)
+        }
+
+        return report
+
+    def _count_recent_alerts(self, model_name: str, days: int) -> int:
+        """Count recent alerts for a model"""
+        # This would query alert logs in a real implementation
+        return 0
+
+    def _get_retraining_history(self, model_name: str, days: int) -> List[Dict[str, Any]]:
+        """Get retraining history for a model"""
+        # This would query retraining logs in a real implementation
+        return []
+
+# Example usage
+def setup_mlops_pipeline():
+    """Set up a complete MLOps pipeline"""
+
+    logging.basicConfig(level=logging.INFO)
+
+    # Initialize MLOps pipeline
+    mlops = MLOpsPipeline()
+
+    # Configure model pipeline
+    config = MLOpsConfig(
+        model_name="customer_churn_predictor",
+        dataset_path="datasets/raw/customer_churn_dataset.csv",
+        target_column="target",
+        feature_columns=['feature_0', 'feature_1', 'feature_2'],
+        model_type="random_forest_classifier",
+        hyperparameters={'n_estimators': 100, 'random_state': 42},
+        retraining_schedule="02:00", # 2 AM daily
+        performance_threshold=0.85,
+        data_drift_threshold=0.05,
+        alert_recipients=["ml-team@company.com", "ops-team@company.com"],
+        auto_deploy=False # Manual approval required
+    )
+
+    # Register model for MLOps
+    mlops.register_model_pipeline(config)
+
+    # Simulate monitoring data
+    np.random.seed(42)
+    for i in range(10):
+        # Simulate predictions and actuals
+        predictions = pd.Series(np.random.choice([0, 1], 100))
+        actuals = pd.Series(np.random.choice([0, 1], 100))
+
+        mlops.monitor_model_performance("customer_churn_predictor", predictions, actuals)
+
+    # Generate report
+    report = mlops.generate_mlops_report("customer_churn_predictor")
+
+    print("MLOps Report:")
+    print(f"Model: {report['model_name']}")
+    print(f"Average Accuracy: {report['performance_summary']['avg_accuracy']:.4f}")
+    print(f"Accuracy Trend: {report['performance_summary']['accuracy_trend']}")
+    print(f"Total Predictions: {report['performance_summary']['total_predictions']}")
+
+    return mlops
+
+if __name__ == "__main__":
+    mlops_pipeline = setup_mlops_pipeline()
+
+    # In production, you would run:
+    # mlops_pipeline.run_monitoring_loop()
+```
+
+## Best Practices and Production Considerations
+
+### ML Experiment Tracking Best Practices
+
+1. **Comprehensive Logging**
+   - Log all hyperparameters, even default values
+   - Track data versions and preprocessing steps
+   - Save model artifacts and metadata
+   - Record environment information (Python version, library versions)
+
+2. 
**Reproducibility** + - Set random seeds consistently + - Version control your code alongside experiments + - Document data sources and preprocessing steps + - Use containerization for consistent environments + +3. **Experiment Organization** + - Use meaningful experiment names and descriptions + - Tag experiments with relevant metadata + - Group related experiments together + - Maintain experiment lineage and relationships + +4. **Model Versioning** + - Version models with semantic versioning + - Track model lineage and experiment relationships + - Maintain model metadata and signatures + - Implement proper model lifecycle management + +5. **Performance Monitoring** + - Monitor model performance in production + - Set up alerts for performance degradation + - Track data drift and model decay + - Implement automated retraining pipelines + +### Troubleshooting Common Issues + +1. **Large Model Storage** +```python +# Use model compression for large models +import joblib + +# Save with compression +joblib.dump(model, 'model.joblib', compress=3) + +# Or use model-specific serialization +import pickle +with open('model.pkl', 'wb') as f: + pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL) +``` + +2. **Memory Issues with Large Datasets** +```python +# Process data in chunks +def train_model_incrementally(model, data_path, chunk_size=1000): + for chunk in pd.read_csv(data_path, chunksize=chunk_size): + # Partial fit for models that support it + if hasattr(model, 'partial_fit'): + model.partial_fit(chunk.drop('target', axis=1), chunk['target']) + else: + # Accumulate data and train in batches + pass +``` + +3. **Experiment Comparison Issues** +```python +# Ensure fair comparison +def compare_experiments_fairly(experiments): + # Use same data splits + # Use same evaluation metrics + # Account for randomness with multiple runs + # Use statistical significance tests + pass +``` + +### Integration with External Tools + +1. **MLflow Integration** +```python +import mlflow +import mlflow.sklearn + +# Log to both lakeFS and MLflow +def log_to_both_systems(model, metrics, artifacts): + # Log to lakeFS (our implementation) + tracker.log_model(model) + tracker.log_metrics(metrics) + + # Log to MLflow + with mlflow.start_run(): + mlflow.log_metrics(metrics) + mlflow.sklearn.log_model(model, "model") +``` + +2. **Weights & Biases Integration** +```python +import wandb + +# Initialize both tracking systems +wandb.init(project="ml-experiments") +tracker = MLExperimentTracker() + +# Log to both systems +def dual_logging(metrics, artifacts): + # Log to lakeFS + tracker.log_metrics(metrics) + + # Log to W&B + wandb.log(metrics) +``` + +3. **Kubernetes Deployment** +```yaml +# k8s-ml-pipeline.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: ml-experiment +spec: + template: + spec: + containers: + - name: ml-experiment + image: ml-experiment:latest + env: + - name: LAKEFS_HOST + value: "http://lakefs-service:8000" + - name: EXPERIMENT_NAME + value: "customer-churn-v2" + command: ["python", "run_experiment.py"] + restartPolicy: Never +``` + +## Next Steps + +### Advanced Topics to Explore + +1. **[Data Science Workflow](data-science-workflow.md)** - Interactive analysis patterns +2. **[ETL Pipeline Tutorial](etl-pipeline.md)** - Production data pipelines +3. **[Advanced Features](../high-level-sdk/advanced.md)** - Performance optimization +4. **[Best Practices](../reference/best-practices.md)** - Production deployment + +### Integration Opportunities + +1. 
**AutoML Integration** - Combine with AutoML tools for automated model selection +2. **Feature Stores** - Integrate with feature stores for consistent feature management +3. **Model Serving** - Deploy models with serving frameworks like Seldon or KServe +4. **CI/CD Integration** - Automated model testing and deployment pipelines + +### Advanced MLOps Patterns + +1. **Multi-Model Pipelines** - Manage ensembles and model chains +2. **Federated Learning** - Distributed model training across multiple data sources +3. **Model Interpretability** - Track and version model explanations +4. **Compliance and Governance** - Audit trails and regulatory compliance + +## Troubleshooting + +### Common Issues + +1. **Experiment Tracking Failures** +```python +# Implement retry logic for tracking operations +from tenacity import retry, stop_after_attempt, wait_exponential + +@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) +def robust_log_metric(key, value): + try: + tracker.log_metric(key, value) + except Exception as e: + logger.warning(f"Failed to log metric {key}: {e}") + raise +``` + +2. **Model Loading Issues** +```python +# Handle model loading errors gracefully +def safe_load_model(model_path): + try: + return joblib.load(model_path) + except Exception as e: + logger.error(f"Failed to load model from {model_path}: {e}") + # Try alternative loading methods + try: + import pickle + with open(model_path, 'rb') as f: + return pickle.load(f) + except Exception as e2: + logger.error(f"Alternative loading also failed: {e2}") + raise +``` + +3. **Branch Management Issues** +```python +# Clean up experiment branches periodically +def cleanup_old_experiment_branches(days_old=30): + cutoff_date = datetime.now() - timedelta(days=days_old) + + for branch in repo.branches.list(): + if branch.id.startswith('experiment/'): + # Check branch age and delete if old + pass +``` + +## See Also + +**Prerequisites and Setup:** +- **[Python SDK Overview](../index.md)** - Compare all Python SDK options +- **[Getting Started Guide](../getting-started.md)** - Installation and authentication setup +- **[High-Level SDK Quickstart](../high-level-sdk/quickstart.md)** - Basic operations + +**Related Tutorials:** +- **[Data Science Workflow](data-science-workflow.md)** - Interactive analysis patterns +- **[ETL Pipeline Tutorial](etl-pipeline.md)** - Production data pipeline patterns + +**Advanced Features:** +- **[Transaction Patterns](../high-level-sdk/transactions.md)** - Atomic operations +- **[lakefs-spec Integration](../lakefs-spec/integrations.md)** - Filesystem operations +- **[Best Practices](../reference/best-practices.md)** - Production deployment guidance + +**External Resources:** +- **[MLflow Documentation](https://mlflow.org/docs/latest/index.html){:target="_blank"}** - ML lifecycle management +- **[Weights & Biases](https://docs.wandb.ai/){:target="_blank"}** - Experiment tracking and visualization +- **[Kubeflow](https://www.kubeflow.org/docs/){:target="_blank"}** - ML workflows on Kubernetes +- **[DVC Documentation](https://dvc.org/doc){:target="_blank"}** - Data version control +- **[Great Expectations](https://greatexpectations.io/){:target="_blank"}** - Data validation and profiling \ No newline at end of file From a5cc800faed991139b3d026c5c9b946a8f21a68d Mon Sep 17 00:00:00 2001 From: Barak Amar Date: Mon, 21 Jul 2025 11:53:03 +0300 Subject: [PATCH 2/2] remove boto3 from sdk comparison --- docs/src/integrations/python/index.md | 68 ++----- 
.../python/reference/api-comparison.md | 184 +++++++----------- 2 files changed, 85 insertions(+), 167 deletions(-) diff --git a/docs/src/integrations/python/index.md b/docs/src/integrations/python/index.md index ca5996d4a06..9bcd335826f 100644 --- a/docs/src/integrations/python/index.md +++ b/docs/src/integrations/python/index.md @@ -1,7 +1,7 @@ --- title: Python Integration Overview description: Comprehensive guide to using Python with lakeFS - SDK comparison and getting started -sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +sdk_types: ["high-level", "generated", "lakefs-spec"] difficulty: "beginner" use_cases: ["general", "decision-making", "getting-started"] topics: ["overview", "comparison", "selection"] @@ -25,41 +25,36 @@ graph TD A[Your Application] --> B[High-Level SDK] A --> C[Generated SDK] A --> D[lakefs-spec] - A --> E[Boto3] B --> C C --> F[lakeFS API] D --> F - E --> G[lakeFS S3 Gateway] - G --> F style B fill:#e1f5fe style C fill:#f3e5f5 style D fill:#e8f5e8 - style E fill:#fff3e0 ``` - **High-Level SDK** is built on top of the **Generated SDK**, providing simplified interfaces while maintaining access to the underlying client - **Generated SDK** provides direct access to all lakeFS API endpoints based on the OpenAPI specification - **lakefs-spec** offers a filesystem-like interface compatible with the fsspec ecosystem -- **Boto3** integrates through lakeFS's S3-compatible gateway ## Comprehensive SDK Comparison -| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | -|---------|----------------|---------------|-------------|-------| -| **Installation** | `pip install lakefs` | `pip install lakefs-sdk` | `pip install lakefs-spec` | `pip install boto3` | -| **API Style** | Object-oriented, simplified | Direct API mapping | Filesystem-like | S3-compatible | -| **Learning Curve** | Easy | Moderate | Easy | Easy (if familiar with S3) | -| **Repository Management** | ✅ Full support | ✅ Full support | ❌ Not supported | ❌ Not supported | -| **Branch Operations** | ✅ Simplified interface | ✅ Full API access | ❌ Limited | ❌ Not supported | -| **Object Operations** | ✅ Streaming I/O | ✅ Manual handling | ✅ File-like operations | ✅ S3-style operations | -| **Transactions** | ✅ Built-in support | ⚠️ Manual implementation | ✅ Context managers | ❌ Not supported | -| **Data Science Integration** | ⚠️ Via file-like objects | ❌ Manual integration | ✅ Native pandas/dask support | ⚠️ Via S3 compatibility | -| **Async Support** | ❌ Sync only | ⚠️ Limited | ❌ Sync only | ⚠️ Via aioboto3 | -| **Error Handling** | ✅ Pythonic exceptions | ✅ API-level exceptions | ✅ Filesystem exceptions | ✅ Boto3 exceptions | -| **Performance** | Good | Best (direct API) | Good | Good | -| **Maintenance** | lakeFS team | Auto-generated | Third-party | AWS/Community | +| Feature | High-Level SDK | Generated SDK | lakefs-spec | +|---------|----------------|---------------|-------------| +| **Installation** | `pip install lakefs` | `pip install lakefs-sdk` | `pip install lakefs-spec` | +| **API Style** | Object-oriented, simplified | Direct API mapping | Filesystem-like | +| **Learning Curve** | Easy | Moderate | Easy | +| **Repository Management** | ✅ Full support | ✅ Full support | ❌ Not supported | +| **Branch Operations** | ✅ Simplified interface | ✅ Full API access | ❌ Limited | +| **Object Operations** | ✅ Streaming I/O | ✅ Manual handling | ✅ File-like operations | +| **Transactions** | ✅ Built-in support | ⚠️ Manual implementation | ✅ Context managers | +| **Data Science 
Integration** | ⚠️ Via file-like objects | ❌ Manual integration | ✅ Native pandas/dask support | +| **Async Support** | ❌ Sync only | ⚠️ Limited | ❌ Sync only | +| **Error Handling** | ✅ Pythonic exceptions | ✅ API-level exceptions | ✅ Filesystem exceptions | +| **Performance** | Good | Best (direct API) | Good | +| **Maintenance** | lakeFS team | Auto-generated | Third-party | ### SDK Strengths and Use Cases @@ -123,25 +118,7 @@ graph TD - Data exploration in notebooks - Integration with existing data science stacks -#### Boto3 -**Strengths:** -- Familiar S3-compatible interface -- Minimal code changes from existing S3 workflows -- Extensive documentation and community support -- Integration with AWS ecosystem tools -- Support for multipart uploads and presigned URLs - -**Best for:** -- Migrating existing S3-based applications -- Teams familiar with AWS S3 -- Applications using S3-compatible tools -- Hybrid S3/lakeFS deployments -**Example use cases:** -- S3 application migration -- Backup and archival workflows -- Integration with S3-compatible tools -- Gradual lakeFS adoption ## SDK Selection Decision Matrix @@ -164,10 +141,7 @@ Use this decision tree to choose the right SDK for your needs: - **Need full API control?** → [Generated SDK](generated-sdk/) - **Integrating with existing systems?** → [Generated SDK](generated-sdk/) -#### 🔄 Migration from S3 -- **Existing S3 codebase?** → [Boto3](boto3/) -- **Using S3-compatible tools?** → [Boto3](boto3/) -- **Gradual migration strategy?** → [Boto3](boto3/) + [Boto S3 Router](boto3/s3-router.md) + ### 🎯 Feature-Based Selection @@ -176,7 +150,6 @@ Use this decision tree to choose the right SDK for your needs: | **Simplest API** | High-Level SDK | Pythonic, intuitive interface | | **Complete API access** | Generated SDK | All endpoints available | | **Pandas integration** | lakefs-spec | Native fsspec support | -| **S3 compatibility** | Boto3 | Familiar S3 interface | | **Transaction support** | High-Level SDK or lakefs-spec | Built-in context managers | | **Streaming large files** | High-Level SDK | Optimized I/O operations | | **Custom tooling** | Generated SDK | Full control and flexibility | @@ -195,9 +168,9 @@ Use this decision tree to choose the right SDK for your needs: - Check [best practices](reference/best-practices.md) for optimization #### Migrating from S3 -1. Review [Boto3 configuration](boto3/configuration.md) -2. Consider [Boto S3 Router](boto3/s3-router.md) for hybrid setups -3. Plan gradual migration with [migration guide](boto3/migration-guide.md) +1. Review [S3 Gateway documentation](../../understand/architecture.md#s3-gateway) for S3-compatible access +2. Consider gradual migration strategies +3. 
Plan integration with existing S3-based workflows ## Quick Start @@ -211,7 +184,7 @@ Use this decision tree to choose the right SDK for your needs: - **[High-Level SDK](high-level-sdk/)** - Comprehensive SDK documentation - **[Generated SDK](generated-sdk/)** - Direct API access patterns - **[lakefs-spec](lakefs-spec/)** - Filesystem API and data science integrations -- **[Boto3](boto3/)** - S3-compatible operations + - **[Tutorials](tutorials/)** - Real-world examples and workflows - **[Reference](reference/)** - API comparison, best practices, and troubleshooting @@ -231,7 +204,6 @@ Use this decision tree to choose the right SDK for your needs: - [High-Level SDK Overview](high-level-sdk/index.md) - Simplified Python interface - [Generated SDK Overview](generated-sdk/index.md) - Direct API access - [lakefs-spec Overview](lakefs-spec/index.md) - Filesystem operations -- [Boto3 Integration](boto3/index.md) - S3-compatible interface **Learning Resources:** - [Real-World Tutorials](tutorials/index.md) - End-to-end examples and workflows diff --git a/docs/src/integrations/python/reference/api-comparison.md b/docs/src/integrations/python/reference/api-comparison.md index 08bf2b3e6a8..5abfac8bd49 100644 --- a/docs/src/integrations/python/reference/api-comparison.md +++ b/docs/src/integrations/python/reference/api-comparison.md @@ -1,7 +1,7 @@ --- title: API Comparison description: Comprehensive feature comparison across all Python SDK options -sdk_types: ["high-level", "generated", "lakefs-spec", "boto3"] +sdk_types: ["high-level", "generated", "lakefs-spec"] difficulty: "intermediate" use_cases: ["general", "decision-making"] --- @@ -16,7 +16,6 @@ This comprehensive comparison helps you choose the right Python SDK for your spe |----------|----------------|-------------| | **Data Science & Analytics** | lakefs-spec | High-Level SDK | | **Production ETL Pipelines** | High-Level SDK | Generated SDK | -| **Existing S3 Workflows** | Boto3 | High-Level SDK | | **Custom API Operations** | Generated SDK | High-Level SDK | | **Jupyter Notebooks** | lakefs-spec | High-Level SDK | | **ML Experiment Tracking** | High-Level SDK | lakefs-spec | @@ -27,93 +26,89 @@ This comprehensive comparison helps you choose the right Python SDK for your spe ### Core Repository Operations -| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | -|---------|----------------|---------------|-------------|-------| +| Feature | High-Level SDK | Generated SDK | lakefs-spec | +|---------|----------------|---------------|-------------| | **Repository Management** | -| Create Repository | ✅ Full | ✅ Full | ❌ None | ❌ None | -| Delete Repository | ✅ Full | ✅ Full | ❌ None | ❌ None | -| List Repositories | ✅ Full | ✅ Full | ❌ None | ❌ None | -| Repository Metadata | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Create Repository | ✅ Full | ✅ Full | ❌ None | +| Delete Repository | ✅ Full | ✅ Full | ❌ None | +| List Repositories | ✅ Full | ✅ Full | ❌ None | +| Repository Metadata | ✅ Full | ✅ Full | ❌ None | | **Branch Operations** | -| Create Branch | ✅ Full | ✅ Full | ✅ Limited | ❌ None | -| Delete Branch | ✅ Full | ✅ Full | ✅ Limited | ❌ None | -| List Branches | ✅ Full | ✅ Full | ✅ Limited | ❌ None | -| Branch Protection | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Create Branch | ✅ Full | ✅ Full | ✅ Limited | +| Delete Branch | ✅ Full | ✅ Full | ✅ Limited | +| List Branches | ✅ Full | ✅ Full | ✅ Limited | +| Branch Protection | ✅ Full | ✅ Full | ❌ None | | **Commit Operations** | -| Create Commit | ✅ Full | ✅ Full | ✅ Full | ❌ 
None | -| List Commits | ✅ Full | ✅ Full | ✅ Limited | ❌ None | -| Commit Metadata | ✅ Full | ✅ Full | ✅ Limited | ❌ None | -| Cherry Pick | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Create Commit | ✅ Full | ✅ Full | ✅ Full | +| List Commits | ✅ Full | ✅ Full | ✅ Limited | +| Commit Metadata | ✅ Full | ✅ Full | ✅ Limited | +| Cherry Pick | ✅ Full | ✅ Full | ❌ None | ### Object Operations -| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | -|---------|----------------|---------------|-------------|-------| +| Feature | High-Level SDK | Generated SDK | lakefs-spec | +|---------|----------------|---------------|-------------| | **Basic Operations** | -| Upload Object | ✅ Full | ✅ Full | ✅ Full | ✅ Full | -| Download Object | ✅ Full | ✅ Full | ✅ Full | ✅ Full | -| Delete Object | ✅ Full | ✅ Full | ✅ Full | ✅ Full | -| List Objects | ✅ Full | ✅ Full | ✅ Full | ✅ Full | +| Upload Object | ✅ Full | ✅ Full | ✅ Full | +| Download Object | ✅ Full | ✅ Full | ✅ Full | +| Delete Object | ✅ Full | ✅ Full | ✅ Full | +| List Objects | ✅ Full | ✅ Full | ✅ Full | | **Advanced Operations** | -| Streaming I/O | ✅ Full | 🔶 Manual | ✅ Full | ✅ Full | -| Batch Operations | ✅ Full | 🔶 Manual | ✅ Full | ✅ Full | -| Object Metadata | ✅ Full | ✅ Full | ✅ Full | ✅ Full | -| Presigned URLs | ✅ Full | ✅ Full | ❌ None | ✅ Full | -| Multipart Upload | ✅ Full | ✅ Full | ✅ Full | ✅ Full | +| Streaming I/O | ✅ Full | 🔶 Manual | ✅ Full | +| Batch Operations | ✅ Full | 🔶 Manual | ✅ Full | +| Object Metadata | ✅ Full | ✅ Full | ✅ Full | +| Presigned URLs | ✅ Full | ✅ Full | ❌ None | +| Multipart Upload | ✅ Full | ✅ Full | ✅ Full | ### Data Management Features -| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | -|---------|----------------|---------------|-------------|-------| +| Feature | High-Level SDK | Generated SDK | lakefs-spec | +|---------|----------------|---------------|-------------| | **Transactions** | -| Atomic Operations | ✅ Full | 🔶 Manual | ✅ Full | ❌ None | -| Rollback Support | ✅ Full | 🔶 Manual | ✅ Full | ❌ None | -| Context Managers | ✅ Full | ❌ None | ✅ Full | ❌ None | +| Atomic Operations | ✅ Full | 🔶 Manual | ✅ Full | +| Rollback Support | ✅ Full | 🔶 Manual | ✅ Full | +| Context Managers | ✅ Full | ❌ None | ✅ Full | | **Import/Export** | -| Data Import | ✅ Full | ✅ Full | ❌ None | ❌ None | -| Import Status | ✅ Full | ✅ Full | ❌ None | ❌ None | -| Export Operations | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Data Import | ✅ Full | ✅ Full | ❌ None | +| Import Status | ✅ Full | ✅ Full | ❌ None | +| Export Operations | ✅ Full | ✅ Full | ❌ None | | **Merge Operations** | -| Branch Merging | ✅ Full | ✅ Full | ❌ None | ❌ None | -| Conflict Resolution | ✅ Full | ✅ Full | ❌ None | ❌ None | -| Merge Strategies | ✅ Full | ✅ Full | ❌ None | ❌ None | +| Branch Merging | ✅ Full | ✅ Full | ❌ None | +| Conflict Resolution | ✅ Full | ✅ Full | ❌ None | +| Merge Strategies | ✅ Full | ✅ Full | ❌ None | ### Integration Capabilities -| Feature | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | -|---------|----------------|---------------|-------------|-------| +| Feature | High-Level SDK | Generated SDK | lakefs-spec | +|---------|----------------|---------------|-------------| | **Data Science Libraries** | -| Pandas Integration | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Manual | -| Dask Integration | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Manual | -| PyArrow Integration | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Manual | +| Pandas Integration | ✅ Full | 🔶 Manual | ✅ Native | +| Dask Integration | ✅ Full | 🔶 
Manual | ✅ Native | +| PyArrow Integration | ✅ Full | 🔶 Manual | ✅ Native | | **File System Interface** | -| fsspec Compatibility | 🔶 Limited | ❌ None | ✅ Native | 🔶 Limited | -| Path-like Operations | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Limited | -| Glob Patterns | ✅ Full | 🔶 Manual | ✅ Native | 🔶 Limited | -| **S3 Compatibility** | -| S3 API Compatibility | ❌ None | ❌ None | ❌ None | ✅ Full | -| Existing S3 Code | ❌ None | ❌ None | ❌ None | ✅ Full | -| S3 Tools Integration | ❌ None | ❌ None | ❌ None | ✅ Full | +| fsspec Compatibility | 🔶 Limited | ❌ None | ✅ Native | +| Path-like Operations | ✅ Full | 🔶 Manual | ✅ Native | +| Glob Patterns | ✅ Full | 🔶 Manual | ✅ Native | ## Performance Characteristics ### Throughput Comparison -| Operation Type | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | -|----------------|----------------|---------------|-------------|-------| +| Operation Type | High-Level SDK | Generated SDK | lakefs-spec | +|----------------|----------------|---------------|-------------| | **Small Files (< 1MB)** | -| Single Upload | Good | Good | Excellent | Good | -| Batch Upload | Excellent | Good | Excellent | Good | -| Single Download | Good | Good | Excellent | Good | -| Batch Download | Excellent | Good | Excellent | Good | +| Single Upload | Good | Good | Excellent | +| Batch Upload | Excellent | Good | Excellent | +| Single Download | Good | Good | Excellent | +| Batch Download | Excellent | Good | Excellent | | **Large Files (> 100MB)** | -| Streaming Upload | Excellent | Good | Excellent | Excellent | -| Streaming Download | Excellent | Good | Excellent | Excellent | -| Multipart Upload | Excellent | Good | Excellent | Excellent | +| Streaming Upload | Excellent | Good | Excellent | +| Streaming Download | Excellent | Good | Excellent | +| Multipart Upload | Excellent | Good | Excellent | | **Metadata Operations** | -| List Objects | Good | Good | Excellent | Good | -| Object Stats | Good | Good | Excellent | Good | -| Branch Operations | Excellent | Good | Good | N/A | +| List Objects | Good | Good | Excellent | +| Object Stats | Good | Good | Excellent | +| Branch Operations | Excellent | Good | Good | ### Memory Usage @@ -122,17 +117,16 @@ This comprehensive comparison helps you choose the right Python SDK for your spe | **High-Level SDK** | Good | Optimized for common patterns, connection pooling | | **Generated SDK** | Fair | Direct API access, manual optimization needed | | **lakefs-spec** | Excellent | Designed for large datasets, streaming-first | -| **Boto3** | Good | Mature S3 optimizations, configurable buffering | ### Latency Characteristics -| Operation | High-Level SDK | Generated SDK | lakefs-spec | Boto3 | -|-----------|----------------|---------------|-------------|-------| -| **Connection Setup** | Fast | Fast | Fast | Fast | -| **Authentication** | Fast | Fast | Fast | Fast | -| **First Request** | Medium | Medium | Fast | Medium | -| **Subsequent Requests** | Fast | Fast | Fast | Fast | -| **Batch Operations** | Fast | Medium | Fast | Fast | +| Operation | High-Level SDK | Generated SDK | lakefs-spec | +|-----------|----------------|---------------|-------------| +| **Connection Setup** | Fast | Fast | Fast | +| **Authentication** | Fast | Fast | Fast | +| **First Request** | Medium | Medium | Fast | +| **Subsequent Requests** | Fast | Fast | Fast | +| **Batch Operations** | Fast | Medium | Fast | ## Trade-offs Analysis @@ -199,26 +193,7 @@ This comprehensive comparison helps you choose the right Python SDK for your spe - Integration with 
existing fsspec-based tools - Teams familiar with filesystem interfaces -### Boto3 -**Strengths:** -- Full S3 API compatibility -- Seamless migration from existing S3 workflows -- Mature ecosystem and tooling support -- Excellent performance for object operations -- Familiar interface for AWS users - -**Weaknesses:** -- No access to lakeFS-specific features (branches, commits, etc.) -- Limited to object operations only -- Requires S3 Gateway configuration -- No transaction support - -**Best For:** -- Migrating existing S3-based applications -- Teams with strong AWS/S3 expertise -- Applications requiring S3 tool compatibility -- Simple object storage use cases ## Decision Guidelines @@ -286,34 +261,10 @@ processed_df = df.groupby("category").sum() processed_df.to_parquet("lakefs://repo/branch/results/summary.parquet") ``` -### Choose Boto3 When: -- Migrating existing S3-based applications -- Need S3 tool compatibility -- Simple object storage requirements -- Team has strong AWS expertise -- Using S3-compatible tools and libraries - -```python -# Example: S3-compatible operations -import boto3 - -s3 = boto3.client('s3', endpoint_url='http://localhost:8000') -s3.put_object( - Bucket='repo', - Key='branch/path/to/file.txt', - Body=data -) -``` ## Migration Paths -### From S3 to lakeFS - -1. **Start with Boto3**: Minimal code changes, immediate compatibility -2. **Add lakefs-spec**: For data science workflows requiring filesystem interface -3. **Upgrade to High-Level SDK**: For advanced lakeFS features and better integration - ### From File Systems to lakeFS 1. **Start with lakefs-spec**: Familiar filesystem interface @@ -341,15 +292,12 @@ s3.put_object( - [Generated SDK Examples](../generated-sdk/examples.md) - Common usage patterns - [lakefs-spec Overview](../lakefs-spec/index.md) - Filesystem interface documentation - [lakefs-spec Integrations](../lakefs-spec/integrations.md) - Data science library examples -- [Boto3 Integration](../boto3/index.md) - S3-compatible operations -- [Boto3 Configuration](../boto3/configuration.md) - Setup and authentication **Feature-Specific Guides:** - [Transaction Patterns](../high-level-sdk/transactions.md) - Atomic operations across SDKs - [Object I/O Operations](../high-level-sdk/objects-and-io.md) - File handling patterns - [Data Import/Export](../high-level-sdk/imports-and-exports.md) - Bulk data operations - [Filesystem Operations](../lakefs-spec/filesystem-api.md) - File-like operations -- [S3 Operations](../boto3/s3-operations.md) - S3-compatible patterns **Learning Resources:** - [Data Science Tutorial](../tutorials/data-science-workflow.md) - End-to-end workflow examples @@ -363,12 +311,10 @@ s3.put_object( - [Error Handling Patterns](troubleshooting.md#error-handling) - Exception handling strategies **Migration Guides:** -- [S3 Migration Patterns](../boto3/s3-operations.md#migration-patterns) - Convert S3 code to lakeFS - [SDK Migration Strategies](best-practices.md#sdk-migration) - Moving between SDKs - [Legacy Integration](best-practices.md#legacy-integration) - Integrate with existing systems **External Resources:** - [High-Level SDK API Reference](https://pydocs-lakefs.lakefs.io){:target="_blank"} - Complete API documentation - [Generated SDK API Reference](https://pydocs-sdk.lakefs.io){:target="_blank"} - Auto-generated API docs -- [lakefs-spec Documentation](https://lakefs-spec.org/){:target="_blank"} - Third-party filesystem interface -- [Boto3 
Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html){:target="_blank"} - Official Boto3 documentation \ No newline at end of file +- [lakefs-spec Documentation](https://lakefs-spec.org/){:target="_blank"} - Third-party filesystem interface \ No newline at end of file