diff --git a/.claude/implementations/mcp_architecture_analysis.md b/.claude/implementations/mcp_architecture_analysis.md new file mode 100644 index 0000000..b479c8b --- /dev/null +++ b/.claude/implementations/mcp_architecture_analysis.md @@ -0,0 +1,224 @@ +# ContextFrame MCP Server Architecture Analysis + +## Overview + +The ContextFrame MCP (Model Context Protocol) server provides a standardized API for LLMs and AI agents to interact with ContextFrame datasets. The architecture follows a modular, extensible design with clear separation of concerns and support for multiple transport mechanisms. + +## Core Architecture Components + +### 1. Main Server Component (`server.py`) + +**ContextFrameMCPServer** is the central orchestrator that: +- Manages dataset connections via `FrameDataset` +- Initializes and coordinates all subsystems +- Handles graceful startup and shutdown +- Supports multiple transport types (stdio, HTTP, both) + +**MCPConfig** dataclass provides comprehensive configuration: +- Server identity (name, version, protocol version) +- Transport selection and configuration +- HTTP-specific settings (host, port, CORS, SSL) +- Monitoring configuration (metrics, pricing, retention) +- Security configuration (auth providers, permissions, audit) + +### 2. Message Handling (`handlers.py`) + +**MessageHandler** implements the JSON-RPC 2.0 protocol: +- Routes incoming messages to appropriate handlers +- Validates requests and parameters +- Handles errors with proper JSON-RPC error responses +- Supports both request-response and notification patterns + +Core message types handled: +- `initialize/initialized`: Protocol handshake +- `tools/list` and `tools/call`: Tool discovery and execution +- `resources/list` and `resources/read`: Resource access +- `shutdown`: Graceful termination + +### 3. 
Tool System (`tools.py`) + +**ToolRegistry** manages available tools: +- Document CRUD operations (add, get, list, update, delete) +- Search capabilities (vector, text, hybrid) +- Automatic tool registration and discovery +- Extensible architecture for additional tool modules + +Tool categories: +- **Core document tools**: Basic CRUD and search +- **Enhancement tools**: LLM-powered document enrichment +- **Extraction tools**: Content extraction from various sources +- **Batch tools**: Bulk operations with transaction support +- **Collection tools**: Document organization and management +- **Subscription tools**: Change monitoring and notifications +- **Analytics tools**: Usage metrics and performance analysis + +### 4. Resource System (`resources.py`) + +**ResourceRegistry** provides dataset exploration: +- Dataset information and statistics +- Schema introspection +- Collection listings +- Relationship discovery +- URI-based resource addressing (`contextframe://`) + +### 5. Transport Abstraction + +#### Base Transport (`core/transport.py`) + +**TransportAdapter** abstract base class defines: +- Message sending/receiving interface +- Progress reporting capabilities +- Subscription handling +- Streaming support detection + +Key design decisions: +- Transport-agnostic tool implementations +- Unified progress and subscription handling +- Support for both streaming and polling models + +#### Stdio Transport (`transports/stdio.py`) + +**StdioAdapter** implements: +- Standard input/output communication +- Progress updates included in responses +- Polling-based subscriptions +- Backward compatibility with existing StdioTransport + +#### HTTP Transport (`transports/http/`) + +**HttpAdapter** provides HTTP-first architecture: +- Primary transport method with JSON responses +- Optional SSE (Server-Sent Events) for streaming +- Real-time progress tracking +- WebSocket-like subscriptions via SSE +- Operation context management + +HTTP transport is designed as the primary MCP 
transport, with SSE as an optional enhancement for specific streaming use cases. + +## Integration Layers + +### 1. Monitoring Integration + +**MonitoringSystem** wraps core components: +- **MetricsCollector**: Centralized metrics storage +- **UsageTracker**: Query and document access tracking +- **PerformanceMonitor**: Operation timing and profiling +- **CostCalculator**: Resource usage pricing + +**MonitoredMessageHandler** and **MonitoredToolRegistry** provide: +- Automatic operation tracking +- Agent-specific metrics +- Tool usage statistics +- Document access patterns + +### 2. Security Integration + +**SecurityMiddleware** implements comprehensive security: +- **Authentication**: Multi-provider support (API key, OAuth, JWT) +- **Authorization**: Fine-grained permission control +- **Rate Limiting**: Request throttling and quotas +- **Audit Logging**: Comprehensive activity tracking + +Security features: +- Anonymous access with configurable permissions +- Request metadata extraction and tracking +- Automatic audit trail generation +- Flexible authentication provider chaining + +## Key Design Patterns + +### 1. Dependency Injection +The server uses constructor injection for all major components, enabling: +- Easy testing with mock components +- Runtime configuration flexibility +- Clear dependency relationships + +### 2. Adapter Pattern +Transport adapters provide a uniform interface while allowing transport-specific optimizations: +- Stdio uses buffered I/O and polling +- HTTP uses direct responses and SSE streaming + +### 3. Registry Pattern +Tool and resource registries enable: +- Dynamic feature discovery +- Plugin-style extensibility +- Runtime tool registration + +### 4. Middleware Pattern +Monitoring and security wrap core components: +- Transparent metric collection +- Non-intrusive security enforcement +- Composable behavior modification + +### 5. 
Context Management +Operation contexts track long-running operations: +- Automatic cleanup on completion +- Progress tracking association +- Error handling and recovery + +## Configuration Flexibility + +The architecture supports multiple deployment scenarios: + +1. **Minimal Setup**: Basic stdio transport with no monitoring/security +2. **Production HTTP**: Full HTTP server with auth, monitoring, and SSL +3. **Development Mode**: Both transports for maximum compatibility +4. **Enterprise**: Complete security, monitoring, and analytics + +## Extension Points + +1. **Custom Tools**: Register via ToolRegistry +2. **Auth Providers**: Implement AuthProvider interface +3. **Transport Types**: Extend TransportAdapter +4. **Metrics Backends**: Custom MetricsCollector implementations +5. **Resource Types**: Add new resource URIs + +## Important Design Decisions + +1. **HTTP-First Design**: While stdio is supported for compatibility, HTTP is the primary transport as it aligns with modern API practices and the MCP specification direction. + +2. **Optional Streaming**: SSE is available but not required. Most operations use simple request-response patterns for reliability. + +3. **Modular Security**: Security components can be enabled/disabled independently based on deployment needs. + +4. **Backward Compatibility**: The architecture maintains compatibility with existing Lance datasets and FrameDataset APIs. + +5. **Agent-Aware**: The system tracks agent IDs throughout for multi-tenant scenarios and usage attribution. + +6. **Transaction Support**: Batch operations support atomic transactions with rollback capabilities. + +7. **Async-First**: All components use async/await for scalability and non-blocking I/O. + +## Performance Considerations + +1. **Lazy Loading**: Components are initialized only when needed +2. **Connection Pooling**: Reuse database connections +3. **Streaming Responses**: Large results can be streamed +4. 
**Caching**: Frequently accessed resources are cached +5. **Batch Processing**: Bulk operations minimize round trips + +## Security Architecture + +1. **Defense in Depth**: Multiple security layers +2. **Zero Trust**: Authenticate and authorize every request +3. **Audit Everything**: Comprehensive logging for compliance +4. **Rate Limiting**: Prevent abuse and ensure fair usage +5. **Secure by Default**: Security enabled unless explicitly disabled + +## Monitoring Architecture + +1. **Non-Intrusive**: Monitoring doesn't affect core functionality +2. **Comprehensive**: Tracks all aspects of system behavior +3. **Performant**: Metrics collection is asynchronous +4. **Extensible**: Easy to add new metrics and monitors +5. **Cost-Aware**: Built-in support for usage-based pricing + +## Future Extensibility + +The architecture is designed to support: +1. Additional transport types (WebSocket, gRPC) +2. New tool categories (ML operations, data pipelines) +3. Advanced security features (mTLS, SAML) +4. Enhanced monitoring (distributed tracing, alerting) +5. Multi-dataset federation +6. Real-time collaboration features \ No newline at end of file diff --git a/.claude/implementations/mcp_documentation_summary.md b/.claude/implementations/mcp_documentation_summary.md new file mode 100644 index 0000000..55238bb --- /dev/null +++ b/.claude/implementations/mcp_documentation_summary.md @@ -0,0 +1,211 @@ +# ContextFrame MCP Server Documentation Summary + +This document consolidates the research findings from analyzing the ContextFrame MCP server implementation. It serves as the foundation for creating comprehensive documentation. + +## Overview + +The ContextFrame MCP server is a production-ready implementation of the Model Context Protocol that provides AI agents and LLMs with standardized access to ContextFrame datasets. 
It features: + +- **HTTP-first design** with optional stdio support +- **43 comprehensive tools** covering all aspects of document management +- **Enterprise-grade security** with multi-provider authentication +- **Built-in monitoring** with zero overhead when disabled +- **Extensible architecture** supporting custom tools and transports + +## Architecture Summary + +### Core Components + +1. **ContextFrameMCPServer** - Central orchestrator + - Manages initialization and lifecycle + - Coordinates subsystems (transport, security, monitoring) + - Configurable via MCPConfig + +2. **Transport Layer** - HTTP-first with abstraction + - Primary: HTTP/FastAPI with optional SSE + - Legacy: stdio for CLI compatibility + - Extensible: Easy to add new transports + +3. **Tool System** - 43 tools in 7 categories + - Core operations (CRUD, search) + - Batch processing with transactions + - Collection management + - Enhancement via LLMs + - Analytics and optimization + - Monitoring and observability + - Real-time subscriptions + +4. **Security Layer** - Defense in depth + - Authentication: API keys, OAuth 2.1, JWT + - Authorization: RBAC with resource policies + - Rate limiting: Multi-level protection + - Audit logging: Comprehensive trail + +5. **Monitoring System** - Production observability + - Performance metrics with percentiles + - Usage analytics and patterns + - Cost tracking and attribution + - Export to standard formats + +## Key Design Decisions + +### 1. HTTP-First Approach + +**Rationale**: Modern cloud-native deployments require HTTP for: +- Standard infrastructure (load balancers, CDNs) +- Mature security ecosystem +- Horizontal scalability +- Developer familiarity + +**Implementation**: +- FastAPI for high performance +- Optional SSE for real-time updates +- REST convenience endpoints +- Full JSON-RPC 2.0 compliance + +### 2. 
Tool System Design + +**Principles**: +- Transport-agnostic implementation +- Consistent parameter validation +- Comprehensive error handling +- Progress reporting support + +**Categories**: +1. Document operations (6 tools) +2. Batch processing (8 tools) +3. Collection management (6 tools) +4. Enhancement/extraction (7 tools) +5. Analytics (8 tools) +6. Monitoring (5 tools) +7. Subscriptions (4 tools) + +### 3. Security Architecture + +**Multi-layer approach**: +1. Authentication (who you are) +2. Rate limiting (prevent abuse) +3. Authorization (what you can do) +4. Audit logging (what you did) + +**Flexibility**: +- Multiple auth providers simultaneously +- Fine-grained permissions +- Resource-level policies +- Anonymous access control + +### 4. Monitoring Philosophy + +**Zero-overhead principle**: +- Completely disabled by default +- <1% overhead when enabled +- Bounded memory usage +- Async collection + +**Comprehensive metrics**: +- Operation latency (p50, p95, p99) +- Document/query patterns +- Cost attribution +- Error tracking + +## Integration Patterns + +### 1. Basic Integration + +```python +# HTTP client (recommended) +client = MCPClient("http://localhost:8080") +docs = await client.search_documents(query="AI agents") + +# Tool discovery +tools = await client.list_available_tools() +``` + +### 2. Agent Integration + +- LangChain wrapper pattern +- Function calling with OpenAI/Anthropic +- Context window management +- Streaming for large results + +### 3. Production Patterns + +- Load balancing across servers +- Multi-layer caching +- Dataset sharding +- Real-time monitoring + +## Documentation Requirements + +Based on the research, the documentation must cover: + +### 1. Getting Started (5-minute quickstart) +- Installation (pip/uv/docker) +- First server startup +- Basic tool usage +- Simple integration example + +### 2. Core Concepts +- MCP protocol basics +- ContextFrame integration +- Tool system architecture +- Transport options + +### 3. 
API Reference +- All 43 tools with examples +- Parameter schemas +- Error codes +- Output formats + +### 4. Security Guide +- Authentication setup +- Permission configuration +- Production hardening +- Compliance considerations + +### 5. Monitoring Guide +- Metrics overview +- Dashboard setup +- Alert configuration +- Cost optimization + +### 6. Integration Guides +- Python clients +- LangChain integration +- OpenAI function calling +- Production architectures + +### 7. Cookbook +- RAG implementation +- Document pipeline +- Real-time monitoring +- Multi-tenant setup + +## Critical Documentation Points + +1. **Emphasize HTTP-first**: Make it clear that HTTP is the recommended transport +2. **Security by default**: Show secure configurations prominently +3. **Performance tips**: Include caching, batching, and optimization +4. **Real examples**: Use actual code from the test suite +5. **Progressive complexity**: Start simple, add features gradually +6. **Troubleshooting**: Common issues and solutions +7. **Migration guides**: From stdio to HTTP, from v1 to v2 + +## Success Metrics + +The documentation should achieve: +- 90% of users succeed with quickstart +- <5 support questions per feature +- Clear path from dev to production +- Community contributions + +## Next Steps + +1. Create directory structure for MCP docs +2. Write quickstart guide with working example +3. Document all 43 tools with examples +4. Create security configuration guide +5. Build monitoring setup guide +6. Develop integration patterns +7. Add cookbook recipes +8. Update mkdocs.yml \ No newline at end of file diff --git a/.claude/implementations/mcp_integration_patterns.md b/.claude/implementations/mcp_integration_patterns.md new file mode 100644 index 0000000..c0b32b3 --- /dev/null +++ b/.claude/implementations/mcp_integration_patterns.md @@ -0,0 +1,1244 @@ +# MCP Integration Patterns and Real-World Use Cases + +## Table of Contents +1. 
[Client Integration Patterns](#client-integration-patterns) +2. [Agent Integration Approaches](#agent-integration-approaches) +3. [Common Integration Scenarios](#common-integration-scenarios) +4. [Best Practices](#best-practices) +5. [Testing Patterns](#testing-patterns) +6. [Production Deployment](#production-deployment) +7. [Real-World Architecture Examples](#real-world-architecture-examples) + +## Client Integration Patterns + +### 1. Basic Stdio Client Pattern + +The simplest integration for local development and testing: + +```python +# contextframe/mcp/example_client.py pattern +class MCPClient: + """Simple MCP client for stdio communication.""" + + async def call(self, method: str, params: dict[str, Any] | None = None) -> dict[str, Any]: + """Make an RPC call and wait for response.""" + await self.send_message(method, params) + response = await self.read_response() + + if "error" in response: + raise Exception(f"RPC Error: {response['error']}") + + return response.get("result", {}) +``` + +**Use Cases:** +- Development and testing +- CLI tools +- Simple automation scripts +- Local agent integrations + +### 2. 
HTTP-First Client Pattern (Recommended) + +Modern approach using standard HTTP for reliability and scalability: + +```python +# contextframe/mcp/http_client_example.py pattern +class MCPHttpClient: + """HTTP client for MCP server - recommended approach.""" + + async def request(self, method: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Send a JSON-RPC request using standard HTTP.""" + payload = { + "jsonrpc": "2.0", + "method": method, + "params": params or {}, + "id": self._next_id() + } + + response = await self.client.post( + f"{self.base_url}/mcp/v1/jsonrpc", + json=payload, + headers={"Content-Type": "application/json"} + ) + response.raise_for_status() + + result = response.json() + if "error" in result: + raise Exception(f"MCP Error: {result['error']}") + + return result.get("result", {}) +``` + +**Key Features:** +- Standard HTTP POST for all operations +- Convenience REST endpoints for common operations +- Optional SSE for progress tracking +- Built-in retry and timeout handling + +### 3. Tool Discovery Pattern + +Dynamic tool discovery for flexible integrations: + +```python +async def discover_and_use_tools(client: MCPHttpClient): + # 1. Initialize session + init_result = await client.initialize({ + "name": "my-agent", + "version": "1.0.0" + }) + + # 2. Discover available tools + tools = await client.list_tools() + tool_map = {tool['name']: tool for tool in tools['tools']} + + # 3. Dynamically call tools based on schema + if "search_documents" in tool_map: + schema = tool_map["search_documents"]["inputSchema"] + # Parse schema to understand required/optional params + + result = await client.call_tool( + "search_documents", + {"query": "important topic", "limit": 10} + ) +``` + +### 4. 
Error Handling Pattern
+
+Robust error handling with retry logic:
+
+```python
+async def robust_mcp_call(client, method, params, max_retries=3):
+    """Call MCP method with exponential backoff retry."""
+    for attempt in range(max_retries):
+        try:
+            return await client.request(method, params)
+        except Exception as e:
+            if "error" in str(e):
+                error_data = json.loads(str(e).split("MCP Error: ")[1])
+                error_code = error_data.get("code", 0)
+
+                # Don't retry client errors (JSON-RPC reserves -32602..-32600
+                # for invalid params / method not found / invalid request)
+                if -32602 <= error_code <= -32600:
+                    raise
+
+                # Retry server errors with backoff
+                if attempt < max_retries - 1:
+                    await asyncio.sleep(2 ** attempt)
+                    continue
+            raise
+```
+
+## Agent Integration Approaches
+
+### 1. LangChain Integration Pattern
+
+```python
+from langchain.tools import Tool
+from langchain.agents import AgentExecutor
+
+class ContextFrameMCPTool:
+    """LangChain tool wrapper for MCP."""
+
+    def __init__(self, mcp_client: MCPHttpClient, tool_name: str, tool_schema: dict):
+        self.client = mcp_client
+        self.tool_name = tool_name
+        self.schema = tool_schema
+
+    async def _run(self, **kwargs):
+        """Execute the MCP tool."""
+        return await self.client.call_tool(self.tool_name, kwargs)
+
+    def to_langchain_tool(self) -> Tool:
+        """Convert to LangChain tool."""
+        return Tool(
+            name=self.tool_name,
+            func=self._run,
+            description=self.schema.get("description", ""),
+            args_schema=self._schema_to_pydantic(self.schema["inputSchema"])
+        )
+
+# Usage
+async def create_langchain_agent(mcp_url: str):
+    client = MCPHttpClient(mcp_url)
+    await client.initialize({"name": "langchain-agent"})
+
+    # Get all tools
+    tools_response = await client.list_tools()
+
+    # Convert to LangChain tools
+    langchain_tools = []
+    for tool in tools_response["tools"]:
+        wrapper = ContextFrameMCPTool(client, tool["name"], tool)
+        langchain_tools.append(wrapper.to_langchain_tool())
+
+    # Create agent with tools
+    agent = AgentExecutor(tools=langchain_tools, ...)
+```
+
+### 2. 
Function Calling Pattern + +For LLMs with function calling capabilities: + +```python +async def prepare_function_definitions(client: MCPHttpClient): + """Convert MCP tools to function definitions for LLMs.""" + tools = await client.list_tools() + + functions = [] + for tool in tools["tools"]: + function_def = { + "name": tool["name"], + "description": tool.get("description", ""), + "parameters": tool["inputSchema"] + } + functions.append(function_def) + + return functions + +async def execute_function_call(client: MCPHttpClient, function_call): + """Execute function call from LLM.""" + return await client.call_tool( + function_call["name"], + json.loads(function_call["arguments"]) + ) +``` + +### 3. Context Window Management + +Efficiently managing large datasets within token limits: + +```python +class ContextWindowManager: + """Manage document retrieval within token limits.""" + + def __init__(self, client: MCPHttpClient, max_tokens: int = 100000): + self.client = client + self.max_tokens = max_tokens + self.token_counter = TikTokenCounter() # Or your preferred counter + + async def get_relevant_context(self, query: str, max_docs: int = 20): + """Get relevant documents within token limit.""" + # Start with more documents than needed + search_result = await self.client.call_tool( + "search_documents", + { + "query": query, + "limit": max_docs * 2, + "include_content": False # Get metadata first + } + ) + + # Sort by relevance score + docs = sorted( + search_result["documents"], + key=lambda d: d.get("score", 0), + reverse=True + ) + + # Add documents until token limit + selected_docs = [] + total_tokens = 0 + + for doc in docs: + # Get full content + full_doc = await self.client.call_tool( + "get_document", + { + "document_id": doc["uuid"], + "include_content": True + } + ) + + doc_tokens = self.token_counter.count(full_doc["document"]["content"]) + if total_tokens + doc_tokens <= self.max_tokens: + selected_docs.append(full_doc["document"]) + total_tokens += 
doc_tokens + else: + break + + return selected_docs +``` + +### 4. Streaming Results Pattern + +For handling large result sets: + +```python +async def stream_search_results(client: MCPHttpClient, query: str, batch_size: int = 100): + """Stream search results in batches.""" + offset = 0 + + while True: + result = await client.call_tool( + "search_documents", + { + "query": query, + "limit": batch_size, + "offset": offset + } + ) + + documents = result.get("documents", []) + if not documents: + break + + for doc in documents: + yield doc + + offset += batch_size +``` + +## Common Integration Scenarios + +### 1. RAG Application Pattern + +Complete RAG implementation with ContextFrame: + +```python +class ContextFrameRAG: + """RAG application using ContextFrame MCP.""" + + def __init__(self, mcp_url: str, llm_client): + self.mcp_client = MCPHttpClient(mcp_url) + self.llm = llm_client + self.context_manager = ContextWindowManager(self.mcp_client) + + async def initialize(self): + await self.mcp_client.initialize({ + "name": "rag-application", + "version": "1.0.0" + }) + + async def answer_question(self, question: str): + # 1. Search for relevant documents + relevant_docs = await self.context_manager.get_relevant_context( + question, + max_docs=10 + ) + + # 2. Build context + context = "\n\n".join([ + f"Document {i+1} (ID: {doc['uuid']}):\n{doc['content']}" + for i, doc in enumerate(relevant_docs) + ]) + + # 3. Generate answer + prompt = f""" + Based on the following documents, answer the question. + + Context: + {context} + + Question: {question} + + Answer: + """ + + response = await self.llm.generate(prompt) + + # 4. 
Track usage for analytics + await self.mcp_client.call_tool( + "track_usage", + { + "operation": "rag_query", + "document_ids": [doc['uuid'] for doc in relevant_docs], + "query": question + } + ) + + return { + "answer": response, + "sources": [ + { + "id": doc['uuid'], + "title": doc['metadata'].get('title', 'Untitled'), + "excerpt": doc['content'][:200] + "..." + } + for doc in relevant_docs + ] + } +``` + +### 2. Document Processing Pipeline + +Batch processing with progress tracking: + +```python +class DocumentProcessingPipeline: + """Process documents through multiple stages.""" + + def __init__(self, mcp_url: str): + self.client = MCPHttpClient(mcp_url) + + async def process_documents(self, file_paths: List[str]): + # 1. Batch import documents + import_result = await self.client.call_tool( + "batch_import", + { + "file_paths": file_paths, + "shared_settings": { + "generate_embeddings": False, # We'll do this later + "metadata": { + "source": "batch_import", + "processed_date": datetime.now().isoformat() + } + } + } + ) + + operation_id = import_result.get("operation_id") + if operation_id: + # Track progress via SSE + await self._track_operation(operation_id) + + document_ids = import_result.get("document_ids", []) + + # 2. Extract metadata + extract_result = await self.client.call_tool( + "batch_extract", + { + "document_ids": document_ids, + "extraction_type": "metadata", + "max_parallel": 5 + } + ) + + # 3. Enhance documents + enhance_result = await self.client.call_tool( + "batch_enhance", + { + "document_ids": document_ids, + "enhancements": ["summary", "keywords", "entities"], + "max_parallel": 3 + } + ) + + # 4. 
Generate embeddings + embed_result = await self.client.call_tool( + "batch_generate_embeddings", + { + "document_ids": document_ids, + "model": "text-embedding-3-small", + "batch_size": 100 + } + ) + + return { + "total_processed": len(document_ids), + "import": import_result, + "extract": extract_result, + "enhance": enhance_result, + "embed": embed_result + } + + async def _track_operation(self, operation_id: str): + """Track long-running operation progress.""" + async for progress in self.client.track_operation_progress(operation_id): + print(f"Progress: {progress['current']}/{progress['total']} - {progress.get('message', '')}") + if progress.get("status") == "completed": + break +``` + +### 3. Real-Time Monitoring Pattern + +Monitor dataset changes with subscriptions: + +```python +class DatasetMonitor: + """Monitor dataset for changes.""" + + def __init__(self, mcp_url: str): + self.client = MCPHttpClient(mcp_url) + self.handlers = {} + + async def monitor_changes(self, resource_type: str = "documents", filter: str = None): + """Monitor dataset changes via SSE.""" + url = f"{self.client.base_url}/mcp/v1/sse/subscribe" + params = {"resource_type": resource_type} + if filter: + params["filter"] = filter + + async with self.client.client.stream("GET", url, params=params) as response: + async for line in response.aiter_lines(): + if line.startswith("data: "): + change = json.loads(line[6:]) + await self._handle_change(change) + + async def _handle_change(self, change: dict): + """Handle dataset change event.""" + change_type = change.get("type") + handler = self.handlers.get(change_type) + + if handler: + await handler(change) + else: + print(f"Unhandled change type: {change_type}") + + def on_document_added(self, handler): + """Register handler for document additions.""" + self.handlers["document_added"] = handler + + def on_document_updated(self, handler): + """Register handler for document updates.""" + self.handlers["document_updated"] = handler +``` + +### 4. 
Analytics Dashboard Pattern + +Real-time analytics and monitoring: + +```python +class AnalyticsDashboard: + """Analytics dashboard for ContextFrame datasets.""" + + def __init__(self, mcp_url: str): + self.client = MCPHttpClient(mcp_url) + self._cache = {} + self._cache_ttl = 300 # 5 minutes + + async def get_dashboard_data(self): + """Get comprehensive dashboard data.""" + # Use analytics tools + tasks = [ + self._get_cached("stats", self._get_dataset_stats), + self._get_cached("usage", self._get_usage_analysis), + self._get_cached("performance", self._get_query_performance), + self._get_cached("relationships", self._get_relationship_analysis) + ] + + results = await asyncio.gather(*tasks) + + return { + "stats": results[0], + "usage": results[1], + "performance": results[2], + "relationships": results[3], + "timestamp": datetime.now().isoformat() + } + + async def _get_cached(self, key: str, fetcher): + """Get cached data or fetch if expired.""" + cached = self._cache.get(key) + if cached and (datetime.now() - cached["timestamp"]).seconds < self._cache_ttl: + return cached["data"] + + data = await fetcher() + self._cache[key] = { + "data": data, + "timestamp": datetime.now() + } + return data + + async def _get_dataset_stats(self): + result = await self.client.call_tool( + "get_dataset_stats", + {"include_details": True} + ) + return result["stats"] + + async def _get_usage_analysis(self): + result = await self.client.call_tool( + "analyze_usage", + {"time_range": "7d", "include_patterns": True} + ) + return result["analysis"] +``` + +## Best Practices + +### 1. 
Connection Management + +```python +class ManagedMCPClient: + """MCP client with connection pooling and health checks.""" + + def __init__(self, base_url: str, pool_size: int = 10): + self.base_url = base_url + limits = httpx.Limits(max_keepalive_connections=pool_size) + self.client = httpx.AsyncClient( + timeout=30.0, + limits=limits, + http2=True # Enable HTTP/2 for better performance + ) + self._initialized = False + + async def ensure_initialized(self): + """Ensure client is initialized.""" + if not self._initialized: + await self.initialize() + + async def initialize(self): + """Initialize with retry logic.""" + for attempt in range(3): + try: + result = await self.request("initialize", { + "protocolVersion": "0.1.0", + "clientInfo": {"name": "managed-client"} + }) + self._initialized = True + return result + except Exception as e: + if attempt == 2: + raise + await asyncio.sleep(1) + + async def health_check(self): + """Perform health check.""" + try: + response = await self.client.get(f"{self.base_url}/health") + return response.status_code == 200 + except: + return False +``` + +### 2. 
Authentication Patterns + +```python +class AuthenticatedMCPClient(MCPHttpClient): + """MCP client with authentication support.""" + + def __init__(self, base_url: str, auth_token: str = None): + super().__init__(base_url) + self.auth_token = auth_token + + async def request(self, method: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Send authenticated request.""" + headers = {"Content-Type": "application/json"} + + if self.auth_token: + headers["Authorization"] = f"Bearer {self.auth_token}" + + payload = { + "jsonrpc": "2.0", + "method": method, + "params": params or {}, + "id": self._next_id() + } + + response = await self.client.post( + f"{self.base_url}/mcp/v1/jsonrpc", + json=payload, + headers=headers + ) + + # Handle 401 and refresh token if needed + if response.status_code == 401: + await self._refresh_token() + # Retry request + return await self.request(method, params) + + response.raise_for_status() + return response.json().get("result", {}) +``` + +### 3. 
Error Recovery Patterns + +```python +class ResilientMCPClient: + """MCP client with circuit breaker and fallback.""" + + def __init__(self, primary_url: str, fallback_url: str = None): + self.primary_client = MCPHttpClient(primary_url) + self.fallback_client = MCPHttpClient(fallback_url) if fallback_url else None + self.circuit_breaker = CircuitBreaker( + failure_threshold=5, + recovery_timeout=60 + ) + + async def call_tool(self, tool_name: str, arguments: dict): + """Call tool with circuit breaker and fallback.""" + # Try primary + if not self.circuit_breaker.is_open(): + try: + result = await self.primary_client.call_tool(tool_name, arguments) + self.circuit_breaker.record_success() + return result + except Exception as e: + self.circuit_breaker.record_failure() + if not self.fallback_client: + raise + + # Try fallback + if self.fallback_client: + return await self.fallback_client.call_tool(tool_name, arguments) + + raise Exception("Primary service unavailable and no fallback configured") +``` + +### 4. 
Performance Optimization + +```python +class OptimizedMCPClient: + """MCP client with caching and batching.""" + + def __init__(self, base_url: str): + self.client = MCPHttpClient(base_url) + self.cache = TTLCache(maxsize=1000, ttl=300) + self.batch_queue = [] + self.batch_lock = asyncio.Lock() + + async def get_document_cached(self, document_id: str): + """Get document with caching.""" + cache_key = f"doc:{document_id}" + + if cache_key in self.cache: + return self.cache[cache_key] + + result = await self.client.call_tool( + "get_document", + {"document_id": document_id, "include_content": True} + ) + + self.cache[cache_key] = result["document"] + return result["document"] + + async def batch_get_documents(self, document_ids: List[str]): + """Get multiple documents efficiently.""" + # Check cache first + cached_docs = {} + missing_ids = [] + + for doc_id in document_ids: + cache_key = f"doc:{doc_id}" + if cache_key in self.cache: + cached_docs[doc_id] = self.cache[cache_key] + else: + missing_ids.append(doc_id) + + # Batch fetch missing documents + if missing_ids: + result = await self.client.call_tool( + "batch_get_documents", + {"document_ids": missing_ids} + ) + + for doc in result["documents"]: + doc_id = doc["uuid"] + self.cache[f"doc:{doc_id}"] = doc + cached_docs[doc_id] = doc + + # Return in requested order + return [cached_docs[doc_id] for doc_id in document_ids if doc_id in cached_docs] +``` + +## Testing Patterns + +### 1. 
Mock MCP Server + +```python +class MockMCPServer: + """Mock MCP server for testing.""" + + def __init__(self): + self.tools = {} + self.documents = {} + self.call_history = [] + + async def handle_request(self, request: dict): + """Handle mock request.""" + self.call_history.append(request) + + method = request.get("method") + params = request.get("params", {}) + + if method == "initialize": + return { + "protocolVersion": "0.1.0", + "serverInfo": {"name": "mock-server"}, + "capabilities": {"tools": True} + } + + elif method == "tools/list": + return {"tools": list(self.tools.values())} + + elif method == "tools/call": + tool_name = params.get("name") + if tool_name in self.tools: + handler = self.tools[tool_name]["handler"] + return await handler(params.get("arguments", {})) + + raise Exception(f"Unknown method: {method}") + + def register_mock_tool(self, name: str, handler, schema: dict): + """Register a mock tool.""" + self.tools[name] = { + "name": name, + "inputSchema": schema, + "handler": handler + } +``` + +### 2. Integration Test Pattern + +```python +@pytest.fixture +async def mcp_test_client(tmp_path): + """Create test MCP client with temporary dataset.""" + # Create temporary dataset + dataset_path = tmp_path / "test.lance" + dataset = FrameDataset.create(str(dataset_path)) + + # Add test data + test_records = [ + FrameRecord(text_content=f"Test document {i}", metadata={"index": i}) + for i in range(10) + ] + dataset.add_many(test_records) + + # Start test server + config = MCPConfig(transport="http", host="localhost", port=0) + server = await create_http_server(str(dataset_path), config) + + # Start server in background + server_task = asyncio.create_task( + uvicorn.run(server.app, host="localhost", port=0) + ) + + # Create client + client = MCPHttpClient(f"http://localhost:{server.port}") + await client.initialize({"name": "test-client"}) + + yield client + + # Cleanup + server_task.cancel() + await server_task +``` + +### 3. 
Tool Testing Strategy + +```python +class TestSearchTool: + """Test search tool functionality.""" + + @pytest.mark.asyncio + async def test_search_basic(self, mcp_test_client): + """Test basic search functionality.""" + result = await mcp_test_client.call_tool( + "search_documents", + {"query": "test", "limit": 5} + ) + + assert "documents" in result + assert len(result["documents"]) <= 5 + assert all("test" in doc["content"].lower() for doc in result["documents"]) + + @pytest.mark.asyncio + async def test_search_with_filter(self, mcp_test_client): + """Test search with SQL filter.""" + result = await mcp_test_client.call_tool( + "search_documents", + { + "query": "document", + "filter": "metadata.index > 5", + "limit": 10 + } + ) + + assert all(doc["metadata"]["index"] > 5 for doc in result["documents"]) + + @pytest.mark.asyncio + async def test_search_error_handling(self, mcp_test_client): + """Test search error handling.""" + with pytest.raises(Exception) as exc_info: + await mcp_test_client.call_tool( + "search_documents", + {"query": "test", "search_type": "invalid"} + ) + + assert "Invalid search type" in str(exc_info.value) +``` + +## Production Deployment + +### 1. 
Load Balancing Pattern + +```python +class LoadBalancedMCPClient: + """Client with load balancing across multiple servers.""" + + def __init__(self, server_urls: List[str], strategy: str = "round_robin"): + self.clients = [MCPHttpClient(url) for url in server_urls] + self.strategy = strategy + self.current_index = 0 + self.health_checker = HealthChecker(self.clients) + + async def initialize(self): + """Initialize all clients.""" + tasks = [client.initialize({"name": "lb-client"}) for client in self.clients] + await asyncio.gather(*tasks, return_exceptions=True) + + # Start health checking + asyncio.create_task(self.health_checker.start()) + + def _get_client(self) -> MCPHttpClient: + """Get next healthy client based on strategy.""" + healthy_clients = self.health_checker.get_healthy_clients() + + if not healthy_clients: + raise Exception("No healthy MCP servers available") + + if self.strategy == "round_robin": + client = healthy_clients[self.current_index % len(healthy_clients)] + self.current_index += 1 + return client + + elif self.strategy == "random": + return random.choice(healthy_clients) + + elif self.strategy == "least_connections": + # In production, track active connections per client + return min(healthy_clients, key=lambda c: c.active_connections) + + async def call_tool(self, tool_name: str, arguments: dict): + """Call tool on load balanced server.""" + client = self._get_client() + return await client.call_tool(tool_name, arguments) +``` + +### 2. Dataset Sharding Strategy + +```python +class ShardedMCPClient: + """Client for sharded dataset deployment.""" + + def __init__(self, shard_config: Dict[str, str]): + """ + shard_config: { + "shard_1": "http://shard1.example.com", + "shard_2": "http://shard2.example.com", + ... 
+ } + """ + self.shards = { + name: MCPHttpClient(url) + for name, url in shard_config.items() + } + self.shard_router = ConsistentHashRouter(list(self.shards.keys())) + + async def initialize(self): + """Initialize all shard clients.""" + tasks = [ + client.initialize({"name": f"sharded-client-{name}"}) + for name, client in self.shards.items() + ] + await asyncio.gather(*tasks) + + def _get_shard_for_document(self, document_id: str) -> MCPHttpClient: + """Route document to appropriate shard.""" + shard_name = self.shard_router.get_node(document_id) + return self.shards[shard_name] + + async def add_document(self, content: str, metadata: dict): + """Add document to appropriate shard.""" + # Generate document ID + doc_id = str(uuid.uuid4()) + + # Route to shard + client = self._get_shard_for_document(doc_id) + + return await client.call_tool( + "add_document", + { + "content": content, + "metadata": {**metadata, "shard": client.base_url}, + "uuid": doc_id + } + ) + + async def search_all_shards(self, query: str, limit: int = 10): + """Search across all shards and merge results.""" + # Query all shards in parallel + tasks = [ + client.call_tool( + "search_documents", + {"query": query, "limit": limit} + ) + for client in self.shards.values() + ] + + results = await asyncio.gather(*tasks) + + # Merge and sort results + all_documents = [] + for result in results: + all_documents.extend(result.get("documents", [])) + + # Sort by score and limit + all_documents.sort(key=lambda d: d.get("score", 0), reverse=True) + + return { + "documents": all_documents[:limit], + "total_found": len(all_documents), + "shards_queried": len(self.shards) + } +``` + +### 3. 
Caching and CDN Integration + +```python +class CachedMCPClient: + """MCP client with multi-layer caching.""" + + def __init__(self, base_url: str, redis_url: str = None): + self.client = MCPHttpClient(base_url) + self.local_cache = TTLCache(maxsize=1000, ttl=300) + + if redis_url: + self.redis_client = aioredis.from_url(redis_url) + else: + self.redis_client = None + + async def get_document_with_cache(self, document_id: str): + """Get document with multi-layer caching.""" + # Check local cache + if document_id in self.local_cache: + return self.local_cache[document_id] + + # Check Redis cache + if self.redis_client: + cached = await self.redis_client.get(f"doc:{document_id}") + if cached: + doc = json.loads(cached) + self.local_cache[document_id] = doc + return doc + + # Fetch from MCP server + result = await self.client.call_tool( + "get_document", + {"document_id": document_id, "include_content": True} + ) + + doc = result["document"] + + # Update caches + self.local_cache[document_id] = doc + if self.redis_client: + await self.redis_client.setex( + f"doc:{document_id}", + 300, # 5 minute TTL + json.dumps(doc) + ) + + return doc + + async def invalidate_cache(self, document_id: str): + """Invalidate cache entries for a document.""" + # Remove from local cache + self.local_cache.pop(document_id, None) + + # Remove from Redis + if self.redis_client: + await self.redis_client.delete(f"doc:{document_id}") +``` + +### 4. 
Monitoring and Alerting + +```python +class MonitoredMCPClient: + """MCP client with comprehensive monitoring.""" + + def __init__(self, base_url: str, metrics_collector): + self.client = MCPHttpClient(base_url) + self.metrics = metrics_collector + + async def call_tool(self, tool_name: str, arguments: dict): + """Call tool with monitoring.""" + start_time = time.time() + error = None + + try: + result = await self.client.call_tool(tool_name, arguments) + return result + except Exception as e: + error = e + raise + finally: + # Record metrics + duration = time.time() - start_time + + self.metrics.record_tool_call( + tool_name=tool_name, + duration=duration, + success=error is None, + error_type=type(error).__name__ if error else None + ) + + # Alert on slow operations + if duration > 5.0: + await self.metrics.send_alert( + f"Slow MCP operation: {tool_name} took {duration:.2f}s" + ) + + # Alert on errors + if error and not isinstance(error, (InvalidParams, DocumentNotFound)): + await self.metrics.send_alert( + f"MCP operation failed: {tool_name} - {error}" + ) +``` + +## Real-World Architecture Examples + +### 1. Enterprise RAG System + +```yaml +# Architecture Overview +components: + load_balancer: + type: nginx + health_check: /health + + mcp_servers: + - host: mcp1.internal + dataset: primary_shard + - host: mcp2.internal + dataset: secondary_shard + + cache_layer: + redis_cluster: + nodes: 3 + memory: 16GB + + cdn: + cloudflare: + cache_rules: + - path: /mcp/v1/resources/* + ttl: 3600 + + monitoring: + prometheus: + scrape_interval: 15s + grafana: + dashboards: + - mcp_operations + - dataset_health + - query_performance +``` + +### 2. 
Multi-Tenant SaaS Platform + +```python +class MultiTenantMCPClient: + """MCP client for multi-tenant SaaS.""" + + def __init__(self, tenant_router): + self.tenant_router = tenant_router + self.tenant_clients = {} + + async def get_client_for_tenant(self, tenant_id: str) -> MCPHttpClient: + """Get or create client for tenant.""" + if tenant_id not in self.tenant_clients: + # Get tenant configuration + config = await self.tenant_router.get_tenant_config(tenant_id) + + # Create isolated client + client = MCPHttpClient(config["mcp_url"]) + await client.initialize({ + "name": f"tenant-{tenant_id}", + "tenant_id": tenant_id + }) + + self.tenant_clients[tenant_id] = client + + return self.tenant_clients[tenant_id] + + async def call_tool_for_tenant( + self, + tenant_id: str, + tool_name: str, + arguments: dict + ): + """Call tool in tenant context.""" + client = await self.get_client_for_tenant(tenant_id) + + # Add tenant context to all operations + arguments["_tenant_id"] = tenant_id + + return await client.call_tool(tool_name, arguments) +``` + +### 3. 
Real-Time Analytics Platform + +```python +class RealTimeAnalyticsPlatform: + """Real-time analytics using MCP subscriptions.""" + + def __init__(self, mcp_urls: List[str]): + self.clients = [MCPHttpClient(url) for url in mcp_urls] + self.event_processor = EventProcessor() + self.dashboard_manager = DashboardManager() + + async def start_monitoring(self): + """Start monitoring all MCP servers.""" + # Initialize clients + for client in self.clients: + await client.initialize({"name": "analytics-platform"}) + + # Start subscriptions + tasks = [] + for i, client in enumerate(self.clients): + task = asyncio.create_task( + self._monitor_server(client, f"server_{i}") + ) + tasks.append(task) + + # Start dashboard update loop + asyncio.create_task(self._update_dashboards()) + + await asyncio.gather(*tasks) + + async def _monitor_server(self, client: MCPHttpClient, server_id: str): + """Monitor a single MCP server.""" + # Subscribe to all changes + subscription_url = f"{client.base_url}/mcp/v1/sse/subscribe" + + async with client.client.stream("GET", subscription_url) as response: + async for line in response.aiter_lines(): + if line.startswith("data: "): + event = json.loads(line[6:]) + + # Process event + await self.event_processor.process({ + **event, + "server_id": server_id, + "timestamp": datetime.now().isoformat() + }) + + async def _update_dashboards(self): + """Update dashboards periodically.""" + while True: + # Collect metrics from all servers + metrics = [] + for client in self.clients: + try: + stats = await client.call_tool( + "get_dataset_stats", + {"include_details": False} + ) + metrics.append(stats) + except: + pass + + # Update dashboards + await self.dashboard_manager.update({ + "servers": len(self.clients), + "metrics": metrics, + "events_per_second": self.event_processor.get_rate(), + "timestamp": datetime.now().isoformat() + }) + + await asyncio.sleep(10) # Update every 10 seconds +``` + +## Summary + +This guide covers comprehensive integration 
patterns for the ContextFrame MCP server, from basic client implementations to complex production architectures. Key takeaways: + +1. **Use HTTP as Primary Transport**: HTTP provides reliability, scalability, and ease of integration +2. **Implement Robust Error Handling**: Use retries, circuit breakers, and fallbacks +3. **Optimize for Performance**: Cache aggressively, batch operations, use connection pooling +4. **Monitor Everything**: Track metrics, set up alerts, use distributed tracing +5. **Design for Scale**: Implement sharding, load balancing, and horizontal scaling +6. **Test Thoroughly**: Use mock servers, integration tests, and performance benchmarks + +The MCP protocol's flexibility allows for various integration patterns, from simple scripts to enterprise-scale systems. Choose patterns that match your use case and scale requirements. \ No newline at end of file diff --git a/.claude/implementations/mcp_monitoring_analysis.md b/.claude/implementations/mcp_monitoring_analysis.md new file mode 100644 index 0000000..4c69621 --- /dev/null +++ b/.claude/implementations/mcp_monitoring_analysis.md @@ -0,0 +1,545 @@ +# MCP Monitoring and Analytics Analysis + +## Executive Summary + +The MCP monitoring and analytics implementation provides comprehensive observability for ContextFrame deployments. The system is designed with zero-overhead when disabled and offers deep insights into usage patterns, performance characteristics, and operational costs. This analysis covers the architecture, key metrics, integration patterns, and recommendations for effective monitoring of MCP deployments. + +## Architecture Overview + +### Core Components + +#### 1. 
**Monitoring Layer** (`contextframe/mcp/monitoring/`) + +The monitoring layer is built around four specialized components (collection, usage, performance, and cost), plus modules for server integration and MCP tool exposure: + +``` +monitoring/ +├── collector.py # Central metrics collection and aggregation +├── usage.py # Document and query usage tracking +├── performance.py # Operation latency and throughput monitoring +├── cost.py # LLM and infrastructure cost tracking +├── integration.py # MCP server integration +└── tools.py # Monitoring tools exposed via MCP +``` + +**MetricsCollector** (collector.py) +- Central hub for all metrics collection +- In-memory buffers with configurable limits +- Asynchronous flushing to Lance datasets +- Aggregation at multiple intervals (1m, 5m, 1h, 1d) +- Schema-driven metric storage + +**UsageTracker** (usage.py) +- Tracks document access patterns +- Records query execution statistics +- Identifies hot documents and frequent queries +- Provides temporal access patterns +- Generates usage-based recommendations + +**PerformanceMonitor** (performance.py) +- Operation-level latency tracking +- Response time percentiles (p50, p95, p99) +- Active operation monitoring +- Error rate tracking +- Performance snapshot history + +**CostCalculator** (cost.py) +- LLM token usage and cost attribution +- Storage operation cost tracking +- Bandwidth usage monitoring +- Cost projections and recommendations +- Multi-provider pricing support + +#### 2. 
**Analytics Layer** (`contextframe/mcp/analytics/`) + +The analytics layer provides deeper insights and optimization capabilities: + +``` +analytics/ +├── analyzer.py # Query, usage, and relationship analysis +├── optimizer.py # Storage optimization and index recommendations +├── stats.py # Statistical analysis utilities +└── tools.py # Analytics tools exposed via MCP +``` + +**DatasetAnalyzer** (analyzer.py) +- QueryAnalyzer: Query pattern analysis and optimization hints +- UsageAnalyzer: Document access pattern detection +- RelationshipAnalyzer: Graph structure and dependency analysis + +**StorageOptimizer** (optimizer.py) +- Lance dataset compaction and vacuum operations +- Index optimization recommendations +- Performance benchmarking utilities +- Storage efficiency metrics + +**StatsCollector** (stats.py) +- Comprehensive dataset statistics +- Lance-native metric collection +- Fragment-level analysis +- Content and relationship statistics + +### Integration Architecture + +```python +# MonitoringSystem orchestrates all components +class MonitoringSystem: + def __init__(self, dataset, metrics_config, pricing_config): + self.collector = MetricsCollector(dataset, metrics_config) + self.usage_tracker = UsageTracker(self.collector) + self.performance_monitor = PerformanceMonitor(self.collector) + self.cost_calculator = CostCalculator(self.collector, pricing_config) + +# Transparent integration via decorators +class MonitoredMessageHandler(BaseMessageHandler): + async def handle(self, message): + # Automatic performance tracking + async with self.monitoring.performance_monitor.track_operation( + operation_type=message.get("method"), + agent_id=self._get_agent_id(message) + ): + result = await super().handle(message) + # Track usage patterns + await self._track_usage(message, result) + return result +``` + +## Key Metrics and Their Significance + +### 1. 
**Usage Metrics** + +**Document Access Patterns** +- `document_access_count`: Frequency of document retrieval +- `search_appearances`: How often documents appear in search results +- `access_by_operation`: Breakdown by read/search/update operations +- **Significance**: Identifies hot documents for caching, unused content for archival + +**Query Performance** +- `query_count`: Frequency of different query types +- `avg_execution_time_ms`: Query latency trends +- `success_rate`: Query reliability +- **Significance**: Optimization opportunities, index requirements + +**Agent Activity** +- `unique_agents`: Active agent count +- `agent_activity_timeline`: Usage patterns per agent +- **Significance**: Capacity planning, multi-tenancy insights + +### 2. **Performance Metrics** + +**Operation Latency** +```python +{ + "p50": 45.2, # Median response time + "p95": 120.5, # 95% of requests under this + "p99": 250.3, # 99% of requests under this + "max": 500.0 # Worst-case scenario +} +``` +**Significance**: SLA compliance, performance regression detection + +**Throughput Metrics** +- `operations_per_second`: Current system load +- `active_operations`: Concurrent operation count +- `queue_depth`: Backpressure indicators +- **Significance**: Scaling decisions, bottleneck identification + +**Error Metrics** +- `error_rate`: Percentage of failed operations +- `error_by_type`: Breakdown of error categories +- `timeout_count`: Slow operation detection +- **Significance**: Reliability monitoring, debugging + +### 3. 
**Cost Metrics** + +**LLM Costs** +```python +{ + "provider": "openai", + "model": "gpt-4", + "input_tokens": 150000, + "output_tokens": 50000, + "cost_usd": 7.50 +} +``` +**Significance**: Budget tracking, model selection optimization + +**Infrastructure Costs** +- `storage_operations`: Read/write cost tracking +- `bandwidth_usage`: Egress cost monitoring +- `compute_time`: Processing cost attribution +- **Significance**: TCO analysis, optimization opportunities + +### 4. **Storage Metrics** + +**Dataset Health** +- `fragment_efficiency`: Ratio of active to total rows +- `num_small_files`: Fragmentation indicator +- `storage_size_bytes`: Total storage footprint +- **Significance**: Compaction scheduling, performance optimization + +**Index Coverage** +- `indexed_fields`: Fields with indices +- `index_usage_rate`: How often indices are utilized +- `missing_indices`: Recommended indices +- **Significance**: Query optimization, performance tuning + +## Performance Impact Analysis + +### Zero-Overhead Design + +The monitoring system is designed for minimal impact: + +```python +class MetricsConfig: + enabled: bool = True # Can be completely disabled + max_memory_metrics: int = 10000 # Bounded memory usage + flush_interval_seconds: int = 60 # Batched I/O +``` + +**Memory Overhead** +- Fixed-size circular buffers (deque with maxlen) +- Aggregated metrics for fast access +- Automatic old metric eviction + +**CPU Overhead** +- Async collection (non-blocking) +- Batch aggregation in background tasks +- Minimal instrumentation in hot paths + +**I/O Overhead** +- Periodic batch flushing +- Lance-optimized writes +- Optional metric persistence + +### Benchmarking Results + +Typical overhead measurements: +- **Disabled**: 0% overhead +- **Enabled (memory only)**: <1% overhead +- **Enabled (with persistence)**: 1-3% overhead + +## Integration Patterns + +### 1. 
**Transparent Monitoring** + +```python +# Automatic instrumentation +@monitor.track_operation("document_search") +async def search_documents(query: str) -> List[Document]: + # Business logic unchanged + results = await dataset.search(query) + return results +``` + +### 2. **Custom Metrics** + +```python +# Application-specific metrics +await monitor.record_metric( + "custom_metric", + value=42.0, + tags={"feature": "new_algorithm", "version": "2.0"} +) +``` + +### 3. **Cost Attribution** + +```python +# Automatic LLM cost tracking +async with monitor.track_llm_operation("enhancement") as ctx: + response = await llm.complete(prompt) + ctx.record_tokens(response.usage) +``` + +### 4. **Performance Profiling** + +```python +# Detailed operation tracking +with monitor.profile("complex_operation") as profiler: + step1_result = await expensive_step1() + profiler.mark("step1_complete") + + step2_result = await expensive_step2() + profiler.mark("step2_complete") + + return combine_results(step1_result, step2_result) +``` + +## Monitoring Tools via MCP + +### Available Tools + +1. **get_usage_metrics** + - Document access patterns + - Query distribution + - Agent activity + - Temporal patterns + +2. **get_performance_metrics** + - Operation latencies + - Throughput metrics + - Error rates + - Performance history + +3. **get_cost_report** + - Cost breakdown by type + - Agent/operation attribution + - Daily trends + - Monthly projections + +4. **get_monitoring_status** + - System health + - Buffer utilization + - Active operations + - Configuration status + +5. 
**export_metrics** + - Prometheus format + - JSON export + - CSV for analysis + +### Tool Usage Examples + +```python +# Get performance metrics for the last hour +response = await mcp_client.call_tool( + "get_performance_metrics", + { + "minutes": 60, + "include_percentiles": True, + "operation_type": "tool_call" + } +) + +# Get cost report with projections +response = await mcp_client.call_tool( + "get_cost_report", + { + "start_time": "2024-01-01T00:00:00Z", + "group_by": "agent", + "include_projections": True + } +) + +# Export metrics for Prometheus +response = await mcp_client.call_tool( + "export_metrics", + { + "format": "prometheus", + "metric_types": ["usage", "performance"], + "labels": {"environment": "production"} + } +) +``` + +## Visualization and Alerting + +### Prometheus Integration + +```yaml +# prometheus.yml +scrape_configs: + - job_name: 'contextframe_mcp' + scrape_interval: 30s + static_configs: + - targets: ['localhost:8080/metrics'] +``` + +### Grafana Dashboards + +Recommended panels: +1. **Operations Overview** + - Request rate graph + - Latency percentiles + - Error rate gauge + - Active operations + +2. **Usage Analytics** + - Document access heatmap + - Query type distribution + - Agent activity timeline + - Collection usage + +3. **Cost Management** + - Daily cost trend + - Cost by provider + - Token usage graph + - Projected monthly cost + +4. 
**Storage Health** + - Fragment efficiency + - Storage growth + - Index coverage + - Compaction status + +### Alert Rules + +```yaml +# Example Prometheus alerts +groups: + - name: mcp_alerts + rules: + - alert: HighErrorRate + expr: contextframe_operation_error_rate > 5 + for: 5m + annotations: + summary: "High error rate detected" + + - alert: SlowQueries + expr: contextframe_operation_p99_latency_ms > 1000 + for: 10m + annotations: + summary: "Queries are slow" + + - alert: HighCost + expr: contextframe_daily_cost_usd > 100 + annotations: + summary: "Daily cost exceeds budget" +``` + +## Deployment Best Practices + +### 1. **Configuration** + +```python +# Recommended production configuration +metrics_config = MetricsConfig( + enabled=True, + retention_days=30, + aggregation_intervals=["1m", "5m", "1h", "1d"], + max_memory_metrics=50000, + flush_interval_seconds=60 +) + +pricing_config = PricingConfig.from_file("config/pricing.json") +``` + +### 2. **Monitoring Levels** + +**Development**: Full monitoring with debug metrics +```python +config.enabled = True +config.include_debug_metrics = True +config.flush_to_disk = False # Memory only +``` + +**Staging**: Production-like monitoring +```python +config.enabled = True +config.include_debug_metrics = False +config.flush_to_disk = True +config.retention_days = 7 +``` + +**Production**: Optimized monitoring +```python +config.enabled = True +config.include_debug_metrics = False +config.flush_to_disk = True +config.retention_days = 30 +config.sample_rate = 0.1 # Sample 10% for expensive metrics +``` + +### 3. **Performance Tuning** + +**High-Volume Deployments** +- Increase buffer sizes for burst handling +- Use sampling for expensive metrics +- Implement metric pre-aggregation +- Consider dedicated metrics storage + +**Resource-Constrained Environments** +- Reduce aggregation intervals +- Lower retention period +- Disable unused metric categories +- Use memory-only mode + +### 4. 
**Security Considerations** + +- Sanitize sensitive data in metrics +- Implement access controls for monitoring endpoints +- Encrypt metrics in transit +- Audit metric access + +## Analytics Tools via MCP + +### Dataset Analysis Tools + +1. **get_dataset_stats** + - Comprehensive dataset statistics + - Storage and fragment analysis + - Content distribution + - Index coverage + +2. **analyze_usage** + - Access pattern analysis + - Hot document identification + - Collection usage statistics + - Temporal patterns + +3. **query_performance** + - Query latency analysis + - Slow query identification + - Optimization recommendations + - Index effectiveness + +4. **relationship_analysis** + - Document graph structure + - Circular dependency detection + - Orphaned document finding + - Component analysis + +### Optimization Tools + +1. **optimize_storage** + - Dataset compaction + - Version cleanup + - Fragment optimization + - Space reclamation + +2. **index_recommendations** + - Missing index detection + - Redundant index identification + - Workload-based suggestions + - Coverage analysis + +3. **benchmark_operations** + - Search performance testing + - Write throughput measurement + - Concurrent operation testing + - Latency profiling + +## Recommendations + +### 1. **Start Simple** +- Enable basic monitoring in production +- Focus on key metrics (latency, errors, cost) +- Add advanced analytics as needed + +### 2. **Set Baselines** +- Establish performance baselines early +- Monitor trends, not just absolutes +- Use percentiles for SLA definition + +### 3. **Automate Responses** +- Set up automatic compaction schedules +- Implement cost alerts +- Create runbooks for common issues + +### 4. **Regular Reviews** +- Weekly performance reviews +- Monthly cost analysis +- Quarterly optimization sprints + +### 5. 
**Tool Selection** +- Use Prometheus for real-time metrics +- Grafana for visualization +- Lance datasets for long-term analysis +- Custom tools for specific needs + +## Conclusion + +The MCP monitoring and analytics implementation provides a comprehensive observability solution for ContextFrame deployments. With its zero-overhead design, rich metrics collection, and powerful analytics capabilities, it enables operators to maintain high-performance, cost-effective deployments while gaining deep insights into usage patterns and optimization opportunities. + +The key to effective monitoring is starting with core metrics and gradually expanding based on operational needs. The modular design allows for this incremental approach while maintaining the flexibility to add custom metrics and analytics as requirements evolve. \ No newline at end of file diff --git a/.claude/implementations/mcp_security_analysis.md b/.claude/implementations/mcp_security_analysis.md new file mode 100644 index 0000000..c9ecfa6 --- /dev/null +++ b/.claude/implementations/mcp_security_analysis.md @@ -0,0 +1,765 @@ +# MCP Security Implementation Analysis + +## Overview + +The MCP (Model Context Protocol) security implementation in ContextFrame provides a comprehensive, multi-layered security system designed to protect dataset access and operations. The implementation follows security best practices with defense-in-depth principles, providing authentication, authorization, rate limiting, and audit logging capabilities. + +## Architecture + +The security system is composed of several modular components that work together: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Client Request │ +└─────────────────────────────────┬───────────────────────────┘ + │ + ┌─────────────▼──────────────┐ + │ Security Middleware │ + │ ┌────────────────────────┐ │ + │ │ 1. 
Authentication │ │ + │ │ - API Key │ │ + │ │ - OAuth 2.1 │ │ + │ │ - JWT │ │ + │ └────────────┬───────────┘ │ + │ │ │ + │ ┌────────────▼───────────┐ │ + │ │ 2. Rate Limiting │ │ + │ │ - Global limits │ │ + │ │ - Per-client limits │ │ + │ │ - Operation limits │ │ + │ └────────────┬───────────┘ │ + │ │ │ + │ ┌────────────▼───────────┐ │ + │ │ 3. Authorization │ │ + │ │ - RBAC │ │ + │ │ - Resource policies │ │ + │ │ - Permission checks │ │ + │ └────────────┬───────────┘ │ + │ │ │ + │ ┌────────────▼───────────┐ │ + │ │ 4. Audit Logging │ │ + │ │ - Event recording │ │ + │ │ - Security events │ │ + │ │ - Compliance trail │ │ + │ └────────────────────────┘ │ + └─────────────┬──────────────┘ + │ + ┌─────────────▼──────────────┐ + │ MCP Handler │ + │ (Process Request) │ + └────────────────────────────┘ +``` + +## 1. Authentication Providers + +### 1.1 API Key Authentication (`auth.py`) + +**Purpose**: Simple, stateless authentication using pre-shared keys. + +**Implementation**: +```python +class APIKeyAuth(AuthProvider): + def __init__(self, api_keys: Dict[str, Dict[str, Any]]): + # Keys are hashed using SHA-256 for secure storage + self._hashed_keys = { + self._hash_key(key): metadata + for key, metadata in self.api_keys.items() + } +``` + +**Security Guarantees**: +- API keys are never stored in plaintext (SHA-256 hashed) +- Supports key expiration +- Constant-time comparison to prevent timing attacks +- Keys include metadata for permissions and roles + +**Configuration Example**: +```json +{ + "api_keys": { + "sk_live_...": { + "principal_id": "user-123", + "principal_name": "Production Service", + "principal_type": "service", + "permissions": ["documents.read", "collections.read"], + "roles": ["viewer"], + "expires_at": "2024-12-31T23:59:59Z" + } + } +} +``` + +**Best Practices**: +- Generate keys using `APIKeyAuth.generate_api_key()` (32 bytes, URL-safe) +- Rotate keys regularly +- Use different keys for different environments +- Never commit keys to version control + 
+### 1.2 OAuth 2.1 Provider (`oauth.py`) + +**Purpose**: Industry-standard authentication with support for authorization code flow. + +**Implementation Features**: +- PKCE (Proof Key for Code Exchange) support for enhanced security +- Authorization code flow with state validation +- Client credentials flow for service accounts +- Token introspection capabilities +- Configurable redirect URI validation + +**Security Guarantees**: +- PKCE prevents authorization code interception attacks +- State parameter prevents CSRF attacks +- Strict redirect URI validation +- Token expiration enforcement +- Secure token storage + +**Configuration Example**: +```python +oauth_config = OAuth2Config( + authorization_endpoint="https://auth.example.com/authorize", + token_endpoint="https://auth.example.com/token", + userinfo_endpoint="https://auth.example.com/userinfo", + client_id="contextframe-client", + client_secret="secret", # Store securely! + redirect_uri="http://localhost:8080/callback", + scopes=["openid", "profile", "email"], + require_pkce=True, + require_state=True, + allowed_redirect_uris=["http://localhost:8080/callback"] +) +``` + +**Integration Pattern**: +```python +# Generate authorization URL +provider = OAuth2Provider(oauth_config) +verifier, challenge = OAuth2Provider.generate_pkce_pair() +auth_url = provider.generate_authorization_url( + state=secrets.token_urlsafe(32), + code_challenge=challenge +) + +# Exchange code for token +context = await provider.authenticate({ + "code": authorization_code, + "code_verifier": verifier, + "redirect_uri": redirect_uri +}) +``` + +### 1.3 JWT Handler (`jwt.py`) + +**Purpose**: Stateless authentication using JSON Web Tokens. 
+ +**Implementation Features**: +- Support for RS256 (RSA) and HS256 (HMAC) algorithms +- Automatic RSA key pair generation for testing +- Comprehensive claim validation +- Token creation and verification +- Refresh token support + +**Security Guarantees**: +- Signature verification prevents token tampering +- Expiration validation prevents replay attacks +- Audience and issuer validation +- Support for custom claims + +**Configuration Example**: +```python +jwt_config = JWTConfig( + algorithm="RS256", + private_key=private_key_pem, # For signing + public_key=public_key_pem, # For verification + issuer="contextframe-mcp", + audience="contextframe-api", + token_lifetime=3600, # 1 hour + verify_exp=True, + verify_aud=True, + verify_iss=True +) +``` + +**Token Creation**: +```python +handler = JWTHandler(jwt_config) +token = handler.create_token( + principal_id="user-123", + principal_name="John Doe", + permissions={"documents.read", "collections.write"}, + roles={"editor"}, + additional_claims={"department": "engineering"} +) +``` + +### 1.4 Multi-Provider Authentication + +**Purpose**: Support multiple authentication methods simultaneously. + +**Implementation**: +```python +multi_auth = MultiAuthProvider([ + APIKeyAuth(api_keys), + OAuth2Provider(oauth_config), + JWTHandler(jwt_config) +]) + +# Tries each provider in order +context = await multi_auth.authenticate(credentials) +``` + +## 2. Authorization System (`authorization.py`) + +### 2.1 Role-Based Access Control (RBAC) + +**Standard Roles**: + +1. **Viewer**: Read-only access + - `documents.read` + - `collections.read` + +2. **Editor**: Read and write access + - All viewer permissions + - `documents.write` + - `collections.write` + - `tools.execute` + +3. **Admin**: Full access + - All permissions via wildcards + - `documents.*` + - `collections.*` + - `tools.*` + - `system.*` + +4. **Monitor**: Monitoring access + - `monitoring.read` + - `monitoring.export` + - `system.read` + +5. 
**Service**: Service account + - `documents.read` + - `collections.read` + - `tools.execute` + +### 2.2 Permission System + +**Wildcard Support**: +```python +# Permission "documents.*" matches: +# - documents.read +# - documents.write +# - documents.delete +# - documents.custom_action +``` + +**Permission Hierarchy**: +``` +documents.* +├── documents.read +├── documents.write +├── documents.delete +└── documents.admin + +collections.* +├── collections.read +├── collections.write +├── collections.delete +└── collections.admin + +tools.* +├── tools.execute +└── tools.admin + +system.* +├── system.read +└── system.admin + +monitoring.* +├── monitoring.read +├── monitoring.export +└── monitoring.admin +``` + +### 2.3 Resource-Level Policies + +**Purpose**: Fine-grained access control for specific resources. + +**Implementation**: +```python +# Policy for specific document access +policy = ResourcePolicy( + resource_type="document", + resource_id="sensitive-doc-*", # Wildcard pattern + permissions={"documents.read"}, + conditions={ + "principal_type": "user", + "department": {"$in": ["legal", "compliance"]} + } +) +``` + +**Condition Operators**: +- `$eq`: Exact match +- `$ne`: Not equal +- `$in`: Value in list +- `$regex`: Regular expression match + +### 2.4 Authorization Flow + +```python +access_control = AccessControl( + roles=STANDARD_ROLES, + policies=[policy1, policy2], + default_allow=False # Deny by default +) + +# Check authorization +if access_control.authorize( + context=security_context, + permission="documents.write", + resource_type="document", + resource_id="doc-123" +): + # Allowed + pass +else: + # Denied + raise AuthorizationError() +``` + +## 3. Rate Limiting (`rate_limiting.py`) + +### 3.1 Multi-Level Rate Limiting + +**Levels**: +1. **Global**: Overall system capacity +2. **Per-Client**: Individual client limits +3. 
**Per-Operation**: Operation-specific limits + +**Default Configuration**: +```python +config = RateLimitConfig( + # Global limits + global_requests_per_minute=600, + global_burst_size=100, + + # Per-client limits + client_requests_per_minute=60, + client_burst_size=10, + + # Operation-specific limits + operation_limits={ + "tools/call": (30, 5), # 30 rpm, burst 5 + "batch/*": (10, 2), # 10 rpm, burst 2 + "export/*": (5, 1), # 5 rpm, burst 1 + "resources/read": (120, 20), # 120 rpm, burst 20 + } +) +``` + +### 3.2 Rate Limiting Algorithms + +**Token Bucket** (Default for burst handling): +- Allows burst traffic up to bucket capacity +- Smooth refill rate +- Good for APIs with occasional spikes + +**Sliding Window** (Optional for strict limits): +- Precise request counting +- No burst allowance +- Better for strict rate enforcement + +### 3.3 Rate Limit Headers + +When rate limits are exceeded, the response includes: +``` +X-RateLimit-Limit: 60 +X-RateLimit-Remaining: 0 +X-RateLimit-Reset: 1634567890 +Retry-After: 42 +``` + +## 4. 
Audit Logging (`audit.py`) + +### 4.1 Event Types + +**Authentication Events**: +- `auth.success`: Successful authentication +- `auth.failure`: Failed authentication +- `auth.token_created`: New token issued +- `auth.token_revoked`: Token revoked + +**Authorization Events**: +- `authz.granted`: Access granted +- `authz.denied`: Access denied + +**Rate Limiting Events**: +- `rate_limit.exceeded`: Rate limit hit +- `rate_limit.reset`: Rate limit reset + +**Resource Access Events**: +- `resource.read`: Resource accessed +- `resource.write`: Resource modified +- `resource.delete`: Resource deleted + +**Security Configuration Events**: +- `security.config_changed`: Security settings modified +- `role.created/modified/deleted`: Role changes +- `policy.created/modified/deleted`: Policy changes + +### 4.2 Storage Backends + +**Memory** (Development): +```python +config = AuditConfig( + storage_backend="memory", + max_events_memory=10000 +) +``` + +**File** (Production): +```python +config = AuditConfig( + storage_backend="file", + file_path="/var/log/mcp/audit.log", + retention_days=90 +) +``` + +**Dataset** (Advanced): +```python +config = AuditConfig( + storage_backend="dataset", + dataset_path="/data/audit.lance", + retention_days=365 +) +``` + +### 4.3 Event Structure + +```python +@dataclass +class AuditEvent: + # Event metadata + event_id: str + timestamp: datetime + event_type: AuditEventType + + # Principal information + principal_id: str + principal_type: str + principal_name: str + auth_method: str + + # Request context + operation: str + resource_type: str + resource_id: str + request_id: str + session_id: str + + # Network context + client_ip: str + user_agent: str + + # Event details + success: bool + error_code: int + error_message: str + details: Dict[str, Any] + + # Computed + severity: str # "info", "warning", "error" +``` + +### 4.4 Search and Compliance + +```python +# Search audit events +events = await audit_logger.search_events( + 
event_types=[AuditEventType.AUTH_FAILURE], + principal_id="user-123", + start_time=datetime.now() - timedelta(days=7), + success=False, + limit=100 +) + +# Generate compliance report +for event in events: + print(f"{event.timestamp}: {event.principal_id} - {event.event_type}") +``` + +## 5. Security Integration (`integration.py`) + +### 5.1 Security Middleware + +The `SecurityMiddleware` class orchestrates all security components: + +```python +middleware = SecurityMiddleware( + auth_provider=multi_auth, + access_control=access_control, + rate_limiter=rate_limiter, + audit_logger=audit_logger, + anonymous_allowed=False, + anonymous_permissions={"documents.read"} +) +``` + +### 5.2 Request Flow + +1. **Authentication**: Extract and validate credentials +2. **Rate Limiting**: Check request limits +3. **Authorization**: Verify permissions +4. **Audit Logging**: Record security events +5. **Request Processing**: Execute if all checks pass + +### 5.3 Integration with MCP Server + +```python +# In server configuration +config = MCPConfig( + security_enabled=True, + auth_providers=["api_key", "oauth", "jwt"], + anonymous_allowed=False, + api_keys_file="/etc/mcp/api_keys.json", + oauth_config_file="/etc/mcp/oauth.json", + jwt_config_file="/etc/mcp/jwt.json", + audit_log_file="/var/log/mcp/audit.log" +) + +# Server automatically sets up security +server = ContextFrameMCPServer(dataset_path, config) +``` + +## 6. Threat Model and Security Guarantees + +### 6.1 Threats Addressed + +1. **Unauthorized Access** + - Mitigated by: Multi-factor authentication, strong key generation + - Residual risk: Compromised credentials + +2. **Privilege Escalation** + - Mitigated by: RBAC, resource policies, principle of least privilege + - Residual risk: Misconfigured roles + +3. **Denial of Service** + - Mitigated by: Rate limiting, resource quotas + - Residual risk: Distributed attacks + +4. 
**Token/Session Hijacking** + - Mitigated by: Token expiration, HTTPS enforcement, PKCE + - Residual risk: Man-in-the-middle attacks + +5. **Audit Trail Tampering** + - Mitigated by: Append-only logs, secure storage + - Residual risk: Privileged user abuse + +### 6.2 Security Guarantees + +1. **Authentication**: Every request is authenticated (unless anonymous allowed) +2. **Authorization**: All operations checked against permissions +3. **Non-repudiation**: Audit trail for all security events +4. **Rate Protection**: Prevents resource exhaustion +5. **Defense in Depth**: Multiple security layers + +## 7. Configuration Examples + +### 7.1 Development Configuration + +```python +# Minimal security for development +dev_config = MCPConfig( + security_enabled=True, + auth_providers=["api_key"], + anonymous_allowed=True, + anonymous_permissions=["documents.read", "collections.read"] +) +``` + +### 7.2 Production Configuration + +```python +# Full security for production +prod_config = MCPConfig( + security_enabled=True, + auth_providers=["oauth", "jwt"], + anonymous_allowed=False, + + # OAuth configuration + oauth_config_file="/secure/oauth_config.json", + + # JWT configuration + jwt_config_file="/secure/jwt_config.json", + + # Audit logging + audit_log_file="/var/log/mcp/audit.log", + audit_retention_days=365, + + # HTTPS only + http_ssl_cert="/certs/server.crt", + http_ssl_key="/certs/server.key" +) +``` + +### 7.3 Service Account Configuration + +```python +# For automated services +service_config = { + "api_keys": { + "svc_analytics_prod": { + "principal_id": "svc-analytics", + "principal_type": "service", + "principal_name": "Analytics Service", + "permissions": ["documents.read", "monitoring.read"], + "roles": ["service"], + "expires_at": "2025-01-01T00:00:00Z" + } + } +} +``` + +## 8. Best Practices + +### 8.1 Authentication + +1. **API Keys**: + - Rotate every 90 days + - Use environment variables + - Different keys per environment + - Monitor key usage + +2. 
**OAuth**: + - Always use PKCE + - Validate redirect URIs + - Short-lived access tokens + - Secure client secrets + +3. **JWT**: + - Use RS256 in production + - Short expiration times + - Include minimal claims + - Rotate signing keys + +### 8.2 Authorization + +1. **Roles**: + - Start with standard roles + - Create custom roles sparingly + - Regular permission audits + - Document role purposes + +2. **Policies**: + - Be specific with resource IDs + - Use conditions for context + - Test policy combinations + - Version policy changes + +### 8.3 Rate Limiting + +1. **Configuration**: + - Start conservative + - Monitor actual usage + - Adjust based on patterns + - Different limits per client type + +2. **Handling**: + - Respect Retry-After headers + - Implement exponential backoff + - Cache frequently accessed data + - Batch operations when possible + +### 8.4 Audit Logging + +1. **Events**: + - Log all security events + - Include sufficient context + - Avoid logging sensitive data + - Use structured logging + +2. **Retention**: + - Follow compliance requirements + - Archive old logs + - Regular log analysis + - Automated alerting + +## 9. 
Common Pitfalls + +### 9.1 Authentication + +- **Pitfall**: Storing API keys in code + - **Solution**: Use environment variables or secure vaults + +- **Pitfall**: Long-lived tokens + - **Solution**: Short expiration with refresh tokens + +- **Pitfall**: Weak key generation + - **Solution**: Use cryptographically secure methods + +### 9.2 Authorization + +- **Pitfall**: Over-permissive roles + - **Solution**: Principle of least privilege + +- **Pitfall**: Complex policy interactions + - **Solution**: Keep policies simple and testable + +- **Pitfall**: Missing resource checks + - **Solution**: Always validate resource access + +### 9.3 Rate Limiting + +- **Pitfall**: Too restrictive limits + - **Solution**: Monitor and adjust based on usage + +- **Pitfall**: No burst allowance + - **Solution**: Use token bucket for flexibility + +- **Pitfall**: Ignoring operation costs + - **Solution**: Different limits for expensive operations + +### 9.4 Audit Logging + +- **Pitfall**: Logging sensitive data + - **Solution**: Implement redaction + +- **Pitfall**: No log rotation + - **Solution**: Implement retention policies + +- **Pitfall**: Ignoring audit logs + - **Solution**: Regular review and alerting + +## 10. 
Security Deployment Checklist + +### Pre-Deployment + +- [ ] Generate strong API keys +- [ ] Configure OAuth providers +- [ ] Set up JWT signing keys +- [ ] Define roles and permissions +- [ ] Create resource policies +- [ ] Configure rate limits +- [ ] Set up audit logging +- [ ] Enable HTTPS +- [ ] Test authentication flows +- [ ] Test authorization rules +- [ ] Verify rate limiting +- [ ] Check audit trail + +### Post-Deployment + +- [ ] Monitor authentication failures +- [ ] Review authorization denials +- [ ] Track rate limit hits +- [ ] Analyze audit logs +- [ ] Update security policies +- [ ] Rotate credentials +- [ ] Security training +- [ ] Incident response plan + +## Conclusion + +The MCP security implementation provides a robust, enterprise-grade security system for protecting ContextFrame datasets. By following the configuration examples and best practices outlined in this document, you can ensure your MCP deployment is secure, compliant, and scalable. + +The modular design allows you to enable only the security features you need, while the comprehensive nature ensures you can meet any security requirement. Regular monitoring of audit logs and security metrics will help maintain a strong security posture over time. \ No newline at end of file diff --git a/.claude/implementations/mcp_tools_analysis.md b/.claude/implementations/mcp_tools_analysis.md new file mode 100644 index 0000000..3d0c26f --- /dev/null +++ b/.claude/implementations/mcp_tools_analysis.md @@ -0,0 +1,1025 @@ +# MCP Tools Analysis - ContextFrame + +This comprehensive guide documents all available MCP tools in the ContextFrame system, organized by category with detailed information about their purpose, usage, and integration. + +## Table of Contents + +1. [Core Document Tools](#1-core-document-tools) +2. [Batch Operation Tools](#2-batch-operation-tools) +3. [Collection Management Tools](#3-collection-management-tools) +4. [Enhancement Tools](#4-enhancement-tools) +5. 
[Analytics Tools](#5-analytics-tools) +6. [Monitoring Tools](#6-monitoring-tools) +7. [Subscription Tools](#7-subscription-tools) + +--- + +## 1. Core Document Tools + +Located in `contextframe/mcp/tools.py`, these tools provide fundamental CRUD operations for documents. + +### 1.1 search_documents + +**Purpose**: Search documents using vector, text, or hybrid search methods. + +**Input Parameters**: +```json +{ + "query": "string (required)", + "search_type": "vector | text | hybrid (default: hybrid)", + "limit": "integer 1-1000 (default: 10)", + "filter": "SQL filter expression (optional)" +} +``` + +**Output Format**: +```json +{ + "documents": [ + { + "uuid": "string", + "content": "string", + "metadata": {}, + "score": "float (optional)" + } + ], + "total_count": "integer", + "search_type_used": "string" +} +``` + +**Performance Considerations**: +- Vector search requires embedding generation (API call) +- Text search uses Lance's full-text search capabilities +- Hybrid search falls back to text if vector fails +- Filters are applied at the storage layer for efficiency + +**Error Handling**: +- `EmbeddingError`: When vector embedding generation fails +- `FilterError`: When SQL filter expression is invalid +- `InvalidSearchType`: When search type is not recognized + +### 1.2 add_document + +**Purpose**: Add a new document to the dataset with optional chunking and embedding generation. 
+ +**Input Parameters**: +```json +{ + "content": "string (required)", + "metadata": "object (optional)", + "generate_embedding": "boolean (default: true)", + "collection": "string (optional)", + "chunk_size": "integer 100-10000 (optional)", + "chunk_overlap": "integer 0-1000 (optional)" +} +``` + +**Output Format**: +```json +{ + "document": { + "uuid": "string", + "content": "string", + "metadata": {} + } +} +``` +Or for chunked documents: +```json +{ + "documents": [...], + "total_chunks": "integer" +} +``` + +**Performance Considerations**: +- Chunking is performed synchronously +- Embedding generation requires API call per chunk +- Large documents should use chunking for better retrieval + +### 1.3 get_document + +**Purpose**: Retrieve a document by its UUID. + +**Input Parameters**: +```json +{ + "document_id": "string (required)", + "include_content": "boolean (default: true)", + "include_metadata": "boolean (default: true)", + "include_embeddings": "boolean (default: false)" +} +``` + +**Output Format**: +```json +{ + "document": { + "uuid": "string", + "content": "string (optional)", + "metadata": "object (optional)", + "embedding": "array (optional)" + } +} +``` + +**Error Handling**: +- `DocumentNotFound`: When UUID doesn't exist + +### 1.4 list_documents + +**Purpose**: List documents with pagination and filtering. + +**Input Parameters**: +```json +{ + "limit": "integer 1-1000 (default: 100)", + "offset": "integer >= 0 (default: 0)", + "filter": "SQL filter expression (optional)", + "order_by": "SQL order expression (optional)", + "include_content": "boolean (default: false)" +} +``` + +**Output Format**: +```json +{ + "documents": [...], + "total_count": "integer", + "offset": "integer", + "limit": "integer" +} +``` + +### 1.5 update_document + +**Purpose**: Update an existing document's content or metadata. 
+ +**Input Parameters**: +```json +{ + "document_id": "string (required)", + "content": "string (optional)", + "metadata": "object (optional)", + "regenerate_embedding": "boolean (default: false)" +} +``` + +**Implementation Details**: +- Uses atomic delete + add pattern +- Preserves UUID and relationships +- Metadata updates are merged, not replaced + +### 1.6 delete_document + +**Purpose**: Delete a document from the dataset. + +**Input Parameters**: +```json +{ + "document_id": "string (required)" +} +``` + +**Output Format**: +```json +{ + "deleted": true, + "document_id": "string" +} +``` + +--- + +## 2. Batch Operation Tools + +Located in `contextframe/mcp/batch/tools.py`, these tools enable efficient bulk operations. + +### 2.1 batch_search + +**Purpose**: Execute multiple searches in parallel with progress tracking. + +**Input Parameters**: +```json +{ + "queries": [ + { + "query": "string", + "search_type": "vector | text | hybrid", + "limit": "integer", + "filter": "string" + } + ], + "max_parallel": "integer (default: 10)" +} +``` + +**Output Format**: +```json +{ + "searches_completed": "integer", + "searches_failed": "integer", + "results": [ + { + "query": "string", + "success": "boolean", + "results": [...], + "count": "integer", + "error": "string (optional)" + } + ], + "errors": [...] +} +``` + +**Performance Considerations**: +- Controlled parallelism prevents resource exhaustion +- Each search maintains independent error handling +- Progress reported via transport adapter + +### 2.2 batch_add + +**Purpose**: Add multiple documents efficiently with atomic transaction support. 
+ +**Input Parameters**: +```json +{ + "documents": [ + { + "content": "string", + "metadata": "object" + } + ], + "shared_settings": { + "metadata": "object", + "generate_embeddings": "boolean" + }, + "atomic": "boolean (default: false)" +} +``` + +**Transaction Support**: +- Atomic mode: All succeed or all fail +- Non-atomic: Individual failures don't stop batch +- Shared settings applied to all documents + +### 2.3 batch_update + +**Purpose**: Update multiple documents by filter or IDs. + +**Input Parameters**: +```json +{ + "document_ids": ["string"] | null, + "filter": "string | null", + "updates": { + "metadata_updates": "object", + "content_template": "string", + "regenerate_embeddings": "boolean" + }, + "max_documents": "integer (default: 1000)" +} +``` + +**Features**: +- Template-based content updates +- Bulk metadata merging +- Batch embedding regeneration + +### 2.4 batch_delete + +**Purpose**: Delete multiple documents with safety checks. + +**Input Parameters**: +```json +{ + "document_ids": ["string"] | null, + "filter": "string | null", + "confirm_count": "integer | null", + "dry_run": "boolean (default: false)" +} +``` + +**Safety Features**: +- Dry run preview +- Count confirmation +- Detailed deletion report + +### 2.5 batch_enhance + +**Purpose**: Enhance multiple documents with LLM operations. + +**Input Parameters**: +```json +{ + "document_ids": ["string"] | null, + "filter": "string | null", + "enhancements": ["context", "tags", "title", "metadata"], + "purpose": "string", + "batch_size": "integer (optional)" +} +``` + +**Enhancement Types**: +- Context: Add explanatory context +- Tags: Generate relevant tags +- Title: Improve document titles +- Metadata: Extract structured data + +### 2.6 batch_extract + +**Purpose**: Extract content from multiple files/sources. 
+ +**Input Parameters**: +```json +{ + "sources": [ + { + "type": "file | url", + "path": "string", + "url": "string" + } + ], + "add_to_dataset": "boolean (default: true)", + "shared_metadata": "object", + "collection": "string", + "continue_on_error": "boolean (default: false)" +} +``` + +**Supported Formats**: +- Markdown (.md) +- JSON (.json) +- YAML (.yaml, .yml) +- CSV (.csv) +- Plain text (fallback) + +### 2.7 batch_export + +**Purpose**: Export documents in bulk to various formats. + +**Input Parameters**: +```json +{ + "document_ids": ["string"] | null, + "filter": "string | null", + "format": "json | jsonl | csv | parquet", + "output_path": "string", + "include_embeddings": "boolean (default: false)", + "chunk_size": "integer (optional)" +} +``` + +**Export Features**: +- Chunked exports for large datasets +- Format-specific optimizations +- Embedding inclusion option + +### 2.8 batch_import + +**Purpose**: Import documents from files. + +**Input Parameters**: +```json +{ + "source_path": "string", + "format": "json | jsonl | csv | parquet", + "mapping": "object (field mappings)", + "validation": { + "max_errors": "integer", + "require_schema_match": "boolean" + }, + "generate_embeddings": "boolean" +} +``` + +--- + +## 3. Collection Management Tools + +Located in `contextframe/mcp/collections/tools.py`, these tools manage document collections and hierarchies. + +### 3.1 create_collection + +**Purpose**: Create a new collection with header and initial configuration. 
+ +**Input Parameters**: +```json +{ + "name": "string (required)", + "description": "string (optional)", + "parent_collection": "string (optional)", + "template": "string (optional)", + "metadata": "object (optional)", + "initial_members": ["string"] (optional) +} +``` + +**Features**: +- Hierarchical collections via parent_collection +- Template application for standardization +- Automatic relationship management +- Collection header document creation + +### 3.2 update_collection + +**Purpose**: Update collection properties and membership. + +**Input Parameters**: +```json +{ + "collection_id": "string (required)", + "name": "string (optional)", + "description": "string (optional)", + "metadata_updates": "object (optional)", + "add_members": ["string"] (optional), + "remove_members": ["string"] (optional) +} +``` + +**Membership Management**: +- Batch add/remove operations +- Automatic relationship updates +- Member count tracking + +### 3.3 delete_collection + +**Purpose**: Delete a collection and optionally its members. + +**Input Parameters**: +```json +{ + "collection_id": "string (required)", + "delete_members": "boolean (default: false)", + "recursive": "boolean (default: false)" +} +``` + +**Deletion Options**: +- Keep members (remove relationships only) +- Delete members +- Recursive deletion of subcollections + +### 3.4 list_collections + +**Purpose**: List collections with filtering and statistics. + +**Input Parameters**: +```json +{ + "parent_id": "string (optional)", + "include_empty": "boolean (default: true)", + "include_stats": "boolean (default: false)", + "sort_by": "name | created_at | member_count", + "limit": "integer (default: 100)", + "offset": "integer (default: 0)" +} +``` + +**Output Features**: +- Hierarchical structure +- Member statistics +- Size calculations +- Metadata aggregation + +### 3.5 move_documents + +**Purpose**: Move documents between collections. 
+ +**Input Parameters**: +```json +{ + "document_ids": ["string"] (required), + "source_collection": "string (optional)", + "target_collection": "string (optional)", + "update_metadata": "boolean (default: false)" +} +``` + +**Movement Features**: +- Batch operations +- Metadata inheritance +- Relationship updates + +### 3.6 get_collection_stats + +**Purpose**: Get detailed statistics for a collection. + +**Input Parameters**: +```json +{ + "collection_id": "string (required)", + "include_subcollections": "boolean (default: false)", + "include_member_details": "boolean (default: false)" +} +``` + +**Statistics Provided**: +- Member counts (direct and total) +- Size calculations +- Tag aggregation +- Date ranges +- Member type distribution + +--- + +## 4. Enhancement Tools + +Located in `contextframe/mcp/enhancement_tools.py`, these tools use LLM to enhance documents. + +### 4.1 enhance_context + +**Purpose**: Add context to explain document relevance for a specific purpose. + +**Input Parameters**: +```json +{ + "document_id": "string (required)", + "purpose": "string (required)", + "current_context": "string (optional)" +} +``` + +**Use Cases**: +- Adding domain-specific context +- Explaining document significance +- Improving searchability + +### 4.2 extract_metadata + +**Purpose**: Extract custom metadata from document using LLM. + +**Input Parameters**: +```json +{ + "document_id": "string (required)", + "schema": "string (prompt describing what to extract)", + "format": "json | text (default: json)" +} +``` + +**Examples**: +- Extract key facts and dates +- Identify entities and relationships +- Generate structured summaries + +### 4.3 generate_tags + +**Purpose**: Generate relevant tags for a document. 
+ +**Input Parameters**: +```json +{ + "document_id": "string (required)", + "tag_types": "string (default: 'topics, technologies, concepts')", + "max_tags": "integer 1-20 (default: 5)" +} +``` + +### 4.4 improve_title + +**Purpose**: Generate or improve document title. + +**Input Parameters**: +```json +{ + "document_id": "string (required)", + "style": "descriptive | technical | concise (default: descriptive)" +} +``` + +### 4.5 enhance_for_purpose + +**Purpose**: Enhance document with purpose-specific metadata. + +**Input Parameters**: +```json +{ + "document_id": "string (required)", + "purpose": "string (required)", + "fields": ["context", "tags", "custom_metadata"] (default: all) +} +``` + +### 4.6 extract_from_file + +**Purpose**: Extract content and metadata from various file formats. + +**Input Parameters**: +```json +{ + "file_path": "string (required)", + "add_to_dataset": "boolean (default: true)", + "generate_embedding": "boolean (default: true)", + "collection": "string (optional)" +} +``` + +### 4.7 batch_extract (Directory Processing) + +**Purpose**: Extract content from multiple files in a directory. + +**Input Parameters**: +```json +{ + "directory": "string (required)", + "patterns": ["*.md", "*.txt", "*.json"] (default patterns), + "recursive": "boolean (default: true)", + "add_to_dataset": "boolean (default: true)", + "collection": "string (optional)" +} +``` + +--- + +## 5. Analytics Tools + +Located in `contextframe/mcp/analytics/tools.py`, these tools provide dataset analysis and optimization. + +### 5.1 get_dataset_stats + +**Purpose**: Get comprehensive dataset statistics including storage, content, and index metrics. 
+ +**Input Parameters**: +```json +{ + "include_details": "boolean (default: true)", + "include_fragments": "boolean (default: true)", + "sample_size": "integer 100-100000 (optional)" +} +``` + +**Statistics Provided**: +- Storage metrics (size, fragments, versions) +- Content analysis (types, sizes, dates) +- Index information +- Relationship graph metrics + +### 5.2 analyze_usage + +**Purpose**: Analyze dataset usage patterns and access frequencies. + +**Input Parameters**: +```json +{ + "time_range": "string (e.g., '7d', '24h', '30d')", + "group_by": "hour | day | week (default: hour)", + "include_patterns": "boolean (default: true)" +} +``` + +**Analysis Features**: +- Access frequency heatmaps +- Popular documents +- Query patterns +- User behavior analysis + +### 5.3 query_performance + +**Purpose**: Analyze query performance and identify optimization opportunities. + +**Input Parameters**: +```json +{ + "time_range": "string (default: '7d')", + "query_type": "vector | text | hybrid | filter (optional)", + "min_duration_ms": "number (default: 0)" +} +``` + +**Performance Metrics**: +- Query execution times +- Index utilization +- Scan statistics +- Slow query identification + +### 5.4 relationship_analysis + +**Purpose**: Analyze document relationships and graph structure. + +**Input Parameters**: +```json +{ + "max_depth": "integer 1-10 (default: 3)", + "relationship_types": ["string"] (optional), + "include_orphans": "boolean (default: true)" +} +``` + +**Graph Analysis**: +- Connectivity metrics +- Cluster identification +- Orphaned documents +- Relationship type distribution + +### 5.5 optimize_storage + +**Purpose**: Optimize dataset storage through compaction and cleanup. 
+
+**Input Parameters**:
+```json
+{
+  "operations": ["compact", "vacuum", "reindex"],
+  "dry_run": "boolean (default: true)",
+  "target_version": "integer (optional)"
+}
+```
+
+**Optimization Operations**:
+- Compact: Merge small fragments
+- Vacuum: Remove old versions
+- Reindex: Rebuild indexes
+
+### 5.6 index_recommendations
+
+**Purpose**: Get recommendations for index improvements.
+
+**Input Parameters**:
+```json
+{
+  "analyze_queries": "boolean (default: true)",
+  "workload_type": "search | analytics | mixed (default: mixed)"
+}
+```
+
+**Recommendations**:
+- Missing index suggestions
+- Unused index identification
+- Index type optimization
+- Configuration tuning
+
+### 5.7 benchmark_operations
+
+**Purpose**: Benchmark dataset operations to measure performance.
+
+**Input Parameters**:
+```json
+{
+  "operations": ["search", "insert", "update", "scan"],
+  "sample_size": "integer 1-10000 (default: 100)",
+  "concurrency": "integer 1-100 (default: 1)"
+}
+```
+
+**Benchmark Metrics**:
+- Operation throughput
+- Latency percentiles
+- Resource utilization
+- Scalability analysis
+
+### 5.8 export_metrics
+
+**Purpose**: Export dataset metrics for monitoring systems.
+
+**Input Parameters**:
+```json
+{
+  "format": "prometheus | json | csv (default: json)",
+  "metrics": ["string"] (optional, default: all),
+  "labels": "object (optional)"
+}
+```
+
+**Export Formats**:
+- Prometheus: For Grafana/Prometheus
+- JSON: For custom processing
+- CSV: For spreadsheet analysis
+
+---
+
+## 6. Monitoring Tools
+
+Located in `contextframe/mcp/monitoring/tools.py`, these tools track system performance and usage.
+
+### 6.1 get_usage_metrics
+
+**Purpose**: Get usage metrics for documents and queries. 
+ +**Input Parameters**: +```json +{ + "start_time": "ISO datetime (default: 1 hour ago)", + "end_time": "ISO datetime (default: now)", + "group_by": "hour | day | week (default: hour)", + "include_details": "boolean (default: false)" +} +``` + +**Metrics Provided**: +- Query counts and types +- Document access patterns +- Agent activity +- Top documents and queries + +### 6.2 get_performance_metrics + +**Purpose**: Get performance metrics for MCP operations. + +**Input Parameters**: +```json +{ + "operation_type": "string (optional)", + "minutes": "integer (default: 60)", + "include_percentiles": "boolean (default: true)" +} +``` + +**Performance Data**: +- Operation latencies +- Error rates +- Throughput metrics +- Response time percentiles + +### 6.3 get_cost_report + +**Purpose**: Get cost attribution report for MCP operations. + +**Input Parameters**: +```json +{ + "start_time": "ISO datetime (default: 24 hours ago)", + "end_time": "ISO datetime (default: now)", + "group_by": "agent | operation | provider (default: agent)", + "include_projections": "boolean (default: true)" +} +``` + +**Cost Analysis**: +- LLM API costs +- Storage costs +- Bandwidth costs +- Monthly projections +- Optimization recommendations + +### 6.4 get_monitoring_status + +**Purpose**: Get overall monitoring system status. + +**Output Includes**: +- System health status +- Configuration details +- Buffer statistics +- Active operation counts + +### 6.5 export_metrics (Monitoring) + +**Purpose**: Export metrics to various formats for external systems. + +**Input Parameters**: +```json +{ + "format": "prometheus | json | csv (default: json)", + "metric_types": ["usage", "performance", "cost", "all"], + "include_raw": "boolean (default: false)" +} +``` + +--- + +## 7. Subscription Tools + +Located in `contextframe/mcp/subscriptions/tools.py`, these tools enable real-time change monitoring. + +### 7.1 subscribe_changes + +**Purpose**: Create a subscription to monitor dataset changes. 
+ +**Input Parameters**: +```json +{ + "resource_type": "documents | collections | all (default: all)", + "filters": "object (optional)", + "options": { + "polling_interval": "integer (default: 5)", + "include_data": "boolean (default: false)", + "batch_size": "integer (default: 100)" + } +} +``` + +**Output Format**: +```json +{ + "subscription_id": "string", + "poll_token": "string", + "polling_interval": "integer" +} +``` + +**Implementation**: +- Polling-based change detection +- Efficient version comparison +- Configurable batch sizes + +### 7.2 poll_changes + +**Purpose**: Poll for changes since the last poll. + +**Input Parameters**: +```json +{ + "subscription_id": "string (required)", + "poll_token": "string (optional for first poll)", + "timeout": "integer 0-300 (default: 30)" +} +``` + +**Long Polling**: +- Waits up to timeout for changes +- Returns immediately if changes available +- Efficient for real-time updates + +### 7.3 unsubscribe + +**Purpose**: Cancel an active subscription. + +**Input Parameters**: +```json +{ + "subscription_id": "string (required)" +} +``` + +**Cleanup**: +- Stops change monitoring +- Returns final poll token +- Cleans up resources + +### 7.4 get_subscriptions + +**Purpose**: Get list of active subscriptions. + +**Input Parameters**: +```json +{ + "resource_type": "documents | collections | all (optional)" +} +``` + +**Subscription Info**: +- Active subscription IDs +- Resource types monitored +- Configuration details +- Creation timestamps + +--- + +## Integration Patterns + +### Tool Composition + +Many operations benefit from combining multiple tools: + +1. **Document Enrichment Pipeline**: + ``` + extract_from_file → add_document → enhance_context → generate_tags + ``` + +2. **Collection Migration**: + ``` + create_collection → batch_search → move_documents → get_collection_stats + ``` + +3. 
**Performance Analysis**: + ``` + get_dataset_stats → query_performance → index_recommendations → optimize_storage + ``` + +### Error Recovery + +All tools implement consistent error handling: + +1. **Validation Errors**: Return `InvalidParams` with details +2. **Resource Errors**: Return specific errors (e.g., `DocumentNotFound`) +3. **System Errors**: Return `InternalError` with safe error messages + +### Performance Best Practices + +1. **Batch Operations**: Use batch tools for bulk operations +2. **Filtering**: Apply filters at the storage layer +3. **Projections**: Request only needed fields +4. **Pagination**: Use offset/limit for large result sets +5. **Dry Runs**: Test destructive operations first + +### Monitoring Integration + +1. **Usage Tracking**: All operations automatically tracked +2. **Performance Metrics**: Latencies recorded for analysis +3. **Cost Attribution**: API calls attributed to agents +4. **Export Options**: Multiple formats for external systems + +--- + +## Conclusion + +The ContextFrame MCP tool system provides comprehensive functionality for document management, from basic CRUD operations to advanced analytics and monitoring. The modular design allows for flexible composition of operations while maintaining consistent error handling and performance characteristics. + +Key strengths: +- **Scalability**: Batch operations and efficient storage layer +- **Flexibility**: Composable tools for complex workflows +- **Observability**: Built-in monitoring and analytics +- **Safety**: Transactions, dry runs, and validation +- **Integration**: Standard formats and protocols + +The tool system continues to evolve with new capabilities being added to support emerging use cases and performance requirements. 
\ No newline at end of file diff --git a/.claude/implementations/mcp_transport_analysis.md b/.claude/implementations/mcp_transport_analysis.md new file mode 100644 index 0000000..7aec7a9 --- /dev/null +++ b/.claude/implementations/mcp_transport_analysis.md @@ -0,0 +1,405 @@ +# MCP Transport Layer Analysis: HTTP-First Architecture + +## Overview + +The ContextFrame MCP transport layer implements a sophisticated abstraction that enables seamless communication between MCP clients and servers across different transport mechanisms. The architecture prioritizes HTTP as the primary transport while maintaining full compatibility with stdio for CLI usage. + +## Transport Architecture + +### Core Abstraction + +The transport layer is built on a clean abstraction defined in `contextframe/mcp/core/transport.py`: + +```python +class TransportAdapter(ABC): + """Base class for transport adapters. + + This abstraction ensures that all MCP features (tools, resources, + subscriptions, etc.) work identically across different transports. + """ +``` + +Key abstract methods: +- `send_message()` / `receive_message()`: Core message passing +- `send_progress()`: Progress updates with transport-appropriate delivery +- `handle_subscription()`: Streaming changes based on transport capabilities +- `supports_streaming`: Capability detection for transport features + +### Transport Implementations + +#### 1. 
HTTP Transport (Primary) + +The HTTP transport (`contextframe/mcp/transports/http/`) is the production-ready, scalable solution: + +**Architecture:** +- **FastAPI-based server** with automatic OpenAPI documentation +- **RESTful endpoints** wrapping JSON-RPC for convenience +- **SSE (Server-Sent Events)** for optional real-time streaming +- **Comprehensive security** including OAuth 2.1, CORS, and rate limiting + +**Key Components:** +- `HttpAdapter`: Manages HTTP-specific features like SSE streams and operation tracking +- `MCPHTTPServer`: FastAPI application with all MCP endpoints +- `SSEManager`: Handles multiple concurrent SSE connections with lifecycle management +- `HTTPTransportConfig`: Extensive configuration options for production deployments + +#### 2. Stdio Transport (CLI/Development) + +The stdio transport (`contextframe/mcp/transports/stdio.py`) provides: +- JSON-RPC over stdin/stdout +- Buffered streaming for non-streaming environments +- Progress collection in responses +- Polling-based subscriptions + +## Key Design Decisions + +### 1. HTTP as Primary Transport + +**Rationale:** +- **Scalability**: HTTP servers can handle thousands of concurrent connections +- **Security**: Mature security ecosystem (TLS, OAuth, CORS) +- **Interoperability**: Works with any HTTP client library +- **Monitoring**: Standard HTTP metrics and logging +- **Load Balancing**: Can deploy behind standard load balancers + +**Benefits:** +- Production-ready from day one +- Native browser support +- Extensive tooling ecosystem +- Standard deployment patterns + +### 2. SSE as Optional Enhancement + +Server-Sent Events provide real-time streaming when needed: + +```python +class HttpAdapter(TransportAdapter): + """HTTP transport adapter with optional SSE streaming support. + + Note: HTTP with JSON responses is the primary transport method. SSE should + only be used when real-time streaming is specifically required. 
+ """ +``` + +**Use Cases:** +- Progress tracking for long-running operations +- Real-time dataset change notifications +- Streaming batch operation results + +**Design Choice:** SSE is optional because: +- Most operations complete quickly (< 1 second) +- Polling is sufficient for many use cases +- Reduces complexity for simple integrations +- SSE connections consume server resources + +### 3. REST Endpoints vs JSON-RPC + +The implementation provides both: + +**JSON-RPC Endpoint:** +``` +POST /mcp/v1/jsonrpc +``` + +**Convenience REST Endpoints:** +``` +POST /mcp/v1/initialize +GET /mcp/v1/tools/list +POST /mcp/v1/tools/call +GET /mcp/v1/resources/list +POST /mcp/v1/resources/read +``` + +**Rationale:** +- JSON-RPC for protocol compliance +- REST for developer ergonomics +- Both use the same underlying handlers +- Allows gradual migration + +### 4. WebSocket Considerations + +WebSockets were considered but not implemented because: +- SSE provides sufficient real-time capabilities +- Simpler client implementation (especially in browsers) +- Better compatibility with HTTP infrastructure +- Lower server resource usage +- Automatic reconnection support + +## Integration Patterns + +### 1. Tool Integration + +Tools are transport-agnostic through the adapter pattern: + +```python +class MessageHandler: + def __init__(self, dataset: FrameDataset, adapter: TransportAdapter): + self.adapter = adapter + # Tools work identically regardless of transport +``` + +### 2. Progress Reporting + +Transport-appropriate progress delivery: + +```python +# HTTP: Real-time SSE streaming +async def send_progress(self, progress: Progress): + if operation_id in self._operation_progress: + await queue.put({ + "type": "progress", + "data": {...} + }) + +# Stdio: Buffered in response +async def send_progress(self, progress: Progress): + self._current_progress.append(progress) +``` + +### 3. 
Error Propagation + +Consistent error handling across transports: +- MCP protocol errors (JSON-RPC error codes) +- HTTP status codes for HTTP transport +- Detailed error messages in all cases + +### 4. Request/Response Correlation + +- **HTTP**: Natural request/response pairs +- **Stdio**: JSON-RPC id field for correlation +- **SSE**: Event IDs for stream correlation + +## Performance and Scaling + +### 1. Connection Management + +```python +class SSEManager: + def __init__(self, max_connections: int = 1000, max_age_seconds: int = 3600): + # Automatic cleanup of old connections + # Connection limits to prevent resource exhaustion +``` + +### 2. Request Pipelining + +HTTP/2 support enables: +- Multiple concurrent requests +- Header compression +- Stream multiplexing +- Server push (future enhancement) + +### 3. Load Balancing + +Stateless design enables horizontal scaling: +- No server-side session state +- Operations tracked by ID +- Can deploy multiple instances +- Standard HTTP load balancers work + +### 4. Resource Efficiency + +```python +# Token bucket rate limiting +class RateLimiter: + def __init__(self, requests_per_minute: int = 60, burst: int = 10): + # Prevents resource exhaustion + # Fair resource allocation +``` + +## Configuration and Deployment + +### 1. Comprehensive Configuration + +```python +@dataclass +class HTTPTransportConfig: + # Server settings + host: str = "0.0.0.0" + port: int = 8080 + + # Security + auth_enabled: bool = False + cors_enabled: bool = True + rate_limit_enabled: bool = True + ssl_enabled: bool = False + + # Performance + sse_max_connections: int = 1000 + max_request_size: int = 10 * 1024 * 1024 # 10MB + request_timeout: int = 300 # 5 minutes +``` + +### 2. Environment Variable Support + +```bash +# All settings configurable via environment +MCP_HTTP_HOST=0.0.0.0 +MCP_HTTP_PORT=8080 +MCP_HTTP_AUTH_ENABLED=true +MCP_HTTP_AUTH_SECRET_KEY=your-secret +``` + +### 3. 
Production Features + +- Health checks (`/health`, `/ready`) +- Prometheus metrics (`/metrics`) +- Structured logging +- Graceful shutdown +- Connection draining + +## Client Implementation Guidance + +### 1. Simple Operations (Recommended) + +```python +# Use standard HTTP POST for most operations +response = await client.post("/mcp/v1/tools/call", json={ + "name": "search_documents", + "arguments": {"query": "machine learning"} +}) +``` + +### 2. Progress Tracking (When Needed) + +```python +# Only use SSE for long-running operations +if response.headers.get("X-Operation-Id"): + # Connect to SSE for progress + async with sse_client(f"/mcp/v1/sse/progress/{operation_id}"): + # Handle progress events +``` + +### 3. Change Subscriptions (Advanced) + +```python +# SSE for real-time updates +eventSource = new EventSource("/mcp/v1/sse/subscribe?resource_type=documents") +eventSource.onmessage = (event) => { + // Handle changes +} +``` + +## Migration Path + +### From Stdio to HTTP + +1. **Minimal Changes**: Same tool names and arguments +2. **Transport Selection**: Command-line flag or config +3. **Gradual Migration**: Can run both transports simultaneously +4. **Backward Compatibility**: No breaking changes + +### Example Migration + +```bash +# Before (stdio) +python -m contextframe.mcp dataset.lance + +# After (HTTP) +python -m contextframe.mcp dataset.lance --transport http --port 8080 + +# Both (transition period) +python -m contextframe.mcp dataset.lance --transport both --port 8080 +``` + +## Production Deployment Patterns + +### 1. Basic Deployment + +```yaml +# docker-compose.yml +services: + mcp-server: + image: contextframe/mcp-server + ports: + - "8080:8080" + environment: + - MCP_HTTP_PORT=8080 + - MCP_HTTP_AUTH_ENABLED=true + volumes: + - ./data:/data +``` + +### 2. 
High Availability + +``` + ┌─────────────┐ + │Load Balancer│ + └──────┬──────┘ + │ + ┌──────────────┼──────────────┐ + │ │ │ + ┌─────▼─────┐ ┌─────▼─────┐ ┌─────▼─────┐ + │MCP Server 1│ │MCP Server 2│ │MCP Server 3│ + └─────┬─────┘ └─────┬─────┘ └─────┬─────┘ + │ │ │ + └──────────────┼──────────────┘ + │ + ┌──────▼──────┐ + │Shared Storage│ + │ (S3/GCS) │ + └─────────────┘ +``` + +### 3. Security Best Practices + +- Always use TLS in production +- Enable authentication for public endpoints +- Configure CORS appropriately +- Set rate limits based on capacity +- Monitor and alert on anomalies + +## Performance Characteristics + +### Latency Targets + +- Simple operations: < 50ms +- Search operations: < 200ms +- Batch operations: Progress within 1s +- SSE connection: < 100ms establishment + +### Throughput + +- HTTP: 1000+ requests/second per instance +- SSE: 1000+ concurrent connections +- Batch: Limited by dataset operations, not transport + +### Resource Usage + +- Memory: ~100MB base + connections +- CPU: Minimal for transport layer +- Network: Efficient with HTTP/2 + +## Future Enhancements + +### 1. HTTP/3 Support +- Further latency reduction +- Better mobile performance +- Improved reliability + +### 2. GraphQL Endpoint +- More flexible queries +- Reduced over-fetching +- Better client caching + +### 3. gRPC Transport +- For high-performance scenarios +- Bi-directional streaming +- Strong typing + +### 4. WebTransport +- Future web standard +- Lower latency than WebSockets +- Better browser integration + +## Conclusion + +The MCP transport layer successfully abstracts communication details while providing production-ready HTTP transport as the primary mechanism. The architecture balances simplicity, performance, and extensibility, making ContextFrame suitable for both development and production deployments. 
+ +Key achievements: +- **Transport agnostic**: Tools work identically across transports +- **Production ready**: Comprehensive security and scaling features +- **Developer friendly**: Simple HTTP APIs with optional streaming +- **Future proof**: Extensible to new transport mechanisms +- **Battle tested**: Extensive test coverage and error handling + +The HTTP-first approach positions ContextFrame for cloud-native deployments while maintaining the simplicity needed for local development and CLI usage. \ No newline at end of file diff --git a/contextframe/connectors/__init__.py b/contextframe/connectors/__init__.py new file mode 100644 index 0000000..652c50d --- /dev/null +++ b/contextframe/connectors/__init__.py @@ -0,0 +1,23 @@ +"""External system connectors for importing data into ContextFrame. + +This module provides connectors to import data from various external systems +like GitHub, Linear, Google Drive, etc. into ContextFrame datasets. +""" + +from contextframe.connectors.base import ( + AuthType, + ConnectorConfig, + SourceConnector, + SyncResult, +) +from contextframe.connectors.github import GitHubConnector +from contextframe.connectors.linear import LinearConnector + +__all__ = [ + "SourceConnector", + "ConnectorConfig", + "SyncResult", + "AuthType", + "GitHubConnector", + "LinearConnector", +] diff --git a/contextframe/connectors/base.py b/contextframe/connectors/base.py new file mode 100644 index 0000000..8a84535 --- /dev/null +++ b/contextframe/connectors/base.py @@ -0,0 +1,282 @@ +"""Base classes and interfaces for external system connectors.""" + +import logging +from abc import ABC, abstractmethod +from contextframe import FrameDataset, FrameRecord +from contextframe.schema import RecordType +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum +from pathlib import Path +from typing import Any, Dict, List, Optional, Protocol, runtime_checkable + + +class AuthType(Enum): + """Supported authentication types 
for connectors.""" + + API_KEY = "api_key" + OAUTH = "oauth" + BASIC = "basic" + TOKEN = "token" + NONE = "none" + + +@dataclass +class ConnectorConfig: + """Configuration for a source connector.""" + + name: str + """Name of the connector instance.""" + + auth_type: AuthType = AuthType.NONE + """Type of authentication to use.""" + + auth_config: dict[str, Any] = field(default_factory=dict) + """Authentication configuration (API keys, tokens, etc).""" + + sync_config: dict[str, Any] = field(default_factory=dict) + """Sync-specific configuration (filters, mappings, etc).""" + + rate_limit: int | None = None + """Maximum requests per minute (if applicable).""" + + timeout: int = 30 + """Request timeout in seconds.""" + + retry_count: int = 3 + """Number of retries for failed requests.""" + + +@dataclass +class SyncResult: + """Result of a sync operation.""" + + success: bool + """Whether the sync completed successfully.""" + + frames_created: int = 0 + """Number of new frames created.""" + + frames_updated: int = 0 + """Number of existing frames updated.""" + + frames_failed: int = 0 + """Number of frames that failed to import.""" + + errors: list[str] = field(default_factory=list) + """List of error messages.""" + + warnings: list[str] = field(default_factory=list) + """List of warning messages.""" + + start_time: datetime = field(default_factory=datetime.now) + """When the sync started.""" + + end_time: datetime | None = None + """When the sync completed.""" + + metadata: dict[str, Any] = field(default_factory=dict) + """Additional sync metadata (last cursor, etc).""" + + def add_error(self, error: str) -> None: + """Add an error message.""" + self.errors.append(error) + + def add_warning(self, warning: str) -> None: + """Add a warning message.""" + self.warnings.append(warning) + + def complete(self) -> None: + """Mark the sync as complete.""" + self.end_time = datetime.now() + + @property + def duration(self) -> float | None: + """Duration of the sync in 
seconds.""" + if self.end_time: + return (self.end_time - self.start_time).total_seconds() + return None + + +@runtime_checkable +class AuthProvider(Protocol): + """Protocol for authentication providers.""" + + def authenticate(self) -> dict[str, Any]: + """Authenticate and return auth headers/tokens.""" + ... + + def refresh(self) -> dict[str, Any]: + """Refresh authentication if needed.""" + ... + + def is_valid(self) -> bool: + """Check if current auth is valid.""" + ... + + +class SourceConnector(ABC): + """Base class for all external system connectors.""" + + def __init__(self, config: ConnectorConfig, dataset: FrameDataset): + """Initialize the connector. + + Args: + config: Connector configuration + dataset: Target FrameDataset to import into + """ + self.config = config + self.dataset = dataset + self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") + self._auth_provider: AuthProvider | None = None + + @abstractmethod + def validate_connection(self) -> bool: + """Validate that the connector can connect to the source system. + + Returns: + True if connection is valid, False otherwise + """ + pass + + @abstractmethod + def discover_content(self) -> dict[str, Any]: + """Discover available content in the source system. + + Returns: + Dictionary describing available content structure + """ + pass + + @abstractmethod + def sync(self, incremental: bool = True) -> SyncResult: + """Sync content from the source system. + + Args: + incremental: Whether to perform incremental sync (vs full sync) + + Returns: + Result of the sync operation + """ + pass + + @abstractmethod + def map_to_frame(self, source_data: dict[str, Any]) -> FrameRecord | None: + """Map source system data to a FrameRecord. + + Args: + source_data: Data from the source system + + Returns: + FrameRecord if mapping successful, None otherwise + """ + pass + + def get_last_sync_state(self) -> dict[str, Any] | None: + """Get the last sync state for incremental syncs. 
+ + Returns: + Last sync state if available + """ + # Look for a dataset header or sync state frame + try: + results = self.dataset.search( + f"record_type:{RecordType.DATASET_HEADER} AND title:'{self.config.name} Sync State'", + limit=1, + ) + if results: + frame = results[0] + return frame.metadata.get("sync_state", {}) + except Exception as e: + self.logger.warning(f"Failed to get last sync state: {e}") + return None + + def save_sync_state(self, state: dict[str, Any]) -> None: + """Save the current sync state for incremental syncs. + + Args: + state: Sync state to save + """ + try: + # Create or update sync state frame + sync_frame = FrameRecord( + title=f"{self.config.name} Sync State", + text_content=f"Sync state for {self.config.name} connector", + metadata={ + "record_type": RecordType.DATASET_HEADER, + "sync_state": state, + "connector_name": self.config.name, + "last_sync": datetime.now().isoformat(), + }, + ) + + # Try to update existing or create new + existing = self.dataset.search( + f"record_type:{RecordType.DATASET_HEADER} AND title:'{self.config.name} Sync State'", + limit=1, + ) + if existing: + self.dataset.update(existing[0].metadata["uuid"], sync_frame) + else: + self.dataset.add(sync_frame) + + except Exception as e: + self.logger.error(f"Failed to save sync state: {e}") + + def create_collection(self, name: str, description: str) -> str: + """Create a collection for organizing imported content. + + Args: + name: Collection name + description: Collection description + + Returns: + Collection ID + """ + collection_header = FrameRecord( + title=name, + text_content=description, + metadata={ + "record_type": RecordType.COLLECTION_HEADER, + "connector": self.config.name, + "created_by": "connector", + }, + ) + + self.dataset.add(collection_header) + return collection_header.metadata["uuid"] + + def batch_import( + self, frames: list[FrameRecord], batch_size: int = 100 + ) -> SyncResult: + """Import frames in batches for efficiency. 
+ + Args: + frames: List of frames to import + batch_size: Number of frames per batch + + Returns: + Result of the import operation + """ + result = SyncResult(success=True) + + for i in range(0, len(frames), batch_size): + batch = frames[i : i + batch_size] + try: + for frame in batch: + try: + self.dataset.add(frame) + result.frames_created += 1 + except Exception as e: + result.frames_failed += 1 + result.add_error( + f"Failed to add frame '{frame.metadata.get('title', 'Unknown')}': {e}" + ) + + except Exception as e: + result.success = False + result.add_error(f"Batch import failed: {e}") + break + + result.complete() + return result diff --git a/contextframe/connectors/github.py b/contextframe/connectors/github.py new file mode 100644 index 0000000..3ec6278 --- /dev/null +++ b/contextframe/connectors/github.py @@ -0,0 +1,386 @@ +"""GitHub connector for importing repository content into ContextFrame.""" + +import base64 +import mimetypes +from contextframe import FrameRecord +from contextframe.connectors.base import ( + AuthType, + ConnectorConfig, + SourceConnector, + SyncResult, +) +from contextframe.schema import RecordType +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Set +from urllib.parse import urlparse + + +class GitHubConnector(SourceConnector): + """Connector for importing GitHub repository content.""" + + def __init__(self, config: ConnectorConfig, dataset): + """Initialize GitHub connector. 
+ + Args: + config: Connector configuration with GitHub-specific settings + dataset: Target FrameDataset + """ + super().__init__(config, dataset) + + # Validate required config + self.owner = config.sync_config.get("owner") + self.repo = config.sync_config.get("repo") + self.branch = config.sync_config.get("branch", "main") + self.paths = config.sync_config.get("paths", ["/"]) + self.file_patterns = config.sync_config.get("file_patterns", ["*"]) + self.exclude_patterns = config.sync_config.get("exclude_patterns", []) + + if not self.owner or not self.repo: + raise ValueError( + "GitHub connector requires 'owner' and 'repo' in sync_config" + ) + + # Set up GitHub API client + self._setup_client() + + def _setup_client(self): + """Set up GitHub API client.""" + try: + from github import Github + from github.GithubException import GithubException + + self.GithubException = GithubException + except ImportError: + raise ImportError( + "PyGithub is required for GitHub connector. Install with: pip install PyGithub" + ) + + # Initialize client based on auth type + if self.config.auth_type == AuthType.TOKEN: + token = self.config.auth_config.get("token") + if not token: + raise ValueError("GitHub token required for authentication") + self.client = Github(token) + elif self.config.auth_type == AuthType.NONE: + # Public repos only + self.client = Github() + else: + raise ValueError( + f"Unsupported auth type for GitHub: {self.config.auth_type}" + ) + + # Get repository object + try: + self.github_repo = self.client.get_repo(f"{self.owner}/{self.repo}") + except Exception as e: + raise ValueError( + f"Failed to access repository {self.owner}/{self.repo}: {e}" + ) + + def validate_connection(self) -> bool: + """Validate GitHub connection and repository access.""" + try: + # Try to get repository info + _ = self.github_repo.full_name + return True + except Exception as e: + self.logger.error(f"Failed to validate GitHub connection: {e}") + return False + + def 
discover_content(self) -> dict[str, Any]: + """Discover repository structure and content.""" + discovery = { + "repository": { + "owner": self.owner, + "name": self.repo, + "full_name": self.github_repo.full_name, + "description": self.github_repo.description, + "default_branch": self.github_repo.default_branch, + "size": self.github_repo.size, + "language": self.github_repo.language, + "topics": list(self.github_repo.get_topics()), + }, + "branches": [], + "file_tree": {}, + "stats": { + "total_files": 0, + "file_types": {}, + "total_size": 0, + }, + } + + # Get branches + try: + for branch in self.github_repo.get_branches(): + discovery["branches"].append(branch.name) + except Exception as e: + self.logger.warning(f"Failed to get branches: {e}") + + # Analyze file tree for specified paths + for path in self.paths: + try: + self._discover_path(path.strip("/"), discovery) + except Exception as e: + self.logger.warning(f"Failed to discover path {path}: {e}") + + return discovery + + def _discover_path(self, path: str, discovery: dict[str, Any]): + """Recursively discover files in a path.""" + try: + contents = self.github_repo.get_contents(path, ref=self.branch) + + if not isinstance(contents, list): + contents = [contents] + + for content in contents: + if content.type == "file": + # Check if file matches patterns + if self._matches_patterns(content.path): + discovery["stats"]["total_files"] += 1 + discovery["stats"]["total_size"] += content.size + + # Track file types + ext = Path(content.name).suffix.lower() + discovery["stats"]["file_types"][ext] = ( + discovery["stats"]["file_types"].get(ext, 0) + 1 + ) + + # Add to tree + self._add_to_tree(discovery["file_tree"], content.path) + + elif content.type == "dir": + # Recursively discover subdirectories + self._discover_path(content.path, discovery) + + except self.GithubException as e: + if e.status == 404: + self.logger.warning(f"Path not found: {path}") + else: + raise + + def _add_to_tree(self, tree: dict, path: 
str): + """Add a file path to the tree structure.""" + parts = path.split("/") + current = tree + + for i, part in enumerate(parts): + if i == len(parts) - 1: + # Leaf node (file) + current[part] = None + else: + # Directory node + if part not in current: + current[part] = {} + current = current[part] + + def _matches_patterns(self, path: str) -> bool: + """Check if a file path matches the configured patterns.""" + path_obj = Path(path) + + # Check exclude patterns first + for pattern in self.exclude_patterns: + if path_obj.match(pattern): + return False + + # Check include patterns + if not self.file_patterns: + return True + + for pattern in self.file_patterns: + if pattern == "*" or path_obj.match(pattern): + return True + + return False + + def sync(self, incremental: bool = True) -> SyncResult: + """Sync repository content to ContextFrame.""" + result = SyncResult(success=True) + + # Get last sync state if incremental + last_sync_state = None + if incremental: + last_sync_state = self.get_last_sync_state() + + # Create collection for this repository + collection_id = self.create_collection( + f"{self.owner}/{self.repo}", + f"GitHub repository: {self.github_repo.description or 'No description'}", + ) + + # Track processed files + processed_files: set[str] = set() + + # Process each configured path + for path in self.paths: + try: + self._sync_path( + path.strip("/"), + collection_id, + result, + last_sync_state, + processed_files, + ) + except Exception as e: + result.add_error(f"Failed to sync path {path}: {e}") + result.success = False + + # Save sync state + if result.success: + new_state = { + "last_sync": datetime.now().isoformat(), + "branch": self.branch, + "commit": self.github_repo.get_branch(self.branch).commit.sha, + "processed_files": list(processed_files), + } + self.save_sync_state(new_state) + + result.complete() + return result + + def _sync_path( + self, + path: str, + collection_id: str, + result: SyncResult, + last_sync_state: dict[str, Any] 
| None, + processed_files: set[str], + ): + """Sync a specific path in the repository.""" + try: + contents = self.github_repo.get_contents(path, ref=self.branch) + + if not isinstance(contents, list): + contents = [contents] + + for content in contents: + if content.type == "file" and self._matches_patterns(content.path): + # Check if file needs update + if incremental and last_sync_state: + last_commit = last_sync_state.get("commit") + if last_commit: + # Check if file changed since last sync + commits = list( + self.github_repo.get_commits( + sha=self.branch, + path=content.path, + since=datetime.fromisoformat( + last_sync_state["last_sync"] + ), + ) + ) + if not commits: + continue + + # Process file + frame = self.map_to_frame(content) + if frame: + frame.metadata["collection"] = collection_id + frame.metadata["collection_id"] = collection_id + + try: + # Check if frame exists + existing = self.dataset.search( + f"source_url:'{content.html_url}'", limit=1 + ) + + if existing: + self.dataset.update(existing[0].metadata["uuid"], frame) + result.frames_updated += 1 + else: + self.dataset.add(frame) + result.frames_created += 1 + + processed_files.add(content.path) + + except Exception as e: + result.frames_failed += 1 + result.add_error(f"Failed to import {content.path}: {e}") + + elif content.type == "dir": + # Recursively sync subdirectories + self._sync_path( + content.path, + collection_id, + result, + last_sync_state, + processed_files, + ) + + except self.GithubException as e: + if e.status != 404: + raise + + def map_to_frame(self, source_data: Any) -> FrameRecord | None: + """Map GitHub file to FrameRecord.""" + try: + # Get file content + content_data = source_data.decoded_content + + # Determine content type + mime_type, _ = mimetypes.guess_type(source_data.name) + + # Create base metadata + metadata = { + "title": source_data.name, + "record_type": RecordType.DOCUMENT, + "source_type": "github", + "source_file": source_data.path, + "source_url": 
source_data.html_url, + "custom_metadata": { + "x_github_sha": source_data.sha, + "x_github_size": str(source_data.size), + "x_github_type": source_data.type, + }, + } + + # Handle different file types + text_content = None + raw_data = None + raw_data_type = None + + if mime_type and mime_type.startswith("text/"): + # Text file - decode as string + try: + text_content = content_data.decode("utf-8") + except UnicodeDecodeError: + # Fall back to binary + raw_data = content_data + raw_data_type = mime_type or "application/octet-stream" + elif mime_type and mime_type.startswith("image/"): + # Image file - store as binary + raw_data = content_data + raw_data_type = mime_type + text_content = f"Image file: {source_data.name}" + else: + # Try to decode as text, fall back to binary + try: + text_content = content_data.decode("utf-8") + except UnicodeDecodeError: + raw_data = content_data + raw_data_type = mime_type or "application/octet-stream" + text_content = f"Binary file: {source_data.name}" + + # Extract README content as context + if source_data.name.lower() in ["readme.md", "readme.txt", "readme"]: + metadata["context"] = text_content[:1000] if text_content else "" + + # Create frame + frame = FrameRecord( + text_content=text_content, + metadata=metadata, + raw_data=raw_data, + raw_data_type=raw_data_type, + ) + + # Add relationships if this is a code file + if mime_type in ["text/x-python", "text/x-java", "text/javascript"]: + # TODO: Extract imports and create relationships + pass + + return frame + + except Exception as e: + self.logger.error(f"Failed to map file {source_data.path}: {e}") + return None diff --git a/contextframe/connectors/linear.py b/contextframe/connectors/linear.py new file mode 100644 index 0000000..504be87 --- /dev/null +++ b/contextframe/connectors/linear.py @@ -0,0 +1,638 @@ +"""Linear connector for importing teams, projects, and issues into ContextFrame.""" + +import json +from contextframe import FrameRecord +from 
def __init__(self, config, dataset):
    """Initialize Linear connector.

    Args:
        config: Connector configuration with Linear-specific settings
        dataset: Target FrameDataset
    """
    super().__init__(config, dataset)

    opts = config.sync_config
    # Which entity types to import.
    self.sync_teams = opts.get("sync_teams", True)
    self.sync_projects = opts.get("sync_projects", True)
    self.sync_issues = opts.get("sync_issues", True)
    # Optional filters; empty lists mean "no restriction".
    self.team_ids = opts.get("team_ids", [])
    self.project_ids = opts.get("project_ids", [])
    self.issue_states = opts.get("issue_states", [])
    self.include_archived = opts.get("include_archived", False)
    self.include_comments = opts.get("include_comments", True)

    # Set up Linear API client.
    self._setup_client()

def _setup_client(self):
    """Create the Linear API client from the configured credentials."""
    try:
        from linear import LinearClient
    except ImportError:
        raise ImportError(
            "linear-python is required for Linear connector. Install with: pip install linear-python"
        )

    # Only API-key authentication is supported.
    if self.config.auth_type != AuthType.API_KEY:
        raise ValueError("Linear connector requires API key authentication")

    api_key = self.config.auth_config.get("api_key")
    if not api_key:
        raise ValueError("Linear API key required for authentication")
    self.client = LinearClient(api_key)

def validate_connection(self) -> bool:
    """Return True when the configured credentials can reach Linear."""
    try:
        viewer = self.client.viewer
        self.logger.info(f"Connected to Linear as: {viewer.name}")
    except Exception as e:
        self.logger.error(f"Failed to validate Linear connection: {e}")
        return False
    return True

def discover_content(self) -> dict[str, Any]:
    """Summarise the workspace: viewer/org info, teams, projects, issue stats."""
    discovery: dict[str, Any] = {
        "workspace": {
            "viewer": {},
            "organization": {},
        },
        "teams": [],
        "projects": [],
        "issue_stats": {
            "total": 0,
            "by_state": {},
            "by_priority": {},
            "by_team": {},
        },
    }

    try:
        # Who we are authenticated as.
        me = self.client.viewer
        discovery["workspace"]["viewer"] = {
            "id": me.id,
            "name": me.name,
            "email": me.email,
        }

        org = self.client.organization
        discovery["workspace"]["organization"] = {
            "id": org.id,
            "name": org.name,
            "url_key": org.url_key,
        }

        # Teams (optionally restricted to the configured ids).
        for team in self.client.teams(include_archived=self.include_archived):
            if self.team_ids and team.id not in self.team_ids:
                continue
            discovery["teams"].append(
                {
                    "id": team.id,
                    "name": team.name,
                    "key": team.key,
                    "description": team.description,
                }
            )

        # Projects (optionally restricted to the configured ids).
        for project in self.client.projects(include_archived=self.include_archived):
            if self.project_ids and project.id not in self.project_ids:
                continue
            discovery["projects"].append(
                {
                    "id": project.id,
                    "name": project.name,
                    "description": project.description,
                    "state": project.state,
                    "team_ids": [team.id for team in project.teams],
                }
            )

        # Aggregate issue counts by state / priority / team.
        stats = discovery["issue_stats"]
        for issue in self.client.issues(include_archived=self.include_archived):
            stats["total"] += 1

            state_name = issue.state.name if issue.state else "No State"
            stats["by_state"][state_name] = stats["by_state"].get(state_name, 0) + 1

            priority = issue.priority or 0
            stats["by_priority"][priority] = (
                stats["by_priority"].get(priority, 0) + 1
            )

            team_name = issue.team.name if issue.team else "No Team"
            stats["by_team"][team_name] = stats["by_team"].get(team_name, 0) + 1

    except Exception as e:
        self.logger.error(f"Failed to discover Linear content: {e}")
        discovery["error"] = str(e)

    return discovery
list(synced_data["projects"].keys()), + "synced_issues": len(synced_data["issues"]), + } + self.save_sync_state(new_state) + + result.complete() + return result + + def _sync_teams( + self, + parent_collection_id: str, + result: SyncResult, + last_sync_state: dict[str, Any] | None, + synced_data: dict[str, Any], + ): + """Sync Linear teams.""" + try: + teams = self.client.teams(include_archived=self.include_archived) + + for team in teams: + if self.team_ids and team.id not in self.team_ids: + continue + + # Check if needs update + if incremental and last_sync_state: + if team.updated_at <= datetime.fromisoformat( + last_sync_state["last_sync"] + ): + continue + + # Create team collection + team_collection_id = self.create_collection( + f"Team: {team.name}", team.description or f"Linear team {team.key}" + ) + + # Create team frame + frame = self._map_team_to_frame( + team, parent_collection_id, team_collection_id + ) + if frame: + try: + existing = self.dataset.search( + f"source_url:'https://linear.app/team/{team.id}'", limit=1 + ) + + if existing: + self.dataset.update(existing[0].metadata["uuid"], frame) + result.frames_updated += 1 + else: + self.dataset.add(frame) + result.frames_created += 1 + + synced_data["teams"][team.id] = team_collection_id + + except Exception as e: + result.frames_failed += 1 + result.add_error(f"Failed to sync team {team.name}: {e}") + + except Exception as e: + result.add_error(f"Failed to sync teams: {e}") + result.success = False + + def _sync_projects( + self, + parent_collection_id: str, + result: SyncResult, + last_sync_state: dict[str, Any] | None, + synced_data: dict[str, Any], + ): + """Sync Linear projects.""" + try: + projects = self.client.projects(include_archived=self.include_archived) + + for project in projects: + if self.project_ids and project.id not in self.project_ids: + continue + + # Check if needs update + if incremental and last_sync_state: + if project.updated_at <= datetime.fromisoformat( + 
last_sync_state["last_sync"] + ): + continue + + # Create project collection + project_collection_id = self.create_collection( + f"Project: {project.name}", project.description or "Linear project" + ) + + # Create project frame + frame = self._map_project_to_frame( + project, parent_collection_id, project_collection_id + ) + if frame: + # Add team relationships + for team in project.teams: + if team.id in synced_data["teams"]: + frame.add_relationship( + "member_of", id=synced_data["teams"][team.id] + ) + + try: + existing = self.dataset.search( + f"source_url:'https://linear.app/project/{project.id}'", + limit=1, + ) + + if existing: + self.dataset.update(existing[0].metadata["uuid"], frame) + result.frames_updated += 1 + else: + self.dataset.add(frame) + result.frames_created += 1 + + synced_data["projects"][project.id] = project_collection_id + + except Exception as e: + result.frames_failed += 1 + result.add_error(f"Failed to sync project {project.name}: {e}") + + except Exception as e: + result.add_error(f"Failed to sync projects: {e}") + result.success = False + + def _sync_issues( + self, + parent_collection_id: str, + result: SyncResult, + last_sync_state: dict[str, Any] | None, + synced_data: dict[str, Any], + ): + """Sync Linear issues.""" + try: + # Build filter for issues + filters = {} + if self.team_ids: + filters["team"] = {"id": {"in": self.team_ids}} + if self.project_ids: + filters["project"] = {"id": {"in": self.project_ids}} + if self.issue_states: + filters["state"] = {"name": {"in": self.issue_states}} + + issues = self.client.issues( + filter=filters if filters else None, + include_archived=self.include_archived, + ) + + for issue in issues: + # Check if needs update + if incremental and last_sync_state: + if issue.updated_at <= datetime.fromisoformat( + last_sync_state["last_sync"] + ): + continue + + # Determine collection + collection_id = parent_collection_id + if issue.project and issue.project.id in synced_data["projects"]: + 
collection_id = synced_data["projects"][issue.project.id] + elif issue.team and issue.team.id in synced_data["teams"]: + collection_id = synced_data["teams"][issue.team.id] + + # Create issue frame + frame = self._map_issue_to_frame(issue, collection_id) + if frame: + # Add relationships + if issue.parent: + frame.add_relationship("child_of", id=issue.parent.id) + if issue.related_issues: + for related in issue.related_issues: + frame.add_relationship("related", id=related.id) + + try: + existing = self.dataset.search( + f"source_url:'https://linear.app/issue/{issue.identifier}'", + limit=1, + ) + + if existing: + self.dataset.update(existing[0].metadata["uuid"], frame) + result.frames_updated += 1 + else: + self.dataset.add(frame) + result.frames_created += 1 + + synced_data["issues"].add(issue.id) + + # Sync comments if enabled + if self.include_comments: + self._sync_issue_comments(issue, collection_id, result) + + except Exception as e: + result.frames_failed += 1 + result.add_error( + f"Failed to sync issue {issue.identifier}: {e}" + ) + + except Exception as e: + result.add_error(f"Failed to sync issues: {e}") + result.success = False + + def _sync_issue_comments(self, issue: Any, collection_id: str, result: SyncResult): + """Sync comments for an issue.""" + try: + comments = issue.comments + for comment in comments: + frame = self._map_comment_to_frame(comment, issue, collection_id) + if frame: + try: + existing = self.dataset.search( + f"source_url:'https://linear.app/comment/{comment.id}'", + limit=1, + ) + + if existing: + self.dataset.update(existing[0].metadata["uuid"], frame) + result.frames_updated += 1 + else: + self.dataset.add(frame) + result.frames_created += 1 + + except Exception as e: + # Don't fail the whole sync for comment errors + result.add_warning( + f"Failed to sync comment on {issue.identifier}: {e}" + ) + + except Exception as e: + result.add_warning(f"Failed to sync comments for {issue.identifier}: {e}") + + def map_to_frame(self, 
source_data: dict[str, Any]) -> FrameRecord | None: + """Generic mapping - delegates to specific mappers.""" + data_type = source_data.get("_type") + + if data_type == "team": + return self._map_team_to_frame(source_data, "", "") + elif data_type == "project": + return self._map_project_to_frame(source_data, "", "") + elif data_type == "issue": + return self._map_issue_to_frame(source_data, "") + elif data_type == "comment": + return self._map_comment_to_frame(source_data, None, "") + else: + self.logger.warning(f"Unknown Linear data type: {data_type}") + return None + + def _map_team_to_frame( + self, team: Any, parent_collection_id: str, team_collection_id: str + ) -> FrameRecord | None: + """Map Linear team to FrameRecord.""" + try: + metadata = { + "title": f"Team: {team.name}", + "record_type": RecordType.COLLECTION_HEADER, + "source_type": "linear_team", + "source_url": f"https://linear.app/team/{team.id}", + "collection": parent_collection_id, + "collection_id": parent_collection_id, + "custom_metadata": { + "x_linear_id": team.id, + "x_linear_key": team.key, + "x_team_collection": team_collection_id, + }, + } + + content = f"# {team.name}\n\n" + content += f"**Key:** {team.key}\n\n" + if team.description: + content += f"## Description\n\n{team.description}\n\n" + + return FrameRecord( + text_content=content, + metadata=metadata, + context=team.description or f"Linear team {team.key}", + ) + + except Exception as e: + self.logger.error(f"Failed to map team {team.name}: {e}") + return None + + def _map_project_to_frame( + self, project: Any, parent_collection_id: str, project_collection_id: str + ) -> FrameRecord | None: + """Map Linear project to FrameRecord.""" + try: + metadata = { + "title": f"Project: {project.name}", + "record_type": RecordType.COLLECTION_HEADER, + "source_type": "linear_project", + "source_url": f"https://linear.app/project/{project.id}", + "collection": parent_collection_id, + "collection_id": parent_collection_id, + "status": 
project.state, + "custom_metadata": { + "x_linear_id": project.id, + "x_project_collection": project_collection_id, + "x_project_state": project.state, + }, + } + + content = f"# {project.name}\n\n" + content += f"**State:** {project.state}\n\n" + if project.description: + content += f"## Description\n\n{project.description}\n\n" + if project.start_date: + content += f"**Start Date:** {project.start_date}\n" + if project.target_date: + content += f"**Target Date:** {project.target_date}\n" + + return FrameRecord( + text_content=content, + metadata=metadata, + context=project.description or "Linear project", + ) + + except Exception as e: + self.logger.error(f"Failed to map project {project.name}: {e}") + return None + + def _map_issue_to_frame(self, issue: Any, collection_id: str) -> FrameRecord | None: + """Map Linear issue to FrameRecord.""" + try: + # Build metadata + metadata = { + "title": f"{issue.identifier}: {issue.title}", + "record_type": RecordType.DOCUMENT, + "source_type": "linear_issue", + "source_url": f"https://linear.app/issue/{issue.identifier}", + "collection": collection_id, + "collection_id": collection_id, + "status": issue.state.name if issue.state else "Unknown", + "created_at": issue.created_at.isoformat() + if issue.created_at + else None, + "updated_at": issue.updated_at.isoformat() + if issue.updated_at + else None, + "custom_metadata": { + "x_linear_id": issue.id, + "x_linear_identifier": issue.identifier, + "x_linear_priority": str(issue.priority or 0), + }, + } + + # Add assignee if present + if issue.assignee: + metadata["author"] = issue.assignee.name + metadata["custom_metadata"]["x_linear_assignee_id"] = issue.assignee.id + + # Add labels as tags + if issue.labels: + metadata["tags"] = [label.name for label in issue.labels] + + # Build content + content = f"# {issue.identifier}: {issue.title}\n\n" + content += f"**State:** {issue.state.name if issue.state else 'Unknown'}\n" + content += f"**Priority:** 
{self._priority_name(issue.priority)}\n" + + if issue.assignee: + content += f"**Assignee:** {issue.assignee.name}\n" + if issue.team: + content += f"**Team:** {issue.team.name}\n" + if issue.project: + content += f"**Project:** {issue.project.name}\n" + + content += "\n" + + if issue.description: + content += f"## Description\n\n{issue.description}\n\n" + + # Create frame + return FrameRecord( + text_content=content, + metadata=metadata, + context=issue.description or issue.title, + ) + + except Exception as e: + self.logger.error(f"Failed to map issue {issue.identifier}: {e}") + return None + + def _map_comment_to_frame( + self, comment: Any, issue: Any, collection_id: str + ) -> FrameRecord | None: + """Map Linear comment to FrameRecord.""" + try: + metadata = { + "title": f"Comment on {issue.identifier}", + "record_type": RecordType.DOCUMENT, + "source_type": "linear_comment", + "source_url": f"https://linear.app/comment/{comment.id}", + "collection": collection_id, + "collection_id": collection_id, + "author": comment.user.name if comment.user else "Unknown", + "created_at": comment.created_at.isoformat() + if comment.created_at + else None, + "custom_metadata": { + "x_linear_id": comment.id, + "x_linear_issue_id": issue.id, + "x_linear_issue_identifier": issue.identifier, + }, + } + + content = f"# Comment on {issue.identifier}\n\n" + content += f"**By:** {comment.user.name if comment.user else 'Unknown'}\n" + content += f"**Date:** {comment.created_at}\n\n" + content += comment.body or "" + + frame = FrameRecord( + text_content=content, + metadata=metadata, + ) + + # Add relationship to issue + frame.add_relationship("comment_on", id=issue.id) + + return frame + + except Exception as e: + self.logger.error(f"Failed to map comment: {e}") + return None + + def _priority_name(self, priority: int | None) -> str: + """Convert Linear priority number to name.""" + if priority is None: + return "None" + priority_map = { + 0: "None", + 1: "Urgent", + 2: "High", + 3: 
"Normal", + 4: "Low", + } + return priority_map.get(priority, f"Priority {priority}") diff --git a/contextframe/tests/test_connectors.py b/contextframe/tests/test_connectors.py new file mode 100644 index 0000000..15be235 --- /dev/null +++ b/contextframe/tests/test_connectors.py @@ -0,0 +1,394 @@ +"""Tests for external system connectors.""" + +import pytest +from datetime import datetime +from pathlib import Path +from unittest.mock import Mock, MagicMock, patch + +from contextframe import FrameDataset, FrameRecord +from contextframe.connectors import ( + SourceConnector, + ConnectorConfig, + SyncResult, + AuthType, + GitHubConnector, + LinearConnector, +) +from contextframe.schema import RecordType + + +class TestConnectorBase: + """Test base connector functionality.""" + + def test_connector_config(self): + """Test ConnectorConfig initialization.""" + config = ConnectorConfig( + name="Test Connector", + auth_type=AuthType.API_KEY, + auth_config={"api_key": "test-key"}, + sync_config={"setting": "value"}, + rate_limit=60, + timeout=30, + retry_count=3, + ) + + assert config.name == "Test Connector" + assert config.auth_type == AuthType.API_KEY + assert config.auth_config["api_key"] == "test-key" + assert config.sync_config["setting"] == "value" + assert config.rate_limit == 60 + + def test_sync_result(self): + """Test SyncResult functionality.""" + result = SyncResult(success=True) + + # Add some data + result.frames_created = 10 + result.frames_updated = 5 + result.frames_failed = 2 + result.add_error("Test error") + result.add_warning("Test warning") + result.complete() + + assert result.success is True + assert result.frames_created == 10 + assert result.frames_updated == 5 + assert result.frames_failed == 2 + assert len(result.errors) == 1 + assert len(result.warnings) == 1 + assert result.end_time is not None + assert result.duration is not None + + def test_sync_state_management(self, tmp_path): + """Test sync state save/load functionality.""" + # Create test 
dataset + dataset_path = tmp_path / "test.lance" + dataset = FrameDataset.create(str(dataset_path)) + + # Create mock connector + config = ConnectorConfig(name="Test Connector") + + class TestConnector(SourceConnector): + def validate_connection(self): return True + def discover_content(self): return {} + def sync(self, incremental=True): return SyncResult(success=True) + def map_to_frame(self, data): return None + + connector = TestConnector(config, dataset) + + # Test saving sync state + test_state = { + "last_sync": datetime.now().isoformat(), + "cursor": "page-2", + "processed": 100, + } + connector.save_sync_state(test_state) + + # Test loading sync state + loaded_state = connector.get_last_sync_state() + assert loaded_state is not None + assert loaded_state["cursor"] == "page-2" + assert loaded_state["processed"] == 100 + + +class TestGitHubConnector: + """Test GitHub connector functionality.""" + + @pytest.fixture + def mock_github(self): + """Mock PyGithub objects.""" + with patch("contextframe.connectors.github.Github") as mock_github_class: + # Create mocks + mock_client = Mock() + mock_repo = Mock() + + # Configure repo mock + mock_repo.full_name = "owner/repo" + mock_repo.description = "Test repository" + mock_repo.default_branch = "main" + mock_repo.size = 1000 + mock_repo.language = "Python" + mock_repo.get_topics.return_value = ["test", "example"] + + # Configure client mock + mock_client.get_repo.return_value = mock_repo + mock_github_class.return_value = mock_client + + yield mock_client, mock_repo + + def test_github_connector_init(self, mock_github, tmp_path): + """Test GitHub connector initialization.""" + mock_client, mock_repo = mock_github + + dataset_path = tmp_path / "test.lance" + dataset = FrameDataset.create(str(dataset_path)) + + config = ConnectorConfig( + name="GitHub Test", + auth_type=AuthType.TOKEN, + auth_config={"token": "test-token"}, + sync_config={ + "owner": "owner", + "repo": "repo", + "branch": "main", + "paths": ["/src"], 
+ "file_patterns": ["*.py"], + } + ) + + connector = GitHubConnector(config, dataset) + + assert connector.owner == "owner" + assert connector.repo == "repo" + assert connector.branch == "main" + assert connector.paths == ["/src"] + assert connector.file_patterns == ["*.py"] + + def test_github_validate_connection(self, mock_github, tmp_path): + """Test GitHub connection validation.""" + mock_client, mock_repo = mock_github + + dataset_path = tmp_path / "test.lance" + dataset = FrameDataset.create(str(dataset_path)) + + config = ConnectorConfig( + name="GitHub Test", + auth_type=AuthType.TOKEN, + auth_config={"token": "test-token"}, + sync_config={"owner": "owner", "repo": "repo"} + ) + + connector = GitHubConnector(config, dataset) + + # Test successful validation + assert connector.validate_connection() is True + + # Test failed validation + mock_repo.full_name = Mock(side_effect=Exception("Access denied")) + assert connector.validate_connection() is False + + def test_github_discover_content(self, mock_github, tmp_path): + """Test GitHub content discovery.""" + mock_client, mock_repo = mock_github + + # Mock file structure + mock_file1 = Mock() + mock_file1.type = "file" + mock_file1.path = "src/main.py" + mock_file1.name = "main.py" + mock_file1.size = 1000 + + mock_file2 = Mock() + mock_file2.type = "file" + mock_file2.path = "src/test.py" + mock_file2.name = "test.py" + mock_file2.size = 500 + + mock_dir = Mock() + mock_dir.type = "dir" + mock_dir.path = "src/utils" + + mock_repo.get_contents.return_value = [mock_file1, mock_file2, mock_dir] + + # Mock branches + mock_branch = Mock() + mock_branch.name = "main" + mock_repo.get_branches.return_value = [mock_branch] + + dataset_path = tmp_path / "test.lance" + dataset = FrameDataset.create(str(dataset_path)) + + config = ConnectorConfig( + name="GitHub Test", + auth_type=AuthType.TOKEN, + auth_config={"token": "test-token"}, + sync_config={ + "owner": "owner", + "repo": "repo", + "paths": ["/src"], + 
"file_patterns": ["*.py"], + } + ) + + connector = GitHubConnector(config, dataset) + discovery = connector.discover_content() + + assert discovery["repository"]["owner"] == "owner" + assert discovery["repository"]["name"] == "repo" + assert "main" in discovery["branches"] + assert discovery["stats"]["total_files"] == 2 + assert discovery["stats"]["file_types"][".py"] == 2 + + def test_github_map_to_frame(self, mock_github, tmp_path): + """Test mapping GitHub file to FrameRecord.""" + mock_client, mock_repo = mock_github + + # Mock file + mock_file = Mock() + mock_file.name = "README.md" + mock_file.path = "README.md" + mock_file.sha = "abc123" + mock_file.size = 1234 + mock_file.type = "file" + mock_file.html_url = "https://github.com/owner/repo/blob/main/README.md" + mock_file.decoded_content = b"# Test Repository\n\nThis is a test." + + dataset_path = tmp_path / "test.lance" + dataset = FrameDataset.create(str(dataset_path)) + + config = ConnectorConfig( + name="GitHub Test", + auth_type=AuthType.TOKEN, + auth_config={"token": "test-token"}, + sync_config={"owner": "owner", "repo": "repo"} + ) + + connector = GitHubConnector(config, dataset) + frame = connector.map_to_frame(mock_file) + + assert frame is not None + assert frame.metadata["title"] == "README.md" + assert frame.metadata["source_type"] == "github" + assert frame.metadata["source_file"] == "README.md" + assert frame.metadata["source_url"] == mock_file.html_url + assert frame.text_content == "# Test Repository\n\nThis is a test." 
+ assert frame.metadata["context"] == frame.text_content[:1000] + + +class TestLinearConnector: + """Test Linear connector functionality.""" + + @pytest.fixture + def mock_linear(self): + """Mock Linear API client.""" + with patch("contextframe.connectors.linear.LinearClient") as mock_linear_class: + # Create mocks + mock_client = Mock() + + # Mock viewer + mock_viewer = Mock() + mock_viewer.id = "viewer-id" + mock_viewer.name = "Test User" + mock_viewer.email = "test@example.com" + mock_client.viewer = mock_viewer + + # Mock organization + mock_org = Mock() + mock_org.id = "org-id" + mock_org.name = "Test Organization" + mock_org.url_key = "testorg" + mock_client.organization = mock_org + + # Configure client + mock_linear_class.return_value = mock_client + + yield mock_client + + def test_linear_connector_init(self, mock_linear, tmp_path): + """Test Linear connector initialization.""" + dataset_path = tmp_path / "test.lance" + dataset = FrameDataset.create(str(dataset_path)) + + config = ConnectorConfig( + name="Linear Test", + auth_type=AuthType.API_KEY, + auth_config={"api_key": "test-key"}, + sync_config={ + "sync_teams": True, + "sync_projects": True, + "sync_issues": True, + "include_archived": False, + "include_comments": True, + } + ) + + connector = LinearConnector(config, dataset) + + assert connector.sync_teams is True + assert connector.sync_projects is True + assert connector.sync_issues is True + assert connector.include_archived is False + assert connector.include_comments is True + + def test_linear_validate_connection(self, mock_linear, tmp_path): + """Test Linear connection validation.""" + dataset_path = tmp_path / "test.lance" + dataset = FrameDataset.create(str(dataset_path)) + + config = ConnectorConfig( + name="Linear Test", + auth_type=AuthType.API_KEY, + auth_config={"api_key": "test-key"}, + ) + + connector = LinearConnector(config, dataset) + + # Test successful validation + assert connector.validate_connection() is True + + # Test 
failed validation + mock_linear.viewer = Mock(side_effect=Exception("Invalid API key")) + assert connector.validate_connection() is False + + def test_linear_map_issue_to_frame(self, mock_linear, tmp_path): + """Test mapping Linear issue to FrameRecord.""" + # Mock issue + mock_issue = Mock() + mock_issue.id = "issue-id" + mock_issue.identifier = "PROJ-123" + mock_issue.title = "Test Issue" + mock_issue.description = "This is a test issue" + mock_issue.priority = 2 + mock_issue.created_at = datetime.now() + mock_issue.updated_at = datetime.now() + mock_issue.parent = None + mock_issue.related_issues = [] + mock_issue.labels = [] + mock_issue.comments = [] + + # Mock state + mock_state = Mock() + mock_state.name = "In Progress" + mock_issue.state = mock_state + + # Mock assignee + mock_assignee = Mock() + mock_assignee.id = "user-id" + mock_assignee.name = "John Doe" + mock_issue.assignee = mock_assignee + + # Mock team + mock_team = Mock() + mock_team.id = "team-id" + mock_team.name = "Engineering" + mock_issue.team = mock_team + + # Mock project + mock_project = Mock() + mock_project.id = "project-id" + mock_project.name = "Q4 Goals" + mock_issue.project = mock_project + + dataset_path = tmp_path / "test.lance" + dataset = FrameDataset.create(str(dataset_path)) + + config = ConnectorConfig( + name="Linear Test", + auth_type=AuthType.API_KEY, + auth_config={"api_key": "test-key"}, + ) + + connector = LinearConnector(config, dataset) + frame = connector._map_issue_to_frame(mock_issue, "collection-id") + + assert frame is not None + assert frame.metadata["title"] == "PROJ-123: Test Issue" + assert frame.metadata["source_type"] == "linear_issue" + assert frame.metadata["status"] == "In Progress" + assert frame.metadata["author"] == "John Doe" + assert "Engineering" in frame.text_content + assert "Q4 Goals" in frame.text_content + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/docs/external-connectors.md 
b/docs/external-connectors.md new file mode 100644 index 0000000..f9e37e1 --- /dev/null +++ b/docs/external-connectors.md @@ -0,0 +1,296 @@ +# External System Connectors + +ContextFrame provides connectors to import data from external systems like GitHub, Linear, Google Drive, and more. These connectors enable you to build a unified knowledge base from your existing tools and platforms. + +## Overview + +External connectors allow you to: +- Import documents, code, and issues from various platforms +- Keep your ContextFrame dataset synchronized with source systems +- Preserve relationships and metadata from the original systems +- Perform incremental updates to avoid re-importing unchanged data + +## Available Connectors + +### GitHub Connector + +Import repositories, files, and code from GitHub. + +**Features:** +- Import specific paths or entire repositories +- Filter by file patterns (e.g., `*.py`, `*.md`) +- Preserve file structure as collections +- Support for public and private repositories +- Incremental sync based on commits + +**Example:** +```python +from contextframe import FrameDataset +from contextframe.connectors import GitHubConnector, ConnectorConfig, AuthType + +# Configure connector +config = ConnectorConfig( + name="My GitHub Repo", + auth_type=AuthType.TOKEN, + auth_config={"token": "github_pat_xxxxx"}, + sync_config={ + "owner": "myorg", + "repo": "myrepo", + "branch": "main", + "paths": ["/src", "/docs"], + "file_patterns": ["*.py", "*.md"], + "exclude_patterns": ["*test*", "*__pycache__*"] + } +) + +# Create connector and sync +dataset = FrameDataset("my_knowledge.lance") +connector = GitHubConnector(config, dataset) + +if connector.validate_connection(): + result = connector.sync(incremental=True) + print(f"Imported {result.frames_created} new files") +``` + +### Linear Connector + +Import teams, projects, issues, and comments from Linear. 
+ +**Features:** +- Import complete workspace hierarchy +- Preserve relationships between teams, projects, and issues +- Include issue comments and metadata +- Filter by teams, projects, or issue states +- Support for custom field mapping + +**Example:** +```python +from contextframe.connectors import LinearConnector, ConnectorConfig, AuthType + +config = ConnectorConfig( + name="Linear Workspace", + auth_type=AuthType.API_KEY, + auth_config={"api_key": "lin_api_xxxxx"}, + sync_config={ + "sync_teams": True, + "sync_projects": True, + "sync_issues": True, + "include_comments": True, + "include_archived": False, + # Optional filters: + "team_ids": ["team-uuid"], + "issue_states": ["Todo", "In Progress"] + } +) + +connector = LinearConnector(config, dataset) +result = connector.sync() +``` + +## Base Connector Architecture + +All connectors inherit from the `SourceConnector` base class: + +```python +from contextframe.connectors import SourceConnector, ConnectorConfig, SyncResult +from contextframe import FrameRecord + +class MyConnector(SourceConnector): + def validate_connection(self) -> bool: + """Validate connection to source system""" + pass + + def discover_content(self) -> dict[str, Any]: + """Discover available content""" + pass + + def sync(self, incremental: bool = True) -> SyncResult: + """Perform the sync operation""" + pass + + def map_to_frame(self, source_data: dict) -> FrameRecord | None: + """Map source data to FrameRecord""" + pass +``` + +## Authentication + +Connectors support various authentication methods: + +```python +from contextframe.connectors import AuthType + +# API Key +config = ConnectorConfig( + name="My Connector", + auth_type=AuthType.API_KEY, + auth_config={"api_key": "xxxxx"} +) + +# OAuth Token +config = ConnectorConfig( + name="My Connector", + auth_type=AuthType.TOKEN, + auth_config={"token": "xxxxx"} +) + +# Basic Auth +config = ConnectorConfig( + name="My Connector", + auth_type=AuthType.BASIC, + auth_config={"username": 
"user", "password": "pass"} +) +``` + +## Incremental Sync + +Connectors support incremental synchronization to import only changed data: + +```python +# First sync - imports everything +result = connector.sync(incremental=False) + +# Subsequent syncs - only changes +result = connector.sync(incremental=True) + +# Check what was synced +print(f"Created: {result.frames_created}") +print(f"Updated: {result.frames_updated}") +print(f"Failed: {result.frames_failed}") +``` + +## Error Handling + +Connectors provide detailed error and warning information: + +```python +result = connector.sync() + +if not result.success: + print("Sync failed!") + for error in result.errors: + print(f"Error: {error}") + +if result.warnings: + print("Warnings:") + for warning in result.warnings: + print(f"Warning: {warning}") +``` + +## Collections and Organization + +Connectors automatically organize imported content into collections: + +- GitHub: Creates collections for repositories and folders +- Linear: Creates collections for teams and projects +- Documents are linked to appropriate collections +- Relationships between items are preserved + +## Custom Connectors + +To create a custom connector: + +1. Inherit from `SourceConnector` +2. Implement required methods +3. Handle authentication +4. Map source data to FrameRecords + +Example custom connector: + +```python +from contextframe.connectors import SourceConnector, ConnectorConfig, SyncResult +from contextframe import FrameRecord + +class NotionConnector(SourceConnector): + def __init__(self, config: ConnectorConfig, dataset): + super().__init__(config, dataset) + self.workspace_id = config.sync_config.get("workspace_id") + + def validate_connection(self) -> bool: + try: + # Test API connection + response = self._api_call("/v1/users/me") + return response.status_code == 200 + except: + return False + + def discover_content(self) -> dict[str, Any]: + # Discover pages, databases, etc. 
+ return { + "pages": self._list_pages(), + "databases": self._list_databases() + } + + def sync(self, incremental: bool = True) -> SyncResult: + result = SyncResult(success=True) + + # Get pages to sync + pages = self._get_pages_to_sync(incremental) + + # Import each page + for page in pages: + frame = self.map_to_frame(page) + if frame: + self.dataset.add(frame) + result.frames_created += 1 + + result.complete() + return result + + def map_to_frame(self, page_data: dict) -> FrameRecord | None: + return FrameRecord( + title=page_data["title"], + text_content=page_data["content"], + metadata={ + "source_type": "notion_page", + "source_url": page_data["url"], + "created_at": page_data["created_time"], + "updated_at": page_data["last_edited_time"] + } + ) +``` + +## Best Practices + +1. **Use Incremental Sync**: After initial import, use incremental sync to reduce API calls +2. **Handle Rate Limits**: Configure appropriate timeouts and retry logic +3. **Filter Content**: Use path and pattern filters to import only relevant content +4. **Monitor Sync Results**: Check errors and warnings after each sync +5. 
"""Example usage of ContextFrame external system connectors."""

import os
from pathlib import Path

from contextframe import FrameDataset
from contextframe.connectors import (
    GitHubConnector,
    LinearConnector,
    ConnectorConfig,
    AuthType,
)


def _load_or_create_dataset(dataset_path: Path) -> FrameDataset:
    """Open the dataset at *dataset_path*, creating it if it does not exist.

    Shared by every example below so the open-or-create idiom is written
    exactly once.
    """
    if dataset_path.exists():
        return FrameDataset(dataset_path)
    return FrameDataset.create(dataset_path)


def example_github_sync():
    """Example of syncing a GitHub repository."""

    dataset = _load_or_create_dataset(Path("data/github_contextframe.lance"))

    # Configure GitHub connector
    config = ConnectorConfig(
        name="ContextFrame GitHub",
        auth_type=AuthType.TOKEN,
        auth_config={
            "token": os.getenv("GITHUB_TOKEN"),  # Set GITHUB_TOKEN env var
        },
        sync_config={
            "owner": "contextframe",
            "repo": "contextframe",
            "branch": "main",
            "paths": ["/contextframe", "/docs"],  # Sync specific paths
            "file_patterns": ["*.py", "*.md"],  # Only Python and Markdown files
            "exclude_patterns": ["*test*", "*__pycache__*"],  # Exclude tests
        }
    )

    # Create and run connector
    connector = GitHubConnector(config, dataset)

    # Validate connection before doing any work
    if not connector.validate_connection():
        print("Failed to connect to GitHub")
        return

    # Discover content
    print("Discovering repository content...")
    discovery = connector.discover_content()
    print(f"Found {discovery['stats']['total_files']} files")
    print(f"File types: {discovery['stats']['file_types']}")

    # Sync content
    print("\nSyncing repository...")
    result = connector.sync(incremental=True)

    print(f"\nSync completed!")
    print(f"- Created: {result.frames_created} frames")
    print(f"- Updated: {result.frames_updated} frames")
    print(f"- Failed: {result.frames_failed} frames")

    if result.errors:
        print(f"\nErrors:")
        for error in result.errors:
            print(f"  - {error}")


def example_linear_sync():
    """Example of syncing Linear workspace data."""

    dataset = _load_or_create_dataset(Path("data/linear_workspace.lance"))

    # Configure Linear connector
    config = ConnectorConfig(
        name="Linear Workspace",
        auth_type=AuthType.API_KEY,
        auth_config={
            "api_key": os.getenv("LINEAR_API_KEY"),  # Set LINEAR_API_KEY env var
        },
        sync_config={
            "sync_teams": True,
            "sync_projects": True,
            "sync_issues": True,
            "include_archived": False,
            "include_comments": True,
            # Optional filters:
            # "team_ids": ["team-uuid-1", "team-uuid-2"],
            # "project_ids": ["project-uuid-1"],
            # "issue_states": ["In Progress", "Todo"],
        }
    )

    # Create and run connector
    connector = LinearConnector(config, dataset)

    # Validate connection before doing any work
    if not connector.validate_connection():
        print("Failed to connect to Linear")
        return

    # Discover content
    print("Discovering Linear workspace...")
    discovery = connector.discover_content()
    print(f"Organization: {discovery['workspace']['organization']['name']}")
    print(f"Teams: {len(discovery['teams'])}")
    print(f"Projects: {len(discovery['projects'])}")
    print(f"Issues: {discovery['issue_stats']['total']}")

    # Sync content
    print("\nSyncing workspace...")
    result = connector.sync(incremental=True)

    print(f"\nSync completed!")
    print(f"- Created: {result.frames_created} frames")
    print(f"- Updated: {result.frames_updated} frames")
    print(f"- Failed: {result.frames_failed} frames")

    if result.warnings:
        print(f"\nWarnings:")
        for warning in result.warnings:
            print(f"  - {warning}")


def example_combined_workflow():
    """Example of combining data from multiple sources."""

    # Create unified dataset shared by both connectors
    dataset = _load_or_create_dataset(Path("data/unified_knowledge.lance"))

    # Sync GitHub documentation
    github_config = ConnectorConfig(
        name="Docs from GitHub",
        auth_type=AuthType.TOKEN,
        auth_config={"token": os.getenv("GITHUB_TOKEN")},
        sync_config={
            "owner": "myorg",
            "repo": "documentation",
            "paths": ["/docs"],
            "file_patterns": ["*.md"],
        }
    )

    github_connector = GitHubConnector(github_config, dataset)
    if github_connector.validate_connection():
        print("Syncing GitHub documentation...")
        github_result = github_connector.sync()
        print(f"GitHub: {github_result.frames_created} new docs")

    # Sync Linear issues related to documentation
    linear_config = ConnectorConfig(
        name="Doc Issues from Linear",
        auth_type=AuthType.API_KEY,
        auth_config={"api_key": os.getenv("LINEAR_API_KEY")},
        sync_config={
            "sync_teams": False,
            "sync_projects": False,
            "sync_issues": True,
            "issue_states": ["Todo", "In Progress"],
            # Filter for documentation-related issues
        }
    )

    linear_connector = LinearConnector(linear_config, dataset)
    if linear_connector.validate_connection():
        print("Syncing Linear issues...")
        linear_result = linear_connector.sync()
        print(f"Linear: {linear_result.frames_created} new issues")

    # Search across unified dataset
    print("\nSearching unified knowledge base...")
    results = dataset.search("documentation bug", limit=5)
    for result in results:
        print(f"- {result.metadata['title']} ({result.metadata['source_type']})")


if __name__ == "__main__":
    # Run examples based on available credentials
    if os.getenv("GITHUB_TOKEN"):
        print("=== GitHub Sync Example ===")
        example_github_sync()
        print("\n")
    else:
        print("Set GITHUB_TOKEN environment variable to run GitHub example")

    if os.getenv("LINEAR_API_KEY"):
        print("=== Linear Sync Example ===")
        example_linear_sync()
        print("\n")
    else:
        print("Set LINEAR_API_KEY environment variable to run Linear example")

    if os.getenv("GITHUB_TOKEN") and os.getenv("LINEAR_API_KEY"):
        print("=== Combined Workflow Example ===")
        example_combined_workflow()
    else:
        print("\nSet both GITHUB_TOKEN and LINEAR_API_KEY to run combined example")