From fdca80ef93ad7fcd35e9822444dd9848ae4e3a0b Mon Sep 17 00:00:00 2001 From: Jay Scambler Date: Thu, 19 Jun 2025 15:15:06 -0500 Subject: [PATCH] fix: Update batch tools and add collection tool registration - Fix FrameRecord initialization in batch tools (remove invalid parameters) - Add collection tools registration to ToolRegistry - Add implementation documentation for collection metadata fixes --- .../collection_metadata_fix.md | 73 ++++ ...ction_metadata_lance_filtering_analysis.md | 228 ++++++++++++ .../phase3.3_collection_management.md | 329 ++++++++++++++++++ contextframe/mcp/batch/tools.py | 5 +- contextframe/mcp/tools.py | 10 + 5 files changed, 641 insertions(+), 4 deletions(-) create mode 100644 .claude/implementations/collection_metadata_fix.md create mode 100644 .claude/implementations/collection_metadata_lance_filtering_analysis.md create mode 100644 .claude/implementations/phase3.3_collection_management.md diff --git a/.claude/implementations/collection_metadata_fix.md b/.claude/implementations/collection_metadata_fix.md new file mode 100644 index 0000000..66f2aea --- /dev/null +++ b/.claude/implementations/collection_metadata_fix.md @@ -0,0 +1,73 @@ +# Collection Metadata Storage Fix + +## Problem +The collection management tools are storing metadata in `x_collection_metadata` which is not persisted to Lance because: +1. It's not a column in the Arrow schema +2. It's stored in the in-memory metadata dict but lost when reloading from Lance + +## Solution +Use the existing `custom_metadata` field which is properly persisted to Lance. Since `custom_metadata` only supports string key-value pairs, we need to: + +1. Prefix collection-specific metadata with `collection_` +2. Prefix shared metadata with `shared_` +3. Convert all values to strings +4. Parse values back when reading + +## Implementation Changes + +### Storage Pattern +```python +# Instead of: +"x_collection_metadata": { + "created_at": "2024-01-19", + "member_count": 5, + "shared_metadata": {"key": "value"} +} + +# Use: +"custom_metadata": { + "collection_created_at": "2024-01-19", + "collection_member_count": "5", + "shared_key": "value" +} +``` + +### Helper Methods +```python +def _get_collection_metadata(self, record: FrameRecord) -> dict: + """Extract collection metadata from custom_metadata.""" + custom = record.metadata.get("custom_metadata", {}) + return { + "created_at": custom.get("collection_created_at", ""), + "updated_at": custom.get("collection_updated_at", ""), + "member_count": int(custom.get("collection_member_count", "0")), + "total_size": int(custom.get("collection_total_size", "0")), + "template": custom.get("collection_template", ""), + "shared_metadata": { + k[7:]: v for k, v in custom.items() + if k.startswith("shared_") + } + } + +def _set_collection_metadata(self, record: FrameRecord, coll_meta: dict): + """Store collection metadata in custom_metadata.""" + if "custom_metadata" not in record.metadata: + record.metadata["custom_metadata"] = {} + + custom = record.metadata["custom_metadata"] + custom["collection_created_at"] = coll_meta.get("created_at", "") + custom["collection_updated_at"] = coll_meta.get("updated_at", "") + custom["collection_member_count"] = str(coll_meta.get("member_count", 0)) + custom["collection_total_size"] = str(coll_meta.get("total_size", 0)) + custom["collection_template"] = coll_meta.get("template", "") + + # Store shared metadata + for key, value in coll_meta.get("shared_metadata", {}).items(): + custom[f"shared_{key}"] = str(value) +``` + +## Benefits +1. Uses existing schema - no changes needed +2. Data persists correctly to Lance +3. Backward compatible with queries +4. Can still use Lance filtering on record_type and collection_id \ No newline at end of file diff --git a/.claude/implementations/collection_metadata_lance_filtering_analysis.md b/.claude/implementations/collection_metadata_lance_filtering_analysis.md new file mode 100644 index 0000000..36cd512 --- /dev/null +++ b/.claude/implementations/collection_metadata_lance_filtering_analysis.md @@ -0,0 +1,228 @@ +# Collection Metadata and Lance Filtering Analysis + +## Problem Statement + +The current implementation of collection management tools attempts to filter on `x_collection_metadata.parent_collection` using Lance SQL queries, but this fails because: + +1. `x_collection_metadata` is stored in the FrameRecord's metadata dictionary, not as a Lance column +2. Lance can only filter on actual columns defined in the Arrow schema +3. The x_ prefix pattern allows fields at the metadata root, but they're not persisted to Lance + +## Current Schema Analysis + +### Available Lance Columns (from Arrow schema) + +```python +# From contextframe_schema.py +- uuid (string, not null) +- text_content (string) +- vector (list) +- title (string, not null) +- version (string) +- context (string) +- uri (string) +- local_path (string) +- cid (string) +- collection (string) # ← Available for filtering +- collection_id (string) # ← Available for filtering +- collection_id_type (string) # ← Available for filtering +- position (int32) +- author (string) +- contributors (list) +- created_at (string) +- updated_at (string) +- tags (list) +- status (string) +- source_file (string) +- source_type (string) +- source_url (string) +- relationships (list) # ← Available for complex queries +- custom_metadata (list) # ← Key-value pairs (string only) +- record_type (string) # ← Available for filtering +- raw_data_type (string) +- raw_data (large_binary) +``` + +### Current Collection Implementation + +Collections are implemented as special documents with: +- `record_type: "collection_header"` +- `x_collection_metadata` containing: + - `parent_collection` (UUID of parent) + - `member_count` + - `created_at` + - `template` + - `shared_metadata` + +## Options for Lance-Friendly Collection Storage + +### Option 1: Use Existing Schema Fields (Recommended) + +**Approach:** +- Use `collection_id` to store parent collection UUID +- Use `relationships` for hierarchical structure +- Store collection metadata in `custom_metadata` + +**Implementation:** +```python +# Collection header document +{ + "record_type": "collection_header", + "title": "My Collection", + "collection_id": "parent-collection-uuid", # Parent collection + "collection_id_type": "uuid", + "custom_metadata": [ + {"key": "member_count", "value": "42"}, + {"key": "template", "value": "project"}, + {"key": "created_at", "value": "2024-01-19"} + ], + "relationships": [ + { + "type": "parent", + "id": "parent-collection-uuid", + "title": "Parent Collection Name" + } + ] +} + +# Member document +{ + "title": "Document in collection", + "collection": "My Collection", # Collection name + "collection_id": "collection-uuid", # Collection UUID + "collection_id_type": "uuid", + "relationships": [ + { + "type": "reference", + "id": "collection-uuid", + "title": "Member of My Collection" + } + ] +} +``` + +**Pros:** +- ✅ Uses existing schema - no changes needed +- ✅ Enables Lance native filtering: `collection_id = 'parent-uuid'` +- ✅ Relationships provide rich linking +- ✅ Backward compatible + +**Cons:** +- ❌ `custom_metadata` only supports string values +- ❌ Slight semantic overload of `collection_id` field + +### Option 2: Add Dedicated Collection Fields to Schema + +**Approach:** +Add new fields to Arrow schema: +```python +pa.field("parent_collection_id", pa.string()), +pa.field("collection_metadata", pa.struct([ + pa.field("member_count", pa.int32()), + pa.field("template", pa.string()), + pa.field("shared_metadata", pa.map_(pa.string(), pa.string())) +])) +``` + +**Pros:** +- ✅ Clean, semantic field names +- ✅ Native Lance filtering on all fields +- ✅ Supports proper data types + +**Cons:** +- ❌ Requires schema migration +- ❌ Breaking change for existing datasets +- ❌ Adds fields only used by collection headers + +### Option 3: Hybrid Approach + +**Approach:** +- Use relationships for hierarchy (Lance filterable) +- Keep x_collection_metadata for rich metadata (Python filtered) +- Add helper methods for common queries + +**Implementation:** +```python +def find_subcollections(self, parent_id: str): + # Use relationships for efficient filtering + results = self.dataset.scanner( + filter=f"record_type = 'collection_header' AND relationships.id = '{parent_id}' AND relationships.type = 'child'" + ).to_table() + + # Then access x_collection_metadata for additional data + return [self._enrich_with_metadata(r) for r in results] +``` + +## Recommendation: Option 1 with Enhancements + +Use existing schema fields with these patterns: + +1. **For Parent-Child Hierarchy:** + - Store parent UUID in `collection_id` + - Use bidirectional relationships (parent/child) + +2. **For Collection Membership:** + - Documents use `collection_id` for their collection + - Add "reference" relationship to collection header + +3. **For Collection Metadata:** + - Critical fields (member_count) tracked separately + - Less critical metadata in custom_metadata + - Consider caching computed values + +4. **Query Patterns:** + ```python + # Find subcollections (Lance native) + f"record_type = 'collection_header' AND collection_id = '{parent_id}'" + + # Find collection members (Lance native) + f"collection_id = '{collection_id}' AND record_type = 'document'" + + # Find by relationship (Lance native) + f"relationships.id = '{target_id}' AND relationships.type = 'child'" + ``` + +## Migration Strategy + +1. **Phase 1: Update Collection Tools** + - Modify create_collection to use collection_id for parent + - Update queries to use Lance-native filters + - Add relationships for richer navigation + +2. **Phase 2: Migration Utilities** + - Script to migrate existing x_collection_metadata + - Backward compatibility layer + +3. **Phase 3: Documentation** + - Update collection patterns + - Query examples + - Best practices + +## Performance Implications + +**Current Approach (Python filtering):** +- O(n) scan of all collection headers +- Memory load of all records +- ~1-10ms for small datasets, >100ms for large + +**Proposed Approach (Lance filtering):** +- Index-assisted filtering +- Lazy loading of results +- <1ms for most queries + +## Decision Criteria + +Choose Option 1 because: +1. No schema changes required +2. Immediate performance benefits +3. Maintains backward compatibility +4. Leverages existing, tested fields +5. Relationships provide future flexibility + +## Next Steps + +1. Update collection tools to use collection_id for parent storage +2. Implement bidirectional relationships +3. Update queries to use Lance native filters +4. Add migration utility for existing collections +5. Update tests to reflect new patterns \ No newline at end of file diff --git a/.claude/implementations/phase3.3_collection_management.md b/.claude/implementations/phase3.3_collection_management.md new file mode 100644 index 0000000..08e07e6 --- /dev/null +++ b/.claude/implementations/phase3.3_collection_management.md @@ -0,0 +1,329 @@ +# Phase 3.3: Collection Management Tools + +## Overview + +Phase 3.3 implements comprehensive collection management capabilities for the MCP server, enabling users to organize documents into logical groups with metadata, hierarchies, and templates. Collections provide a powerful way to manage related documents, track relationships, and apply consistent metadata across document sets. + +## Core Concepts + +### What is a Collection? + +A collection in ContextFrame is a logical grouping of documents with: +- A header document (record_type: collection_header) containing metadata +- Member documents that belong to the collection +- Optional hierarchy (collections can contain sub-collections) +- Shared metadata that can be applied to all members +- Statistics and analytics about the collection contents + +### Collection Use Cases + +1. **Project Documentation**: Group all docs for a project (README, API docs, guides) +2. **Research Papers**: Organize papers by topic, author, or conference +3. **Knowledge Base**: Hierarchical organization of support articles +4. **Training Data**: Curated sets of documents for ML/AI training +5. **Version Control**: Group documents by release or version + +## Implementation Timeline + +**Duration**: 1 week (Part of Phase 3, Week 3) + +1. **Day 1-2**: Schema design and core data structures +2. **Day 3-4**: CRUD operations implementation +3. **Day 5**: Statistics and analytics +4. **Day 6**: Template system +5. **Day 7**: Testing and documentation + +## Tool Specifications + +### 1. create_collection + +Creates a new collection with header and initial configuration. + +**Schema**: +```python +class CreateCollectionParams(BaseModel): + name: str = Field(..., description="Collection name") + description: Optional[str] = Field(None, description="Collection description") + metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Collection metadata") + parent_collection: Optional[str] = Field(None, description="Parent collection ID for hierarchies") + template: Optional[str] = Field(None, description="Template name to apply") + initial_members: Optional[List[str]] = Field(default_factory=list, description="Document IDs to add") +``` + +**Response**: +```python +{ + "collection_id": "uuid", + "header_id": "uuid", + "name": "Project Alpha Docs", + "created_at": "2024-01-15T10:00:00Z", + "member_count": 0, + "metadata": {...} +} +``` + +### 2. update_collection + +Updates collection properties and metadata. + +**Schema**: +```python +class UpdateCollectionParams(BaseModel): + collection_id: str = Field(..., description="Collection ID to update") + name: Optional[str] = Field(None, description="New name") + description: Optional[str] = Field(None, description="New description") + metadata_updates: Optional[Dict[str, Any]] = Field(None, description="Metadata to update") + add_members: Optional[List[str]] = Field(default_factory=list, description="Document IDs to add") + remove_members: Optional[List[str]] = Field(default_factory=list, description="Document IDs to remove") +``` + +### 3. delete_collection + +Deletes a collection and optionally its members. + +**Schema**: +```python +class DeleteCollectionParams(BaseModel): + collection_id: str = Field(..., description="Collection ID to delete") + delete_members: bool = Field(False, description="Also delete member documents") + recursive: bool = Field(False, description="Delete sub-collections recursively") +``` + +### 4. list_collections + +Lists all collections with filtering and statistics. + +**Schema**: +```python +class ListCollectionsParams(BaseModel): + parent_id: Optional[str] = Field(None, description="Filter by parent collection") + include_stats: bool = Field(True, description="Include member statistics") + include_empty: bool = Field(True, description="Include collections with no members") + sort_by: Literal["name", "created_at", "member_count"] = Field("name") + limit: int = Field(100, description="Maximum collections to return") +``` + +### 5. move_documents + +Moves documents between collections. + +**Schema**: +```python +class MoveDocumentsParams(BaseModel): + document_ids: List[str] = Field(..., description="Documents to move") + source_collection: Optional[str] = Field(None, description="Source collection (None for uncollected)") + target_collection: Optional[str] = Field(None, description="Target collection (None to remove from collection)") + update_metadata: bool = Field(True, description="Apply target collection metadata") +``` + +### 6. get_collection_stats + +Gets detailed analytics for a collection. + +**Schema**: +```python +class GetCollectionStatsParams(BaseModel): + collection_id: str = Field(..., description="Collection ID") + include_member_details: bool = Field(False, description="Include per-member statistics") + include_subcollections: bool = Field(True, description="Include sub-collection stats") +``` + +**Response**: +```python +{ + "collection_id": "uuid", + "name": "Research Papers 2024", + "statistics": { + "total_members": 150, + "direct_members": 120, + "subcollection_members": 30, + "total_size_bytes": 25000000, + "avg_document_size": 166666, + "unique_tags": ["ml", "nlp", "cv"], + "date_range": { + "earliest": "2024-01-01", + "latest": "2024-01-15" + }, + "member_types": { + "document": 145, + "collection_header": 5 + } + }, + "subcollections": [...], + "metadata": {...} +} +``` + +## Collection Templates + +Templates provide pre-configured collection structures for common use cases. + +### Built-in Templates + +1. **project_template**: + - README → overview + - src/ → implementation + - docs/ → documentation + - tests/ → validation + +2. **research_template**: + - Papers by year + - Papers by topic + - Papers by author + - Citations network + +3. **knowledge_base_template**: + - Categories → subcategories → articles + - FAQ section + - Troubleshooting guides + +### Template Schema + +```python +class CollectionTemplate(BaseModel): + name: str + description: str + structure: Dict[str, Any] # Hierarchical structure definition + default_metadata: Dict[str, Any] + naming_pattern: str # e.g., "{year}-{topic}" + auto_organize_rules: List[Dict[str, Any]] # Rules for auto-organization +``` + +## Implementation Details + +### File Structure + +``` +contextframe/mcp/collections/ +├── __init__.py +├── tools.py # Collection tool implementations +├── schemas.py # Collection-specific schemas +├── templates.py # Template definitions and loader +└── stats.py # Statistics calculation utilities +``` + +### Core Classes + +```python +# collections/tools.py +class CollectionTools: + """Collection management tools for MCP server.""" + + def __init__(self, dataset: FrameDataset, transport: TransportAdapter): + self.dataset = dataset + self.transport = transport + self.templates = TemplateRegistry() + + async def create_collection(self, params: Dict[str, Any]) -> Dict[str, Any]: + """Create a new collection with header.""" + ... + + async def update_collection(self, params: Dict[str, Any]) -> Dict[str, Any]: + """Update collection properties.""" + ... + + # ... other tool methods +``` + +### Collection Header Schema + +Collections use a special header document with record_type="collection_header": + +```python +{ + "record_type": "collection_header", + "title": "Collection Name", + "context": "Collection description and purpose", + "metadata": { + "collection_metadata": { + "created_at": "2024-01-15T10:00:00Z", + "template": "project_template", + "parent_collection": "parent-uuid", + "member_count": 42, + "total_size": 1048576, + "shared_metadata": { + # Metadata applied to all members + } + } + }, + "relationships": [ + { + "type": "contains", + "target": "member-doc-uuid", + "title": "Member document" + } + ] +} +``` + +### Relationship Management + +Collections use the existing relationship system: +- Header → Members: "contains" relationship +- Members → Header: "member_of" relationship +- Collection → Subcollection: "parent/child" relationships + +## Testing Strategy + +### Unit Tests + +1. Test each collection tool independently +2. Mock FrameDataset operations +3. Verify schema validation +4. Test error handling + +### Integration Tests + +1. Full collection lifecycle (create, update, add members, stats, delete) +2. Hierarchical collections +3. Template application +4. Bulk operations with collections +5. Cross-transport compatibility + +### Test Scenarios + +```python +# test_collection_tools.py +class TestCollectionTools: + async def test_create_project_collection(self): + """Test creating a project collection with template.""" + + async def test_move_documents_between_collections(self): + """Test moving documents preserves relationships.""" + + async def test_collection_statistics(self): + """Test accurate statistics calculation.""" + + async def test_hierarchical_collections(self): + """Test parent/child collection relationships.""" +``` + +## Security Considerations + +1. **Access Control**: Collections can have access restrictions +2. **Metadata Validation**: Ensure metadata doesn't leak sensitive info +3. **Size Limits**: Prevent collections from growing too large +4. **Recursive Operations**: Limit depth to prevent DoS + +## Performance Optimizations + +1. **Lazy Loading**: Don't load all members unless requested +2. **Cached Statistics**: Cache collection stats with TTL +3. **Batch Operations**: Optimize bulk membership changes +4. **Index Usage**: Use collection indexes for fast queries + +## Success Metrics + +- Create collection in < 100ms +- List 1000 collections in < 500ms +- Move 100 documents in < 1s +- Calculate stats for 10k member collection in < 2s +- Template application in < 200ms + +## Migration Path + +For existing datasets without collections: +1. Auto-create "Uncategorized" collection +2. Provide migration tool to organize existing docs +3. Preserve all existing relationships +4. No breaking changes to document structure \ No newline at end of file diff --git a/contextframe/mcp/batch/tools.py b/contextframe/mcp/batch/tools.py index f4dcce1..ddc3705 100644 --- a/contextframe/mcp/batch/tools.py +++ b/contextframe/mcp/batch/tools.py @@ -144,10 +144,7 @@ async def batch_add(self, params: Dict[str, Any]) -> Dict[str, Any]: # Create record record = FrameRecord( text_content=content, - metadata=metadata, - collection=shared.get("collection"), - chunk_index=0, - total_chunks=1 + metadata=metadata ) # Generate embeddings if requested diff --git a/contextframe/mcp/tools.py b/contextframe/mcp/tools.py index bb7fbee..ddb0091 100644 --- a/contextframe/mcp/tools.py +++ b/contextframe/mcp/tools.py @@ -67,6 +67,16 @@ def __init__(self, dataset: FrameDataset, transport: Optional[Any] = None): batch_tools.register_tools(self) except ImportError: logger.warning("Batch tools not available") + + # Register collection tools + try: + from contextframe.mcp.collections.tools import CollectionTools + from contextframe.mcp.collections.templates import TemplateRegistry + template_registry = TemplateRegistry() + collection_tools = CollectionTools(dataset, transport, template_registry) + collection_tools.register_tools(self) + except ImportError: + logger.warning("Collection tools not available") def _register_default_tools(self): """Register the default set of tools."""