From fdca80ef93ad7fcd35e9822444dd9848ae4e3a0b Mon Sep 17 00:00:00 2001
From: Jay Scambler <jayscambler@gmail.com>
Date: Thu, 19 Jun 2025 15:15:06 -0500
Subject: [PATCH] fix: Update batch tools and add collection tool registration

- Fix FrameRecord initialization in batch tools (remove invalid parameters)
- Add collection tools registration to ToolRegistry
- Add implementation documentation for collection metadata fixes
---
 .../collection_metadata_fix.md                |  73 ++++
 ...ction_metadata_lance_filtering_analysis.md | 228 ++++++++++++
 .../phase3.3_collection_management.md         | 329 ++++++++++++++++++
 contextframe/mcp/batch/tools.py               |   5 +-
 contextframe/mcp/tools.py                     |  10 +
 5 files changed, 641 insertions(+), 4 deletions(-)
 create mode 100644 .claude/implementations/collection_metadata_fix.md
 create mode 100644 .claude/implementations/collection_metadata_lance_filtering_analysis.md
 create mode 100644 .claude/implementations/phase3.3_collection_management.md

diff --git a/.claude/implementations/collection_metadata_fix.md b/.claude/implementations/collection_metadata_fix.md
new file mode 100644
index 0000000..66f2aea
--- /dev/null
+++ b/.claude/implementations/collection_metadata_fix.md
@@ -0,0 +1,73 @@
+# Collection Metadata Storage Fix
+
+## Problem
+The collection management tools are storing metadata in `x_collection_metadata` which is not persisted to Lance because:
+1. It's not a column in the Arrow schema
+2. It's stored in the in-memory metadata dict but lost when reloading from Lance
+
+## Solution
+Use the existing `custom_metadata` field which is properly persisted to Lance. Since `custom_metadata` only supports string key-value pairs, we need to:
+
+1. Prefix collection-specific metadata with `collection_`
+2. Prefix shared metadata with `shared_`
+3. Convert all values to strings
+4. Parse values back when reading
+
+## Implementation Changes
+
+### Storage Pattern
+```python
+# Instead of:
+"x_collection_metadata": {
+    "created_at": "2024-01-19",
+    "member_count": 5,
+    "shared_metadata": {"key": "value"}
+}
+
+# Use:
+"custom_metadata": {
+    "collection_created_at": "2024-01-19",
+    "collection_member_count": "5",
+    "shared_key": "value"
+}
+```
+
+### Helper Methods
+```python
+def _get_collection_metadata(self, record: FrameRecord) -> dict:
+    """Extract collection metadata from custom_metadata."""
+    custom = record.metadata.get("custom_metadata", {})
+    return {
+        "created_at": custom.get("collection_created_at", ""),
+        "updated_at": custom.get("collection_updated_at", ""),
+        "member_count": int(custom.get("collection_member_count", "0")),
+        "total_size": int(custom.get("collection_total_size", "0")),
+        "template": custom.get("collection_template", ""),
+        "shared_metadata": {
+            k[7:]: v for k, v in custom.items() 
+            if k.startswith("shared_")
+        }
+    }
+
+def _set_collection_metadata(self, record: FrameRecord, coll_meta: dict):
+    """Store collection metadata in custom_metadata."""
+    if "custom_metadata" not in record.metadata:
+        record.metadata["custom_metadata"] = {}
+    
+    custom = record.metadata["custom_metadata"]
+    custom["collection_created_at"] = coll_meta.get("created_at", "")
+    custom["collection_updated_at"] = coll_meta.get("updated_at", "")
+    custom["collection_member_count"] = str(coll_meta.get("member_count", 0))
+    custom["collection_total_size"] = str(coll_meta.get("total_size", 0))
+    custom["collection_template"] = coll_meta.get("template", "")
+    
+    # Store shared metadata
+    for key, value in coll_meta.get("shared_metadata", {}).items():
+        custom[f"shared_{key}"] = str(value)
+```
+
+## Benefits
+1. Uses existing schema - no changes needed
+2. Data persists correctly to Lance
+3. Backward compatible with queries
+4. Can still use Lance filtering on record_type and collection_id
\ No newline at end of file
diff --git a/.claude/implementations/collection_metadata_lance_filtering_analysis.md b/.claude/implementations/collection_metadata_lance_filtering_analysis.md
new file mode 100644
index 0000000..36cd512
--- /dev/null
+++ b/.claude/implementations/collection_metadata_lance_filtering_analysis.md
@@ -0,0 +1,228 @@
+# Collection Metadata and Lance Filtering Analysis
+
+## Problem Statement
+
+The current implementation of collection management tools attempts to filter on `x_collection_metadata.parent_collection` using Lance SQL queries, but this fails because:
+
+1. `x_collection_metadata` is stored in the FrameRecord's metadata dictionary, not as a Lance column
+2. Lance can only filter on actual columns defined in the Arrow schema
+3. The x_ prefix pattern allows fields at the metadata root, but they're not persisted to Lance
+
+## Current Schema Analysis
+
+### Available Lance Columns (from Arrow schema)
+
+```python
+# From contextframe_schema.py
+- uuid (string, not null)
+- text_content (string)
+- vector (list<float32>)
+- title (string, not null)
+- version (string)
+- context (string)
+- uri (string)
+- local_path (string)
+- cid (string)
+- collection (string)              # ← Available for filtering
+- collection_id (string)           # ← Available for filtering
+- collection_id_type (string)      # ← Available for filtering
+- position (int32)
+- author (string)
+- contributors (list<string>)
+- created_at (string)
+- updated_at (string)
+- tags (list<string>)
+- status (string)
+- source_file (string)
+- source_type (string)
+- source_url (string)
+- relationships (list<struct>)     # ← Available for complex queries
+- custom_metadata (list<struct>)   # ← Key-value pairs (string only)
+- record_type (string)             # ← Available for filtering
+- raw_data_type (string)
+- raw_data (large_binary)
+```
+
+### Current Collection Implementation
+
+Collections are implemented as special documents with:
+- `record_type: "collection_header"`
+- `x_collection_metadata` containing:
+  - `parent_collection` (UUID of parent)
+  - `member_count`
+  - `created_at`
+  - `template`
+  - `shared_metadata`
+
+## Options for Lance-Friendly Collection Storage
+
+### Option 1: Use Existing Schema Fields (Recommended)
+
+**Approach:**
+- Use `collection_id` to store parent collection UUID
+- Use `relationships` for hierarchical structure
+- Store collection metadata in `custom_metadata`
+
+**Implementation:**
+```python
+# Collection header document
+{
+    "record_type": "collection_header",
+    "title": "My Collection",
+    "collection_id": "parent-collection-uuid",  # Parent collection
+    "collection_id_type": "uuid",
+    "custom_metadata": [
+        {"key": "member_count", "value": "42"},
+        {"key": "template", "value": "project"},
+        {"key": "created_at", "value": "2024-01-19"}
+    ],
+    "relationships": [
+        {
+            "type": "parent",
+            "id": "parent-collection-uuid",
+            "title": "Parent Collection Name"
+        }
+    ]
+}
+
+# Member document
+{
+    "title": "Document in collection",
+    "collection": "My Collection",        # Collection name
+    "collection_id": "collection-uuid",    # Collection UUID
+    "collection_id_type": "uuid",
+    "relationships": [
+        {
+            "type": "reference",
+            "id": "collection-uuid",
+            "title": "Member of My Collection"
+        }
+    ]
+}
+```
+
+**Pros:**
+- ✅ Uses existing schema - no changes needed
+- ✅ Enables Lance native filtering: `collection_id = 'parent-uuid'`
+- ✅ Relationships provide rich linking
+- ✅ Backward compatible
+
+**Cons:**
+- ❌ `custom_metadata` only supports string values
+- ❌ Slight semantic overload of `collection_id` field
+
+### Option 2: Add Dedicated Collection Fields to Schema
+
+**Approach:**
+Add new fields to Arrow schema:
+```python
+pa.field("parent_collection_id", pa.string()),
+pa.field("collection_metadata", pa.struct([
+    pa.field("member_count", pa.int32()),
+    pa.field("template", pa.string()),
+    pa.field("shared_metadata", pa.map_(pa.string(), pa.string()))
+]))
+```
+
+**Pros:**
+- ✅ Clean, semantic field names
+- ✅ Native Lance filtering on all fields
+- ✅ Supports proper data types
+
+**Cons:**
+- ❌ Requires schema migration
+- ❌ Breaking change for existing datasets
+- ❌ Adds fields only used by collection headers
+
+### Option 3: Hybrid Approach
+
+**Approach:**
+- Use relationships for hierarchy (Lance filterable)
+- Keep x_collection_metadata for rich metadata (Python filtered)
+- Add helper methods for common queries
+
+**Implementation:**
+```python
+def find_subcollections(self, parent_id: str):
+    # Use relationships for efficient filtering
+    results = self.dataset.scanner(
+        filter=f"record_type = 'collection_header' AND relationships.id = '{parent_id}' AND relationships.type = 'child'"
+    ).to_table()
+    
+    # Then access x_collection_metadata for additional data
+    return [self._enrich_with_metadata(r) for r in results]
+```
+
+## Recommendation: Option 1 with Enhancements
+
+Use existing schema fields with these patterns:
+
+1. **For Parent-Child Hierarchy:**
+   - Store parent UUID in `collection_id`
+   - Use bidirectional relationships (parent/child)
+
+2. **For Collection Membership:**
+   - Documents use `collection_id` for their collection
+   - Add "reference" relationship to collection header
+
+3. **For Collection Metadata:**
+   - Critical fields (member_count) tracked separately
+   - Less critical metadata in custom_metadata
+   - Consider caching computed values
+
+4. **Query Patterns:**
+   ```python
+   # Find subcollections (Lance native)
+   f"record_type = 'collection_header' AND collection_id = '{parent_id}'"
+   
+   # Find collection members (Lance native)
+   f"collection_id = '{collection_id}' AND record_type = 'document'"
+   
+   # Find by relationship (Lance native)
+   f"relationships.id = '{target_id}' AND relationships.type = 'child'"
+   ```
+
+## Migration Strategy
+
+1. **Phase 1: Update Collection Tools**
+   - Modify create_collection to use collection_id for parent
+   - Update queries to use Lance-native filters
+   - Add relationships for richer navigation
+
+2. **Phase 2: Migration Utilities**
+   - Script to migrate existing x_collection_metadata
+   - Backward compatibility layer
+
+3. **Phase 3: Documentation**
+   - Update collection patterns
+   - Query examples
+   - Best practices
+
+## Performance Implications
+
+**Current Approach (Python filtering):**
+- O(n) scan of all collection headers
+- Memory load of all records
+- ~1-10ms for small datasets, >100ms for large
+
+**Proposed Approach (Lance filtering):**
+- Index-assisted filtering
+- Lazy loading of results
+- <1ms for most queries
+
+## Decision Criteria
+
+Choose Option 1 because:
+1. No schema changes required
+2. Immediate performance benefits
+3. Maintains backward compatibility
+4. Leverages existing, tested fields
+5. Relationships provide future flexibility
+
+## Next Steps
+
+1. Update collection tools to use collection_id for parent storage
+2. Implement bidirectional relationships
+3. Update queries to use Lance native filters
+4. Add migration utility for existing collections
+5. Update tests to reflect new patterns
\ No newline at end of file
diff --git a/.claude/implementations/phase3.3_collection_management.md b/.claude/implementations/phase3.3_collection_management.md
new file mode 100644
index 0000000..08e07e6
--- /dev/null
+++ b/.claude/implementations/phase3.3_collection_management.md
@@ -0,0 +1,329 @@
+# Phase 3.3: Collection Management Tools
+
+## Overview
+
+Phase 3.3 implements comprehensive collection management capabilities for the MCP server, enabling users to organize documents into logical groups with metadata, hierarchies, and templates. Collections provide a powerful way to manage related documents, track relationships, and apply consistent metadata across document sets.
+
+## Core Concepts
+
+### What is a Collection?
+
+A collection in ContextFrame is a logical grouping of documents with:
+- A header document (record_type: collection_header) containing metadata
+- Member documents that belong to the collection
+- Optional hierarchy (collections can contain sub-collections)
+- Shared metadata that can be applied to all members
+- Statistics and analytics about the collection contents
+
+### Collection Use Cases
+
+1. **Project Documentation**: Group all docs for a project (README, API docs, guides)
+2. **Research Papers**: Organize papers by topic, author, or conference
+3. **Knowledge Base**: Hierarchical organization of support articles
+4. **Training Data**: Curated sets of documents for ML/AI training
+5. **Version Control**: Group documents by release or version
+
+## Implementation Timeline
+
+**Duration**: 1 week (Part of Phase 3, Week 3)
+
+1. **Day 1-2**: Schema design and core data structures
+2. **Day 3-4**: CRUD operations implementation
+3. **Day 5**: Statistics and analytics
+4. **Day 6**: Template system
+5. **Day 7**: Testing and documentation
+
+## Tool Specifications
+
+### 1. create_collection
+
+Creates a new collection with header and initial configuration.
+
+**Schema**:
+```python
+class CreateCollectionParams(BaseModel):
+    name: str = Field(..., description="Collection name")
+    description: Optional[str] = Field(None, description="Collection description")
+    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Collection metadata")
+    parent_collection: Optional[str] = Field(None, description="Parent collection ID for hierarchies")
+    template: Optional[str] = Field(None, description="Template name to apply")
+    initial_members: Optional[List[str]] = Field(default_factory=list, description="Document IDs to add")
+```
+
+**Response**:
+```python
+{
+    "collection_id": "uuid",
+    "header_id": "uuid",
+    "name": "Project Alpha Docs",
+    "created_at": "2024-01-15T10:00:00Z",
+    "member_count": 0,
+    "metadata": {...}
+}
+```
+
+### 2. update_collection
+
+Updates collection properties and metadata.
+
+**Schema**:
+```python
+class UpdateCollectionParams(BaseModel):
+    collection_id: str = Field(..., description="Collection ID to update")
+    name: Optional[str] = Field(None, description="New name")
+    description: Optional[str] = Field(None, description="New description")
+    metadata_updates: Optional[Dict[str, Any]] = Field(None, description="Metadata to update")
+    add_members: Optional[List[str]] = Field(default_factory=list, description="Document IDs to add")
+    remove_members: Optional[List[str]] = Field(default_factory=list, description="Document IDs to remove")
+```
+
+### 3. delete_collection
+
+Deletes a collection and optionally its members.
+
+**Schema**:
+```python
+class DeleteCollectionParams(BaseModel):
+    collection_id: str = Field(..., description="Collection ID to delete")
+    delete_members: bool = Field(False, description="Also delete member documents")
+    recursive: bool = Field(False, description="Delete sub-collections recursively")
+```
+
+### 4. list_collections
+
+Lists all collections with filtering and statistics.
+
+**Schema**:
+```python
+class ListCollectionsParams(BaseModel):
+    parent_id: Optional[str] = Field(None, description="Filter by parent collection")
+    include_stats: bool = Field(True, description="Include member statistics")
+    include_empty: bool = Field(True, description="Include collections with no members")
+    sort_by: Literal["name", "created_at", "member_count"] = Field("name")
+    limit: int = Field(100, description="Maximum collections to return")
+```
+
+### 5. move_documents
+
+Moves documents between collections.
+
+**Schema**:
+```python
+class MoveDocumentsParams(BaseModel):
+    document_ids: List[str] = Field(..., description="Documents to move")
+    source_collection: Optional[str] = Field(None, description="Source collection (None for uncollected)")
+    target_collection: Optional[str] = Field(None, description="Target collection (None to remove from collection)")
+    update_metadata: bool = Field(True, description="Apply target collection metadata")
+```
+
+### 6. get_collection_stats
+
+Gets detailed analytics for a collection.
+
+**Schema**:
+```python
+class GetCollectionStatsParams(BaseModel):
+    collection_id: str = Field(..., description="Collection ID")
+    include_member_details: bool = Field(False, description="Include per-member statistics")
+    include_subcollections: bool = Field(True, description="Include sub-collection stats")
+```
+
+**Response**:
+```python
+{
+    "collection_id": "uuid",
+    "name": "Research Papers 2024",
+    "statistics": {
+        "total_members": 150,
+        "direct_members": 120,
+        "subcollection_members": 30,
+        "total_size_bytes": 25000000,
+        "avg_document_size": 166666,
+        "unique_tags": ["ml", "nlp", "cv"],
+        "date_range": {
+            "earliest": "2024-01-01",
+            "latest": "2024-01-15"
+        },
+        "member_types": {
+            "document": 145,
+            "collection_header": 5
+        }
+    },
+    "subcollections": [...],
+    "metadata": {...}
+}
+```
+
+## Collection Templates
+
+Templates provide pre-configured collection structures for common use cases.
+
+### Built-in Templates
+
+1. **project_template**:
+   - README → overview
+   - src/ → implementation
+   - docs/ → documentation
+   - tests/ → validation
+
+2. **research_template**:
+   - Papers by year
+   - Papers by topic
+   - Papers by author
+   - Citations network
+
+3. **knowledge_base_template**:
+   - Categories → subcategories → articles
+   - FAQ section
+   - Troubleshooting guides
+
+### Template Schema
+
+```python
+class CollectionTemplate(BaseModel):
+    name: str
+    description: str
+    structure: Dict[str, Any]  # Hierarchical structure definition
+    default_metadata: Dict[str, Any]
+    naming_pattern: str  # e.g., "{year}-{topic}"
+    auto_organize_rules: List[Dict[str, Any]]  # Rules for auto-organization
+```
+
+## Implementation Details
+
+### File Structure
+
+```
+contextframe/mcp/collections/
+├── __init__.py
+├── tools.py          # Collection tool implementations
+├── schemas.py        # Collection-specific schemas
+├── templates.py      # Template definitions and loader
+└── stats.py         # Statistics calculation utilities
+```
+
+### Core Classes
+
+```python
+# collections/tools.py
+class CollectionTools:
+    """Collection management tools for MCP server."""
+    
+    def __init__(self, dataset: FrameDataset, transport: TransportAdapter):
+        self.dataset = dataset
+        self.transport = transport
+        self.templates = TemplateRegistry()
+    
+    async def create_collection(self, params: Dict[str, Any]) -> Dict[str, Any]:
+        """Create a new collection with header."""
+        ...
+    
+    async def update_collection(self, params: Dict[str, Any]) -> Dict[str, Any]:
+        """Update collection properties."""
+        ...
+    
+    # ... other tool methods
+```
+
+### Collection Header Schema
+
+Collections use a special header document with record_type="collection_header":
+
+```python
+{
+    "record_type": "collection_header",
+    "title": "Collection Name",
+    "context": "Collection description and purpose",
+    "metadata": {
+        "collection_metadata": {
+            "created_at": "2024-01-15T10:00:00Z",
+            "template": "project_template",
+            "parent_collection": "parent-uuid",
+            "member_count": 42,
+            "total_size": 1048576,
+            "shared_metadata": {
+                # Metadata applied to all members
+            }
+        }
+    },
+    "relationships": [
+        {
+            "type": "contains",
+            "target": "member-doc-uuid",
+            "title": "Member document"
+        }
+    ]
+}
+```
+
+### Relationship Management
+
+Collections use the existing relationship system:
+- Header → Members: "contains" relationship
+- Members → Header: "member_of" relationship
+- Collection → Subcollection: "parent/child" relationships
+
+## Testing Strategy
+
+### Unit Tests
+
+1. Test each collection tool independently
+2. Mock FrameDataset operations
+3. Verify schema validation
+4. Test error handling
+
+### Integration Tests
+
+1. Full collection lifecycle (create, update, add members, stats, delete)
+2. Hierarchical collections
+3. Template application
+4. Bulk operations with collections
+5. Cross-transport compatibility
+
+### Test Scenarios
+
+```python
+# test_collection_tools.py
+class TestCollectionTools:
+    async def test_create_project_collection(self):
+        """Test creating a project collection with template."""
+        
+    async def test_move_documents_between_collections(self):
+        """Test moving documents preserves relationships."""
+        
+    async def test_collection_statistics(self):
+        """Test accurate statistics calculation."""
+        
+    async def test_hierarchical_collections(self):
+        """Test parent/child collection relationships."""
+```
+
+## Security Considerations
+
+1. **Access Control**: Collections can have access restrictions
+2. **Metadata Validation**: Ensure metadata doesn't leak sensitive info
+3. **Size Limits**: Prevent collections from growing too large
+4. **Recursive Operations**: Limit depth to prevent DoS
+
+## Performance Optimizations
+
+1. **Lazy Loading**: Don't load all members unless requested
+2. **Cached Statistics**: Cache collection stats with TTL
+3. **Batch Operations**: Optimize bulk membership changes
+4. **Index Usage**: Use collection indexes for fast queries
+
+## Success Metrics
+
+- Create collection in < 100ms
+- List 1000 collections in < 500ms
+- Move 100 documents in < 1s
+- Calculate stats for 10k member collection in < 2s
+- Template application in < 200ms
+
+## Migration Path
+
+For existing datasets without collections:
+1. Auto-create "Uncategorized" collection
+2. Provide migration tool to organize existing docs
+3. Preserve all existing relationships
+4. No breaking changes to document structure
\ No newline at end of file
diff --git a/contextframe/mcp/batch/tools.py b/contextframe/mcp/batch/tools.py
index f4dcce1..ddc3705 100644
--- a/contextframe/mcp/batch/tools.py
+++ b/contextframe/mcp/batch/tools.py
@@ -144,10 +144,7 @@ async def batch_add(self, params: Dict[str, Any]) -> Dict[str, Any]:
             # Create record
             record = FrameRecord(
                 text_content=content,
-                metadata=metadata,
-                collection=shared.get("collection"),
-                chunk_index=0,
-                total_chunks=1
+                metadata=metadata
             )
             
             # Generate embeddings if requested
diff --git a/contextframe/mcp/tools.py b/contextframe/mcp/tools.py
index bb7fbee..ddb0091 100644
--- a/contextframe/mcp/tools.py
+++ b/contextframe/mcp/tools.py
@@ -67,6 +67,16 @@ def __init__(self, dataset: FrameDataset, transport: Optional[Any] = None):
                 batch_tools.register_tools(self)
             except ImportError:
                 logger.warning("Batch tools not available")
+            
+            # Register collection tools
+            try:
+                from contextframe.mcp.collections.tools import CollectionTools
+                from contextframe.mcp.collections.templates import TemplateRegistry
+                template_registry = TemplateRegistry()
+                collection_tools = CollectionTools(dataset, transport, template_registry)
+                collection_tools.register_tools(self)
+            except ImportError:
+                logger.warning("Collection tools not available")
 
     def _register_default_tools(self):
         """Register the default set of tools."""