diff --git a/CLAUDE.md b/CLAUDE.md index b58ffc7..cfacf5b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -107,6 +107,11 @@ Use these commands: - **Clean logic**: Keep core logic clean and push implementation details to the edges - **File Organsiation**: Balance file organization with simplicity - use an appropriate number of files for the project scale - **Input Handling**: Use `prompt-toolkit` for enhanced terminal input with arrow key navigation and history support +- **YAML Security**: Always use `yaml.safe_load()` instead of `yaml.load()` to prevent code injection +- **Error Handling**: Use specific exception types and provide meaningful error messages; avoid silent failures +- **Metadata Validation**: Validate user-provided metadata to prevent oversized or malicious content +- **Performance**: Consider lazy loading and caching for operations that read multiple files +- **Backward Compatibility**: Maintain fallback mechanisms when introducing new data formats ## Project Architecture @@ -118,11 +123,16 @@ Use these commands: **Key Components:** - Configuration management with YAML and environment variable support -- Chat history persistence in markdown format +- Chat history persistence in markdown format with YAML frontmatter metadata - Interactive chat sessions with commands (/help, /save, etc.) - Enhanced input handling with arrow key navigation and message history - Comprehensive test suite with unit and integration tests +**Chat History Format:** +- New format uses YAML frontmatter for metadata (conversation_id, title, timestamps, tags) +- Maintains backward compatibility with legacy HTML comment format +- Metadata includes: conversation_id, created, updated, title, tags, summaries_count + ## Configuration Configuration follows this precedence (highest to lowest): diff --git a/nova/core/history.py b/nova/core/history.py index 845e342..a732f2c 100644 --- a/nova/core/history.py +++ b/nova/core/history.py @@ -1,11 +1,16 @@ """Chat history management with markdown save/load""" +import logging import re from datetime import datetime from pathlib import Path +import yaml + from nova.models.message import Conversation, Message, MessageRole +logger = logging.getLogger(__name__) + class HistoryError(Exception): """History-related errors""" @@ -13,6 +18,61 @@ class HistoryError(Exception): pass +def _validate_metadata(metadata: dict) -> dict: + """Validate and sanitize metadata from YAML frontmatter""" + allowed_keys = { + "conversation_id", + "created", + "updated", + "title", + "tags", + "summaries_count", + } + max_title_length = 200 + max_tag_count = 50 + max_tag_length = 100 + + validated = {} + + for key, value in metadata.items(): + if key not in allowed_keys: + logger.warning(f"Ignoring unknown metadata key: {key}") + continue + + if key == "title" and isinstance(value, str): + # Validate and truncate title + validated[key] = value.strip()[:max_title_length] + elif key in ["created", "updated"] and isinstance(value, str): + # Validate ISO format timestamps + try: + datetime.fromisoformat(value) + validated[key] = value + except ValueError: + logger.warning(f"Invalid timestamp format for {key}: {value}") + elif key == "conversation_id" and isinstance(value, str): + # Sanitize conversation_id + validated[key] = re.sub(r"[^\w\-]", "_", value.strip()) + elif key == "tags" and isinstance(value, list): + # Validate tags list + clean_tags = [] + for tag in value[:max_tag_count]: # Limit number of tags + if isinstance(tag, str): + clean_tag = tag.strip()[:max_tag_length] + if clean_tag: + clean_tags.append(clean_tag) + validated[key] = clean_tags + elif key == "summaries_count" and isinstance(value, int): + # Validate summaries count + if 0 <= value <= 1000: # Reasonable limit + validated[key] = value + else: + # For other valid keys, store as-is if basic type check passes + if isinstance(value, str | int | list): + validated[key] = value + + return validated + + class HistoryManager: """Manages chat history persistence in markdown format""" @@ -74,32 +134,17 @@ def list_conversations(self) -> list[tuple[Path, str, datetime]]: else: timestamp = datetime.fromtimestamp(filepath.stat().st_mtime) - # Extract title from file (first non-metadata line) - with open(filepath, encoding="utf-8") as f: - lines = f.readlines() - - title = "Untitled" - for line in lines: - line = line.strip() - if ( - line - and not line.startswith("") - lines.append(f"") - lines.append(f"") - lines.append(f"") + # YAML frontmatter header + metadata = { + "conversation_id": conversation.id, + "created": conversation.created_at.isoformat(), + "updated": conversation.updated_at.isoformat(), + } if conversation.title: - lines.append(f"") + metadata["title"] = conversation.title + + if conversation.tags: + metadata["tags"] = list(conversation.tags) + if conversation.summaries: + metadata["summaries_count"] = len(conversation.summaries) + + lines.append("---") + lines.append(yaml.dump(metadata, default_flow_style=False).strip()) + lines.append("---") lines.append("") # Title @@ -251,35 +306,172 @@ def _extract_meaningful_title(self, content: str) -> str: first_sentence = re.sub(r"\s+", " ", first_sentence).strip() return first_sentence - def _markdown_to_conversation( - self, content: str, conversation_id: str - ) -> Conversation: - """Parse markdown content back to conversation""" + def _parse_yaml_frontmatter(self, content: str) -> tuple[dict, str]: + """Parse YAML frontmatter and return validated metadata and remaining content""" + if not content.startswith("---\n"): + return {}, content lines = content.split("\n") + frontmatter_end = -1 + + # Find the end of the frontmatter + for i, line in enumerate(lines[1:], 1): # Skip first "---" + if line.strip() == "---": + frontmatter_end = i + break - # Parse metadata + if frontmatter_end <= 0: + logger.warning("YAML frontmatter found but no closing delimiter") + return {}, content + + try: + frontmatter_text = "\n".join(lines[1:frontmatter_end]) + frontmatter = yaml.safe_load(frontmatter_text) + + if frontmatter is None: + # Empty YAML frontmatter is valid, return empty metadata + remaining_content = "\n".join(lines[frontmatter_end + 1 :]) + return {}, remaining_content + elif not isinstance(frontmatter, dict): + logger.warning("YAML frontmatter must be a dictionary") + return {}, content + + # Validate and sanitize metadata + validated_metadata = _validate_metadata(frontmatter) + remaining_content = "\n".join(lines[frontmatter_end + 1 :]) + + return validated_metadata, remaining_content + + except yaml.YAMLError as e: + logger.warning(f"Invalid YAML frontmatter: {e}") + return {}, content + except Exception as e: + logger.error(f"Unexpected error parsing YAML frontmatter: {e}") + return {}, content + + def _parse_legacy_metadata(self, content: str) -> dict: + """Parse legacy HTML comment metadata format""" metadata = {} + lines = content.split("\n") + for line in lines: if line.startswith(""): comment = line[5:-4] if ":" in comment: key, value = comment.split(":", 1) - metadata[key.strip()] = value.strip() + # Map old keys to new YAML format + old_key = key.strip() + if old_key == "Conversation ID": + metadata["conversation_id"] = value.strip() + elif old_key == "Created": + metadata["created"] = value.strip() + elif old_key == "Updated": + metadata["updated"] = value.strip() + elif old_key == "Title": + metadata["title"] = value.strip() + + # Still validate legacy metadata + return _validate_metadata(metadata) + + def _extract_title_efficiently(self, filepath: Path) -> str: + """Extract title efficiently by reading only the beginning of file""" + try: + with open(filepath, encoding="utf-8") as f: + # Read only first 1KB for title extraction + partial_content = f.read(1024) + + if partial_content.startswith("---\n"): + # Parse only YAML frontmatter + metadata, _ = self._parse_yaml_frontmatter(partial_content) + if "title" in metadata: + return metadata["title"][:50] + + # Fallback to content-based title extraction + return self._extract_title_from_content(partial_content) + + except (OSError, UnicodeDecodeError) as e: + logger.warning(f"Error reading file {filepath}: {e}") + return "Untitled" + + def _extract_title_from_content(self, content: str) -> str: + """Extract title from content when no frontmatter title exists""" + lines = content.split("\n") + + # Look for the first real markdown H1 header (outside of YAML frontmatter) + skip_until_content = False + if content.strip().startswith("---"): + skip_until_content = True + + for line in lines: + stripped = line.strip() + + # If we started with ---, wait for the content section + if skip_until_content: + if stripped == "---": + skip_until_content = False # Found closing, now look for content + continue + + # Look for markdown H1 header + if stripped.startswith("# ") and not stripped.startswith("##"): + # Make sure it's not a comment inside YAML + return stripped[2:].strip()[:50] # Remove '# ' prefix + + # Fallback to first non-metadata content + skip_until_content = False + if content.strip().startswith("---"): + skip_until_content = True + + for line in lines: + stripped = line.strip() + + # If we started with ---, wait for the content section + if skip_until_content: + if stripped == "---": + skip_until_content = False + continue + + # Skip empty lines, comments, and headers + if ( + not stripped + or stripped.startswith("" in markdown - assert f"" in markdown - assert f"" in markdown + # Check YAML frontmatter + assert "---" in markdown + assert f"conversation_id: {sample_conversation.id}" in markdown + assert f"title: {sample_conversation.title}" in markdown + assert "created:" in markdown + assert "updated:" in markdown # Check title assert f"# {sample_conversation.title}" in markdown @@ -441,3 +443,232 @@ def test_markdown_headings_preserved_in_loaded_conversation(self, history_dir): # Verify the full content structure is preserved assert "This is a basic heading." in loaded_assistant_message.content assert "horizontal rule" in loaded_assistant_message.content + + def test_malformed_yaml_frontmatter_handling(self, history_dir): + """Test handling of malformed YAML frontmatter""" + malformed_content = """--- +title: "Test +# Missing closing quote and invalid YAML +created: 2024-01-01 +invalid_structure: [unclosed +--- + +# Test Conversation + +## User (10:00:00) +Hello +""" + + test_file = history_dir / "malformed.md" + test_file.write_text(malformed_content) + + manager = HistoryManager(history_dir) + # Should fall back gracefully without crashing + conversation = manager.load_conversation(test_file) + # Should extract title from content when YAML parsing fails + assert conversation.title is not None # Should generate fallback title + assert len(conversation.messages) == 1 # Should still parse messages + + def test_yaml_frontmatter_metadata_validation(self, history_dir): + """Test that YAML frontmatter metadata is properly validated""" + # Create oversized tag list + oversized_tags = ["tag"] * 100 + oversized_title = "A" * 300 + + # Test oversized title and invalid data + oversized_content = f"""--- +title: "{oversized_title}" +created: "not-a-date" +conversation_id: "test/../malicious" +tags: {oversized_tags} +summaries_count: -5 +unknown_key: "should be ignored" +--- + +# Test Conversation + +## User (10:00:00) +Hello +""" + + test_file = history_dir / "oversized.md" + test_file.write_text(oversized_content) + + manager = HistoryManager(history_dir) + conversation = manager.load_conversation(test_file) + + # Title should be truncated + assert len(conversation.title or "") <= 200 + # Conversation ID should be sanitized + assert conversation.id == "test____malicious" + # Tags should be limited + assert len(conversation.tags) <= 50 + + def test_yaml_frontmatter_security_safe_load(self, history_dir): + """Test that YAML frontmatter uses safe loading""" + # This would be dangerous with yaml.load but safe with yaml.safe_load + potentially_malicious_content = """--- +title: !!python/object/apply:builtins.print ["This should not execute"] +created: 2024-01-01T10:00:00 +--- + +# Test Conversation + +## User (10:00:00) +Hello +""" + + test_file = history_dir / "potentially_malicious.md" + test_file.write_text(potentially_malicious_content) + + manager = HistoryManager(history_dir) + # Should handle safely without executing code + conversation = manager.load_conversation(test_file) + # The dangerous YAML should be ignored/rejected + assert conversation.title != "This should not execute" + + def test_incomplete_yaml_frontmatter(self, history_dir): + """Test handling of incomplete YAML frontmatter""" + incomplete_content = """--- +title: "Test Conversation" +created: 2024-01-01T10:00:00 +# Missing closing --- delimiter + +# Test Conversation + +## User (10:00:00) +Hello +""" + + test_file = history_dir / "incomplete.md" + test_file.write_text(incomplete_content) + + manager = HistoryManager(history_dir) + # Should fall back to content-based parsing and ignore the invalid YAML + conversation = manager.load_conversation(test_file) + # Since YAML parsing fails, it should extract from content (which might include comment) + assert "Missing closing" in conversation.title or conversation.title == "Hello" + + def test_mixed_yaml_and_html_comments(self, history_dir): + """Test files with both YAML frontmatter and HTML comments""" + mixed_content = """--- +title: "YAML Title" +created: 2024-01-01T10:00:00 +--- + + + + +# Test Conversation + +## User (10:00:00) +Hello +""" + + test_file = history_dir / "mixed.md" + test_file.write_text(mixed_content) + + manager = HistoryManager(history_dir) + conversation = manager.load_conversation(test_file) + # YAML frontmatter should take precedence + assert conversation.title == "YAML Title" + + def test_file_encoding_error_handling(self, history_dir): + """Test handling of files with encoding issues""" + # Create a file with invalid UTF-8 + binary_file = history_dir / "binary.md" + binary_file.write_bytes(b"\xff\xfe Invalid UTF-8 \xff") + + manager = HistoryManager(history_dir) + conversations = manager.list_conversations() + # Should not crash and should exclude the problematic file + assert all(isinstance(conv[1], str) for conv in conversations) + + def test_large_file_performance(self, history_dir): + """Test that list_conversations is efficient with large files""" + # Create a file with large content but small frontmatter + large_content = ( + """--- +title: "Test Large File" +created: 2024-01-01T10:00:00 +--- + +# Test Conversation + +## User (10:00:00) +Hello + +## Assistant (10:00:01) +""" + + "This is a very long response. " * 10000 + + """ + +## User (10:01:00) +Continue +""" + ) + + test_file = history_dir / "large.md" + test_file.write_text(large_content) + + import time + + manager = HistoryManager(history_dir) + + start_time = time.time() + conversations = manager.list_conversations() + end_time = time.time() + + # Should complete quickly (less than 1 second for this test) + assert end_time - start_time < 1.0 + # Should still extract the title correctly + large_conv = next((c for c in conversations if c[0].name == "large.md"), None) + assert large_conv is not None + assert large_conv[1] == "Test Large File" + + def test_datetime_parsing_error_handling(self, history_dir): + """Test handling of invalid datetime formats""" + invalid_datetime_content = """--- +title: "Test Conversation" +created: "not-a-valid-datetime" +updated: "2024-13-45T25:70:80" # Invalid date components +--- + +# Test Conversation + +## User (10:00:00) +Hello +""" + + test_file = history_dir / "invalid_dates.md" + test_file.write_text(invalid_datetime_content) + + manager = HistoryManager(history_dir) + conversation = manager.load_conversation(test_file) + + # Should use current datetime for invalid timestamps + assert conversation.created_at is not None + assert conversation.updated_at is not None + # Title should still be preserved + assert conversation.title == "Test Conversation" + + def test_empty_yaml_frontmatter(self, history_dir): + """Test handling of empty YAML frontmatter""" + empty_frontmatter_content = """--- +--- + +# Test Conversation + +## User (10:00:00) +Hello +""" + + test_file = history_dir / "empty_frontmatter.md" + test_file.write_text(empty_frontmatter_content) + + manager = HistoryManager(history_dir) + conversation = manager.load_conversation(test_file) + + # Should parse successfully with default values + assert conversation.title == "Test Conversation" # From content + assert len(conversation.messages) == 1