diff --git a/.env.example b/.env.example index b12e987..7c5147b 100644 --- a/.env.example +++ b/.env.example @@ -35,6 +35,14 @@ ACI_VECTOR_STORE_VECTOR_SIZE=1024 ACI_INDEXING_MAX_WORKERS=8 ACI_INDEXING_MAX_CHUNK_TOKENS=8192 ACI_INDEXING_CHUNK_OVERLAP_LINES=2 +# Tokenizer strategy for chunk token counting. +# Use "character" or "simple" for Ollama/BERT-based models (e.g. nomic-embed-text, +# mxbai-embed-large) to avoid the tiktoken/WordPiece mismatch that causes +# "input length exceeds context length" errors. +# tiktoken - OpenAI BPE (default, accurate for OpenAI models) +# character - len(text)/4 estimate (conservative, works with any model) +# simple - whitespace split (for generic non-BPE models) +ACI_TOKENIZER=tiktoken # Optional comma-separated lists: # ACI_INDEXING_FILE_EXTENSIONS=.py,.js,.ts,.go # ACI_INDEXING_IGNORE_PATTERNS=__pycache__,*.pyc,.git,node_modules diff --git a/src/aci/cli/__init__.py b/src/aci/cli/__init__.py index efb018d..4516db7 100644 --- a/src/aci/cli/__init__.py +++ b/src/aci/cli/__init__.py @@ -22,7 +22,11 @@ from rich.syntax import Syntax from rich.table import Table -from aci.core.path_utils import get_collection_name_for_path, validate_indexable_path +from aci.core.path_utils import ( + get_collection_name_for_path, + resolve_file_filter_pattern, + validate_indexable_path, +) from aci.infrastructure import GrepSearcher from aci.infrastructure.codebase_registry import ( CodebaseRegistryStore, @@ -51,7 +55,12 @@ context_settings={"color": False}, ) -def get_services(): +def _project_metadata_db_path(path: Path) -> Path: + """Return the metadata DB path scoped to a project root.""" + return path.resolve() / ".aci" / "index.db" + + +def get_services(metadata_db_path: Path | None = None): """ Initialize services from .env with config-driven settings. 
@@ -63,7 +72,7 @@ def get_services(): Tuple of (config, embedding_client, vector_store, metadata_store, file_scanner, chunker, reranker) """ - container = create_services() + container = create_services(metadata_db_path=metadata_db_path) return ( container.config, container.embedding_client, @@ -100,7 +109,7 @@ def index( file_scanner, chunker, reranker, - ) = get_services() + ) = get_services(metadata_db_path=_project_metadata_db_path(path)) # Use config workers if not overridden by CLI actual_workers = workers if workers is not None else cfg.indexing.max_workers @@ -217,6 +226,9 @@ def search( ): """Search the indexed codebase.""" try: + # Determine the search base path + search_base = path if path is not None else Path.cwd() + ( cfg, embedding_client, @@ -225,10 +237,7 @@ def search( file_scanner, chunker, config_reranker, - ) = get_services() - - # Determine the search base path - search_base = path if path is not None else Path.cwd() + ) = get_services(metadata_db_path=_project_metadata_db_path(search_base)) # Use centralized repository resolution for path validation and collection name resolution = resolve_repository(search_base, metadata_store) @@ -236,6 +245,7 @@ def search( console.print(f"[bold red]Error:[/bold red] {resolution.error_message}") raise typer.Exit(1) collection_name = resolution.collection_name + normalized_file_filter = resolve_file_filter_pattern(file_filter, resolution.indexed_root) # Use config values if not overridden by CLI actual_limit = limit if limit is not None else cfg.search.default_limit @@ -291,7 +301,7 @@ def search( search_service.search( query=query, limit=actual_limit, - file_filter=file_filter, # User-provided filter only + file_filter=normalized_file_filter, use_rerank=use_rerank and reranker is not None, search_mode=search_mode, # Pass search mode collection_name=collection_name, # Pass explicitly, no state mutation @@ -375,7 +385,7 @@ def update( file_scanner, chunker, reranker, - ) = get_services() + ) = 
get_services(metadata_db_path=_project_metadata_db_path(path))
 
         with Progress(
             SpinnerColumn(),
diff --git a/src/aci/core/path_utils.py b/src/aci/core/path_utils.py
index a1ad9ac..bcbc8b4 100644
--- a/src/aci/core/path_utils.py
+++ b/src/aci/core/path_utils.py
@@ -173,6 +173,56 @@ def parse_runtime_path_mappings(raw_value: str | None) -> list[RuntimePathMappin
     return mappings
 
 
+def resolve_file_filter_pattern(
+    file_filter: str | None, indexed_root: str | Path | None
+) -> str | None:
+    """Resolve relative file-filter prefixes against the indexed root path.
+
+    Keeps broad wildcard-only patterns unchanged (e.g. ``*.py`` or ``**/*.py``),
+    but expands relative directory-prefixed patterns (e.g. ``src/**/*.py``)
+    to absolute patterns rooted at ``indexed_root``.
+
+    Args:
+        file_filter: User-supplied glob pattern; may be ``None`` or empty.
+        indexed_root: Root directory that relative patterns are resolved against.
+
+    Returns:
+        ``file_filter`` unchanged when it is falsy or ``indexed_root`` is
+        ``None``; the stripped pattern when it is absolute, has no directory
+        part, or starts with a wildcard; otherwise an absolute pattern under
+        the resolved ``indexed_root``.
+    """
+    if not file_filter:
+        return file_filter
+    if indexed_root is None:
+        return file_filter
+
+    raw_filter = file_filter.strip()
+    if not raw_filter:
+        return raw_filter
+
+    # Already absolute (POSIX, Windows drive, or UNC path).
+    if raw_filter.startswith("/") or _looks_like_windows_path(raw_filter) or raw_filter.startswith("\\\\"):
+        return raw_filter
+
+    normalized = raw_filter.replace("\\", "/")
+    has_directory_prefix = "/" in normalized
+    starts_with_wildcard = normalized.startswith(("*", "?", "["))
+    if not has_directory_prefix or starts_with_wildcard:
+        return raw_filter
+
+    # Drop only whole leading "./" segments. ``str.lstrip("./")`` strips a
+    # *character set*, which would also eat the dot of hidden directories
+    # (".github/..." -> "github/...") and silently discard "../".
+    relative_filter = normalized
+    while relative_filter.startswith("./"):
+        relative_filter = relative_filter[2:].lstrip("/")
+    if not relative_filter:
+        return raw_filter
+
+    return str(Path(indexed_root).resolve() / Path(relative_filter))
+
+
 def _apply_runtime_path_mapping(
     path_str: str,
     path_mappings: Sequence[RuntimePathMapping],
diff --git a/src/aci/http_server.py b/src/aci/http_server.py
index e17cb9e..51bead8 100644
--- a/src/aci/http_server.py
+++ b/src/aci/http_server.py
@@ -11,7 +11,11 @@
 from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel
 
-from aci.core.path_utils import get_collection_name_for_path, is_system_directory
+from aci.core.path_utils import (
+    get_collection_name_for_path,
+    is_system_directory,
+    resolve_file_filter_pattern,
+)
 from aci.core.watch_config import WatchConfig
 from aci.infrastructure.codebase_registry import best_effort_update_registry
 from aci.infrastructure.file_watcher import FileWatcher
@@ -330,6 +334,7 @@ async def search(
         raise HTTPException(status_code=400, detail=resolution.error_message)
 
     collection_name = resolution.collection_name
+    normalized_file_filter = resolve_file_filter_pattern(file_filter, resolution.indexed_root)
 
     apply_rerank = cfg.search.use_rerank if use_rerank is None else use_rerank
 
@@ -363,7 +368,7 @@ async def search(
     results = await search_service.search(
         query=q,
         limit=limit,
-        file_filter=file_filter,
+        file_filter=normalized_file_filter,
         use_rerank=apply_rerank,
         search_mode=search_mode,
         collection_name=collection_name,
diff --git a/src/aci/mcp/handlers.py b/src/aci/mcp/handlers.py
index 3f3981d..324fed8 100644
--- a/src/aci/mcp/handlers.py
+++ b/src/aci/mcp/handlers.py
@@ -11,6 +11,7 @@
 from
aci.core.path_utils import ( RuntimePathResolutionResult, get_collection_name_for_path, + resolve_file_filter_pattern, resolve_runtime_path, validate_indexable_path, ) @@ -243,6 +244,8 @@ async def _handle_search_code(arguments: dict, ctx: MCPContext) -> list[TextCont f"Valid types: {', '.join(sorted(valid_artifact_types))}" )] + normalized_file_filter = resolve_file_filter_pattern(file_filter, indexed_root) + # Request more results if filtering by subdirectory (to ensure enough after filtering) fetch_limit = limit * 3 if path_prefix_filter else limit @@ -250,7 +253,7 @@ async def _handle_search_code(arguments: dict, ctx: MCPContext) -> list[TextCont results = await search_service.search( query=query, limit=fetch_limit, - file_filter=file_filter, + file_filter=normalized_file_filter, use_rerank=use_rerank, search_mode=search_mode, collection_name=collection_name, diff --git a/tests/unit/test_chunker_core.py b/tests/unit/test_chunker_core.py index 73870af..8f97e73 100644 --- a/tests/unit/test_chunker_core.py +++ b/tests/unit/test_chunker_core.py @@ -17,7 +17,7 @@ get_import_registry, ) from aci.core.file_scanner import ScannedFile -from aci.core.tokenizer import get_default_tokenizer +from aci.core.tokenizer import CharacterTokenizer class TestCodeChunk: @@ -143,7 +143,7 @@ class TestChunker: @pytest.fixture def tokenizer(self): - return get_default_tokenizer() + return CharacterTokenizer() @pytest.fixture def chunker(self, tokenizer): diff --git a/tests/unit/test_chunker_splitter.py b/tests/unit/test_chunker_splitter.py index 09e0a00..843f8fc 100644 --- a/tests/unit/test_chunker_splitter.py +++ b/tests/unit/test_chunker_splitter.py @@ -7,7 +7,7 @@ from aci.core.ast_parser import ASTNode, TreeSitterParser from aci.core.chunker import Chunker, SmartChunkSplitter from aci.core.file_scanner import ScannedFile -from aci.core.tokenizer import get_default_tokenizer +from aci.core.tokenizer import CharacterTokenizer class TestSmartChunkSplitter: @@ -15,7 +15,7 @@ class 
TestSmartChunkSplitter: @pytest.fixture def tokenizer(self): - return get_default_tokenizer() + return CharacterTokenizer() @pytest.fixture def splitter(self, tokenizer): @@ -195,7 +195,7 @@ class TestChunkerWithSmartSplitter: @pytest.fixture def tokenizer(self): - return get_default_tokenizer() + return CharacterTokenizer() @pytest.fixture def parser(self): diff --git a/tests/unit/test_cli_metadata_db_path.py b/tests/unit/test_cli_metadata_db_path.py new file mode 100644 index 0000000..0a89b3b --- /dev/null +++ b/tests/unit/test_cli_metadata_db_path.py @@ -0,0 +1,51 @@ +from pathlib import Path + +import pytest +import typer + +import aci.cli as cli + + +def test_project_metadata_db_path_is_scoped_to_project_root(tmp_path: Path) -> None: + nested = tmp_path / "repo" / "subdir" + nested.mkdir(parents=True) + + db_path = cli._project_metadata_db_path(nested) + + assert db_path == nested.resolve() / ".aci" / "index.db" + + +def test_index_uses_project_scoped_metadata_db_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + indexed_path = tmp_path / "repo" + indexed_path.mkdir() + captured: dict[str, Path | None] = {"metadata_db_path": None} + + def fake_get_services(metadata_db_path: Path | None = None): + captured["metadata_db_path"] = metadata_db_path + raise RuntimeError("stop") + + monkeypatch.setattr(cli, "get_services", fake_get_services) + + with pytest.raises(typer.Exit): + cli.index(path=indexed_path, workers=None) + + assert captured["metadata_db_path"] == indexed_path.resolve() / ".aci" / "index.db" + + +def test_search_uses_explicit_path_for_project_scoped_metadata_db_path( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + search_path = tmp_path / "repo" + search_path.mkdir() + captured: dict[str, Path | None] = {"metadata_db_path": None} + + def fake_get_services(metadata_db_path: Path | None = None): + captured["metadata_db_path"] = metadata_db_path + raise RuntimeError("stop") + + monkeypatch.setattr(cli, "get_services", 
fake_get_services) + + with pytest.raises(typer.Exit): + cli.search(query="hello", path=search_path) + + assert captured["metadata_db_path"] == search_path.resolve() / ".aci" / "index.db" diff --git a/tests/unit/test_incremental_update_scopes_to_root.py b/tests/unit/test_incremental_update_scopes_to_root.py index 1ec005b..0b97a8d 100644 --- a/tests/unit/test_incremental_update_scopes_to_root.py +++ b/tests/unit/test_incremental_update_scopes_to_root.py @@ -2,7 +2,9 @@ import tempfile from pathlib import Path +from aci.core.chunker import create_chunker from aci.core.file_scanner import FileScanner +from aci.core.tokenizer import CharacterTokenizer from aci.infrastructure.fakes import InMemoryVectorStore, LocalEmbeddingClient from aci.infrastructure.metadata_store import IndexMetadataStore from aci.services.indexing_service import IndexingService @@ -47,6 +49,7 @@ def test_update_incremental_does_not_delete_other_repo_metadata(): vector_store=vector_store, metadata_store=metadata_store, file_scanner=file_scanner, + chunker=create_chunker(tokenizer=CharacterTokenizer()), max_workers=1, ) diff --git a/tests/unit/test_runtime_path_resolution.py b/tests/unit/test_runtime_path_resolution.py index 2749f0b..60a1f13 100644 --- a/tests/unit/test_runtime_path_resolution.py +++ b/tests/unit/test_runtime_path_resolution.py @@ -107,3 +107,28 @@ def test_mcp_index_codebase_uses_resolved_runtime_path(tmp_path: Path): payload = json.loads(result[0].text) assert payload["requested_path"] == r"D:\workspace" assert payload["indexed_path"] == str(mounted_repo.resolve()) + + +def test_resolve_file_filter_pattern_keeps_wildcard_only_pattern(tmp_path: Path): + from aci.core.path_utils import resolve_file_filter_pattern + + resolved = resolve_file_filter_pattern("**/*.tsx", tmp_path) + + assert resolved == "**/*.tsx" + + +def test_resolve_file_filter_pattern_expands_relative_prefixed_pattern(tmp_path: Path): + from aci.core.path_utils import resolve_file_filter_pattern + + resolved = 
resolve_file_filter_pattern("apps/web/**/*.tsx", tmp_path) + + assert resolved == str(tmp_path.resolve() / "apps/web/**/*.tsx") + + +def test_resolve_file_filter_pattern_keeps_absolute_pattern(tmp_path: Path): + from aci.core.path_utils import resolve_file_filter_pattern + + absolute_pattern = str(tmp_path / "apps/**/*.tsx") + resolved = resolve_file_filter_pattern(absolute_pattern, "/another/root") + + assert resolved == absolute_pattern