Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ ACI_VECTOR_STORE_VECTOR_SIZE=1024
ACI_INDEXING_MAX_WORKERS=8
ACI_INDEXING_MAX_CHUNK_TOKENS=8192
ACI_INDEXING_CHUNK_OVERLAP_LINES=2
# Tokenizer strategy for chunk token counting.
# Use "character" or "simple" for Ollama/BERT-based models (e.g. nomic-embed-text,
# mxbai-embed-large) to avoid the tiktoken/WordPiece mismatch that causes
# "input length exceeds context length" errors.
# tiktoken - OpenAI BPE (default, accurate for OpenAI models)
# character - len(text)/4 estimate (conservative, works with any model)
# simple - whitespace split (for generic non-BPE models)
ACI_TOKENIZER=tiktoken
# Optional comma-separated lists:
# ACI_INDEXING_FILE_EXTENSIONS=.py,.js,.ts,.go
# ACI_INDEXING_IGNORE_PATTERNS=__pycache__,*.pyc,.git,node_modules
Expand Down
30 changes: 20 additions & 10 deletions src/aci/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@
from rich.syntax import Syntax
from rich.table import Table

from aci.core.path_utils import get_collection_name_for_path, validate_indexable_path
from aci.core.path_utils import (
get_collection_name_for_path,
resolve_file_filter_pattern,
validate_indexable_path,
)
from aci.infrastructure import GrepSearcher
from aci.infrastructure.codebase_registry import (
CodebaseRegistryStore,
Expand Down Expand Up @@ -51,7 +55,12 @@
context_settings={"color": False},
)

def get_services():
def _project_metadata_db_path(path: Path) -> Path:
"""Return the metadata DB path scoped to a project root."""
return path.resolve() / ".aci" / "index.db"


def get_services(metadata_db_path: Path | None = None):
"""
Initialize services from .env with config-driven settings.

Expand All @@ -63,7 +72,7 @@ def get_services():
Tuple of (config, embedding_client, vector_store, metadata_store,
file_scanner, chunker, reranker)
"""
container = create_services()
container = create_services(metadata_db_path=metadata_db_path)
return (
container.config,
container.embedding_client,
Expand Down Expand Up @@ -100,7 +109,7 @@ def index(
file_scanner,
chunker,
reranker,
) = get_services()
) = get_services(metadata_db_path=_project_metadata_db_path(path))

# Use config workers if not overridden by CLI
actual_workers = workers if workers is not None else cfg.indexing.max_workers
Expand Down Expand Up @@ -217,6 +226,9 @@ def search(
):
"""Search the indexed codebase."""
try:
# Determine the search base path
search_base = path if path is not None else Path.cwd()

(
cfg,
embedding_client,
Expand All @@ -225,17 +237,15 @@ def search(
file_scanner,
chunker,
config_reranker,
) = get_services()

# Determine the search base path
search_base = path if path is not None else Path.cwd()
) = get_services(metadata_db_path=_project_metadata_db_path(search_base))

# Use centralized repository resolution for path validation and collection name
resolution = resolve_repository(search_base, metadata_store)
if not resolution.valid:
console.print(f"[bold red]Error:[/bold red] {resolution.error_message}")
raise typer.Exit(1)
collection_name = resolution.collection_name
normalized_file_filter = resolve_file_filter_pattern(file_filter, resolution.indexed_root)

# Use config values if not overridden by CLI
actual_limit = limit if limit is not None else cfg.search.default_limit
Expand Down Expand Up @@ -291,7 +301,7 @@ def search(
search_service.search(
query=query,
limit=actual_limit,
file_filter=file_filter, # User-provided filter only
file_filter=normalized_file_filter,
use_rerank=use_rerank and reranker is not None,
search_mode=search_mode, # Pass search mode
collection_name=collection_name, # Pass explicitly, no state mutation
Expand Down Expand Up @@ -375,7 +385,7 @@ def update(
file_scanner,
chunker,
reranker,
) = get_services()
) = get_services(metadata_db_path=_project_metadata_db_path(path))

with Progress(
SpinnerColumn(),
Expand Down
35 changes: 35 additions & 0 deletions src/aci/core/path_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,41 @@ def parse_runtime_path_mappings(raw_value: str | None) -> list[RuntimePathMappin
return mappings


def resolve_file_filter_pattern(
file_filter: str | None, indexed_root: str | Path | None
) -> str | None:
"""Resolve relative file-filter prefixes against the indexed root path.

Keeps broad wildcard-only patterns unchanged (e.g. ``*.py`` or ``**/*.py``),
but expands relative directory-prefixed patterns (e.g. ``src/**/*.py``)
to absolute patterns rooted at ``indexed_root``.
"""
if not file_filter:
return file_filter
if indexed_root is None:
return file_filter

raw_filter = file_filter.strip()
if not raw_filter:
return raw_filter

# Already absolute (POSIX, Windows drive, or UNC path).
if raw_filter.startswith("/") or _looks_like_windows_path(raw_filter) or raw_filter.startswith("\\\\"):
return raw_filter

normalized = raw_filter.replace("\\", "/")
has_directory_prefix = "/" in normalized
starts_with_wildcard = normalized.startswith(("*", "?", "["))
if not has_directory_prefix or starts_with_wildcard:
return raw_filter

relative_filter = normalized.lstrip("./")
if not relative_filter:
return raw_filter

return str(Path(indexed_root).resolve() / Path(relative_filter))


def _apply_runtime_path_mapping(
path_str: str,
path_mappings: Sequence[RuntimePathMapping],
Expand Down
9 changes: 7 additions & 2 deletions src/aci/http_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel

from aci.core.path_utils import get_collection_name_for_path, is_system_directory
from aci.core.path_utils import (
get_collection_name_for_path,
is_system_directory,
resolve_file_filter_pattern,
)
from aci.core.watch_config import WatchConfig
from aci.infrastructure.codebase_registry import best_effort_update_registry
from aci.infrastructure.file_watcher import FileWatcher
Expand Down Expand Up @@ -330,6 +334,7 @@ async def search(
raise HTTPException(status_code=400, detail=resolution.error_message)

collection_name = resolution.collection_name
normalized_file_filter = resolve_file_filter_pattern(file_filter, resolution.indexed_root)

apply_rerank = cfg.search.use_rerank if use_rerank is None else use_rerank

Expand Down Expand Up @@ -363,7 +368,7 @@ async def search(
results = await search_service.search(
query=q,
limit=limit,
file_filter=file_filter,
file_filter=normalized_file_filter,
use_rerank=apply_rerank,
search_mode=search_mode,
collection_name=collection_name,
Expand Down
5 changes: 4 additions & 1 deletion src/aci/mcp/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from aci.core.path_utils import (
RuntimePathResolutionResult,
get_collection_name_for_path,
resolve_file_filter_pattern,
resolve_runtime_path,
validate_indexable_path,
)
Expand Down Expand Up @@ -243,14 +244,16 @@ async def _handle_search_code(arguments: dict, ctx: MCPContext) -> list[TextCont
f"Valid types: {', '.join(sorted(valid_artifact_types))}"
)]

normalized_file_filter = resolve_file_filter_pattern(file_filter, indexed_root)

# Request more results if filtering by subdirectory (to ensure enough after filtering)
fetch_limit = limit * 3 if path_prefix_filter else limit

# Pass collection_name explicitly to avoid shared state mutation
results = await search_service.search(
query=query,
limit=fetch_limit,
file_filter=file_filter,
file_filter=normalized_file_filter,
use_rerank=use_rerank,
search_mode=search_mode,
collection_name=collection_name,
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_chunker_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
get_import_registry,
)
from aci.core.file_scanner import ScannedFile
from aci.core.tokenizer import get_default_tokenizer
from aci.core.tokenizer import CharacterTokenizer


class TestCodeChunk:
Expand Down Expand Up @@ -143,7 +143,7 @@ class TestChunker:

@pytest.fixture
def tokenizer(self):
return get_default_tokenizer()
return CharacterTokenizer()

@pytest.fixture
def chunker(self, tokenizer):
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_chunker_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@
from aci.core.ast_parser import ASTNode, TreeSitterParser
from aci.core.chunker import Chunker, SmartChunkSplitter
from aci.core.file_scanner import ScannedFile
from aci.core.tokenizer import get_default_tokenizer
from aci.core.tokenizer import CharacterTokenizer


class TestSmartChunkSplitter:
"""Tests for SmartChunkSplitter implementation."""

@pytest.fixture
def tokenizer(self):
return get_default_tokenizer()
return CharacterTokenizer()

@pytest.fixture
def splitter(self, tokenizer):
Expand Down Expand Up @@ -195,7 +195,7 @@ class TestChunkerWithSmartSplitter:

@pytest.fixture
def tokenizer(self):
return get_default_tokenizer()
return CharacterTokenizer()

@pytest.fixture
def parser(self):
Expand Down
51 changes: 51 additions & 0 deletions tests/unit/test_cli_metadata_db_path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from pathlib import Path

import pytest
import typer

import aci.cli as cli


def test_project_metadata_db_path_is_scoped_to_project_root(tmp_path: Path) -> None:
    """The DB path is ``<root>/.aci/index.db`` for whatever root is passed in."""
    project_dir = tmp_path / "repo" / "subdir"
    project_dir.mkdir(parents=True)

    expected = project_dir.resolve() / ".aci" / "index.db"
    assert cli._project_metadata_db_path(project_dir) == expected


def test_index_uses_project_scoped_metadata_db_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """``index`` must hand ``get_services`` a DB path rooted at the indexed project."""
    repo_root = tmp_path / "repo"
    repo_root.mkdir()
    seen: dict[str, Path | None] = {"metadata_db_path": None}

    def stub_get_services(metadata_db_path: Path | None = None):
        # Record the argument, then abort early so no real services start.
        seen["metadata_db_path"] = metadata_db_path
        raise RuntimeError("stop")

    monkeypatch.setattr(cli, "get_services", stub_get_services)

    with pytest.raises(typer.Exit):
        cli.index(path=repo_root, workers=None)

    assert seen["metadata_db_path"] == repo_root.resolve() / ".aci" / "index.db"


def test_search_uses_explicit_path_for_project_scoped_metadata_db_path(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """``search`` with an explicit path scopes the metadata DB to that path."""
    repo_root = tmp_path / "repo"
    repo_root.mkdir()
    seen: dict[str, Path | None] = {"metadata_db_path": None}

    def stub_get_services(metadata_db_path: Path | None = None):
        # Record the argument, then abort early so no real services start.
        seen["metadata_db_path"] = metadata_db_path
        raise RuntimeError("stop")

    monkeypatch.setattr(cli, "get_services", stub_get_services)

    with pytest.raises(typer.Exit):
        cli.search(query="hello", path=repo_root)

    assert seen["metadata_db_path"] == repo_root.resolve() / ".aci" / "index.db"
3 changes: 3 additions & 0 deletions tests/unit/test_incremental_update_scopes_to_root.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import tempfile
from pathlib import Path

from aci.core.chunker import create_chunker
from aci.core.file_scanner import FileScanner
from aci.core.tokenizer import CharacterTokenizer
from aci.infrastructure.fakes import InMemoryVectorStore, LocalEmbeddingClient
from aci.infrastructure.metadata_store import IndexMetadataStore
from aci.services.indexing_service import IndexingService
Expand Down Expand Up @@ -47,6 +49,7 @@ def test_update_incremental_does_not_delete_other_repo_metadata():
vector_store=vector_store,
metadata_store=metadata_store,
file_scanner=file_scanner,
chunker=create_chunker(tokenizer=CharacterTokenizer()),
max_workers=1,
)

Expand Down
25 changes: 25 additions & 0 deletions tests/unit/test_runtime_path_resolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,28 @@ def test_mcp_index_codebase_uses_resolved_runtime_path(tmp_path: Path):
payload = json.loads(result[0].text)
assert payload["requested_path"] == r"D:\workspace"
assert payload["indexed_path"] == str(mounted_repo.resolve())


def test_resolve_file_filter_pattern_keeps_wildcard_only_pattern(tmp_path: Path):
    """A pure wildcard pattern (no directory prefix) passes through unchanged."""
    from aci.core.path_utils import resolve_file_filter_pattern

    assert resolve_file_filter_pattern("**/*.tsx", tmp_path) == "**/*.tsx"


def test_resolve_file_filter_pattern_expands_relative_prefixed_pattern(tmp_path: Path):
    """A relative directory-prefixed pattern gets rooted at the indexed root."""
    from aci.core.path_utils import resolve_file_filter_pattern

    expected = str(tmp_path.resolve() / "apps/web/**/*.tsx")
    assert resolve_file_filter_pattern("apps/web/**/*.tsx", tmp_path) == expected


def test_resolve_file_filter_pattern_keeps_absolute_pattern(tmp_path: Path):
    """An already-absolute pattern is never re-rooted, even under another root."""
    from aci.core.path_utils import resolve_file_filter_pattern

    pattern = str(tmp_path / "apps/**/*.tsx")
    assert resolve_file_filter_pattern(pattern, "/another/root") == pattern
Loading