Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ ACI_VECTOR_STORE_VECTOR_SIZE=1024
ACI_INDEXING_MAX_WORKERS=8
ACI_INDEXING_MAX_CHUNK_TOKENS=8192
ACI_INDEXING_CHUNK_OVERLAP_LINES=2
# Tokenizer strategy used when counting tokens per chunk.
# Use "character" or "simple" for Ollama/BERT-based models (e.g. nomic-embed-text,
# mxbai-embed-large) to avoid the tiktoken/WordPiece token-count mismatch that
# causes "input length exceeds context length" errors.
# Valid values:
#   tiktoken  - OpenAI BPE (default; accurate for OpenAI models)
#   character - len(text)/4 estimate (conservative; works with any model)
#   simple    - whitespace split (for generic non-BPE models)
ACI_TOKENIZER=tiktoken
# Optional comma-separated lists:
# ACI_INDEXING_FILE_EXTENSIONS=.py,.js,.ts,.go
# ACI_INDEXING_IGNORE_PATTERNS=__pycache__,*.pyc,.git,node_modules
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_chunker_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
get_import_registry,
)
from aci.core.file_scanner import ScannedFile
from aci.core.tokenizer import get_default_tokenizer
from aci.core.tokenizer import CharacterTokenizer


class TestCodeChunk:
Expand Down Expand Up @@ -143,7 +143,7 @@ class TestChunker:

@pytest.fixture
def tokenizer(self):
return get_default_tokenizer()
return CharacterTokenizer()

@pytest.fixture
def chunker(self, tokenizer):
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_chunker_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@
from aci.core.ast_parser import ASTNode, TreeSitterParser
from aci.core.chunker import Chunker, SmartChunkSplitter
from aci.core.file_scanner import ScannedFile
from aci.core.tokenizer import get_default_tokenizer
from aci.core.tokenizer import CharacterTokenizer


class TestSmartChunkSplitter:
"""Tests for SmartChunkSplitter implementation."""

@pytest.fixture
def tokenizer(self):
return get_default_tokenizer()
return CharacterTokenizer()

@pytest.fixture
def splitter(self, tokenizer):
Expand Down Expand Up @@ -195,7 +195,7 @@ class TestChunkerWithSmartSplitter:

@pytest.fixture
def tokenizer(self):
return get_default_tokenizer()
return CharacterTokenizer()

@pytest.fixture
def parser(self):
Expand Down
3 changes: 3 additions & 0 deletions tests/unit/test_incremental_update_scopes_to_root.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import tempfile
from pathlib import Path

from aci.core.chunker import create_chunker
from aci.core.file_scanner import FileScanner
from aci.core.tokenizer import CharacterTokenizer
from aci.infrastructure.fakes import InMemoryVectorStore, LocalEmbeddingClient
from aci.infrastructure.metadata_store import IndexMetadataStore
from aci.services.indexing_service import IndexingService
Expand Down Expand Up @@ -47,6 +49,7 @@ def test_update_incremental_does_not_delete_other_repo_metadata():
vector_store=vector_store,
metadata_store=metadata_store,
file_scanner=file_scanner,
chunker=create_chunker(tokenizer=CharacterTokenizer()),
max_workers=1,
)

Expand Down
Loading