diff --git a/.env.example b/.env.example index b12e987..7c5147b 100644 --- a/.env.example +++ b/.env.example @@ -35,6 +35,14 @@ ACI_VECTOR_STORE_VECTOR_SIZE=1024 ACI_INDEXING_MAX_WORKERS=8 ACI_INDEXING_MAX_CHUNK_TOKENS=8192 ACI_INDEXING_CHUNK_OVERLAP_LINES=2 +# Tokenizer strategy for chunk token counting. +# Use "character" or "simple" for Ollama/BERT-based models (e.g. nomic-embed-text, +# mxbai-embed-large) to avoid the tiktoken/WordPiece mismatch that causes +# "input length exceeds context length" errors. +# tiktoken - OpenAI BPE (default, accurate for OpenAI models) +# character - len(text)/4 estimate (conservative, works with any model) +# simple - whitespace split (for generic non-BPE models) +ACI_TOKENIZER=tiktoken # Optional comma-separated lists: # ACI_INDEXING_FILE_EXTENSIONS=.py,.js,.ts,.go # ACI_INDEXING_IGNORE_PATTERNS=__pycache__,*.pyc,.git,node_modules diff --git a/tests/unit/test_chunker_core.py b/tests/unit/test_chunker_core.py index 73870af..8f97e73 100644 --- a/tests/unit/test_chunker_core.py +++ b/tests/unit/test_chunker_core.py @@ -17,7 +17,7 @@ get_import_registry, ) from aci.core.file_scanner import ScannedFile -from aci.core.tokenizer import get_default_tokenizer +from aci.core.tokenizer import CharacterTokenizer class TestCodeChunk: @@ -143,7 +143,7 @@ class TestChunker: @pytest.fixture def tokenizer(self): - return get_default_tokenizer() + return CharacterTokenizer() @pytest.fixture def chunker(self, tokenizer): diff --git a/tests/unit/test_chunker_splitter.py b/tests/unit/test_chunker_splitter.py index 09e0a00..843f8fc 100644 --- a/tests/unit/test_chunker_splitter.py +++ b/tests/unit/test_chunker_splitter.py @@ -7,7 +7,7 @@ from aci.core.ast_parser import ASTNode, TreeSitterParser from aci.core.chunker import Chunker, SmartChunkSplitter from aci.core.file_scanner import ScannedFile -from aci.core.tokenizer import get_default_tokenizer +from aci.core.tokenizer import CharacterTokenizer class TestSmartChunkSplitter: @@ -15,7 +15,7 @@ class TestSmartChunkSplitter: @pytest.fixture def tokenizer(self): - return get_default_tokenizer() + return CharacterTokenizer() @pytest.fixture def splitter(self, tokenizer): @@ -195,7 +195,7 @@ class TestChunkerWithSmartSplitter: @pytest.fixture def tokenizer(self): - return get_default_tokenizer() + return CharacterTokenizer() @pytest.fixture def parser(self): diff --git a/tests/unit/test_incremental_update_scopes_to_root.py b/tests/unit/test_incremental_update_scopes_to_root.py index 1ec005b..0b97a8d 100644 --- a/tests/unit/test_incremental_update_scopes_to_root.py +++ b/tests/unit/test_incremental_update_scopes_to_root.py @@ -2,7 +2,9 @@ import tempfile from pathlib import Path +from aci.core.chunker import create_chunker from aci.core.file_scanner import FileScanner +from aci.core.tokenizer import CharacterTokenizer from aci.infrastructure.fakes import InMemoryVectorStore, LocalEmbeddingClient from aci.infrastructure.metadata_store import IndexMetadataStore from aci.services.indexing_service import IndexingService @@ -47,6 +49,7 @@ def test_update_incremental_does_not_delete_other_repo_metadata(): vector_store=vector_store, metadata_store=metadata_store, file_scanner=file_scanner, + chunker=create_chunker(tokenizer=CharacterTokenizer()), max_workers=1, )