From 934a91955672af0b5f5abb5eb4ca5bd7569cea26 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 11:38:52 +0000 Subject: [PATCH 1/2] Initial plan From bafcf80872d87979f9abe6a4a25bf7cec8618ee3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 11:51:35 +0000 Subject: [PATCH 2/2] fix(tokenizer): document ACI_TOKENIZER in .env.example and fix offline tests Why: Tests using TiktokenTokenizer fail when network is unavailable (tiktoken downloads its vocabulary from openaipublic.blob.core.windows.net on first use). This caused 14 unit test failures in sandboxed/offline environments. The ACI_TOKENIZER env var was also undocumented, leaving Ollama users with no guidance on how to resolve the BPE/WordPiece mismatch. What: - .env.example: add ACI_TOKENIZER with documentation for Ollama/BERT users - test_chunker_core.py: use CharacterTokenizer fixture (network-free) - test_chunker_splitter.py: use CharacterTokenizer fixture (network-free) - test_incremental_update_scopes_to_root.py: pass CharacterTokenizer-based chunker to IndexingService to avoid tiktoken download at test time Test: uv run pytest tests/unit/ -> 226 passed, 0 failed Co-authored-by: AperturePlus <146049978+AperturePlus@users.noreply.github.com> --- .env.example | 8 ++++++++ tests/unit/test_chunker_core.py | 4 ++-- tests/unit/test_chunker_splitter.py | 6 +++--- tests/unit/test_incremental_update_scopes_to_root.py | 3 +++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/.env.example b/.env.example index b12e987..7c5147b 100644 --- a/.env.example +++ b/.env.example @@ -35,6 +35,14 @@ ACI_VECTOR_STORE_VECTOR_SIZE=1024 ACI_INDEXING_MAX_WORKERS=8 ACI_INDEXING_MAX_CHUNK_TOKENS=8192 ACI_INDEXING_CHUNK_OVERLAP_LINES=2 +# Tokenizer strategy for chunk token counting. +# Use "character" or "simple" for Ollama/BERT-based models (e.g. nomic-embed-text, +# mxbai-embed-large) to avoid the tiktoken/WordPiece mismatch that causes +# "input length exceeds context length" errors. +# tiktoken - OpenAI BPE (default, accurate for OpenAI models) +# character - len(text)/4 estimate (conservative, works with any model) +# simple - whitespace split (for generic non-BPE models) +ACI_TOKENIZER=tiktoken # Optional comma-separated lists: # ACI_INDEXING_FILE_EXTENSIONS=.py,.js,.ts,.go # ACI_INDEXING_IGNORE_PATTERNS=__pycache__,*.pyc,.git,node_modules diff --git a/tests/unit/test_chunker_core.py b/tests/unit/test_chunker_core.py index 73870af..8f97e73 100644 --- a/tests/unit/test_chunker_core.py +++ b/tests/unit/test_chunker_core.py @@ -17,7 +17,7 @@ get_import_registry, ) from aci.core.file_scanner import ScannedFile -from aci.core.tokenizer import get_default_tokenizer +from aci.core.tokenizer import CharacterTokenizer class TestCodeChunk: @@ -143,7 +143,7 @@ class TestChunker: @pytest.fixture def tokenizer(self): - return get_default_tokenizer() + return CharacterTokenizer() @pytest.fixture def chunker(self, tokenizer): diff --git a/tests/unit/test_chunker_splitter.py b/tests/unit/test_chunker_splitter.py index 09e0a00..843f8fc 100644 --- a/tests/unit/test_chunker_splitter.py +++ b/tests/unit/test_chunker_splitter.py @@ -7,7 +7,7 @@ from aci.core.ast_parser import ASTNode, TreeSitterParser from aci.core.chunker import Chunker, SmartChunkSplitter from aci.core.file_scanner import ScannedFile -from aci.core.tokenizer import get_default_tokenizer +from aci.core.tokenizer import CharacterTokenizer class TestSmartChunkSplitter: @@ -15,7 +15,7 @@ class TestSmartChunkSplitter: @pytest.fixture def tokenizer(self): - return get_default_tokenizer() + return CharacterTokenizer() @pytest.fixture def splitter(self, tokenizer): @@ -195,7 +195,7 @@ class TestChunkerWithSmartSplitter: @pytest.fixture def tokenizer(self): - return get_default_tokenizer() + return CharacterTokenizer() @pytest.fixture def parser(self): diff --git a/tests/unit/test_incremental_update_scopes_to_root.py b/tests/unit/test_incremental_update_scopes_to_root.py index 1ec005b..0b97a8d 100644 --- a/tests/unit/test_incremental_update_scopes_to_root.py +++ b/tests/unit/test_incremental_update_scopes_to_root.py @@ -2,7 +2,9 @@ import tempfile from pathlib import Path +from aci.core.chunker import create_chunker from aci.core.file_scanner import FileScanner +from aci.core.tokenizer import CharacterTokenizer from aci.infrastructure.fakes import InMemoryVectorStore, LocalEmbeddingClient from aci.infrastructure.metadata_store import IndexMetadataStore from aci.services.indexing_service import IndexingService @@ -47,6 +49,7 @@ def test_update_incremental_does_not_delete_other_repo_metadata(): vector_store=vector_store, metadata_store=metadata_store, file_scanner=file_scanner, + chunker=create_chunker(tokenizer=CharacterTokenizer()), max_workers=1, )