From 934a91955672af0b5f5abb5eb4ca5bd7569cea26 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 14 Mar 2026 11:38:52 +0000
Subject: [PATCH 1/2] Initial plan


From bafcf80872d87979f9abe6a4a25bf7cec8618ee3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 14 Mar 2026 11:51:35 +0000
Subject: [PATCH 2/2] fix(tokenizer): document ACI_TOKENIZER in .env.example
 and fix offline tests

Why: Tests using TiktokenTokenizer fail when the network is unavailable
(tiktoken downloads its vocabulary from openaipublic.blob.core.windows.net on
first use). This caused 14 unit test failures in sandboxed/offline
environments. The ACI_TOKENIZER env var was also undocumented, leaving Ollama
users with no guidance on how to resolve the BPE/WordPiece mismatch.

What:
- .env.example: add ACI_TOKENIZER with documentation for Ollama/BERT users
- test_chunker_core.py: use CharacterTokenizer fixture (network-free)
- test_chunker_splitter.py: use CharacterTokenizer fixture (network-free)
- test_incremental_update_scopes_to_root.py: pass CharacterTokenizer-based
  chunker to IndexingService to avoid tiktoken download at test time

Test: uv run pytest tests/unit/ -> 226 passed, 0 failed

Co-authored-by: AperturePlus <146049978+AperturePlus@users.noreply.github.com>
---
 .env.example                                         | 8 ++++++++
 tests/unit/test_chunker_core.py                      | 4 ++--
 tests/unit/test_chunker_splitter.py                  | 6 +++---
 tests/unit/test_incremental_update_scopes_to_root.py | 3 +++
 4 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/.env.example b/.env.example
index b12e987..7c5147b 100644
--- a/.env.example
+++ b/.env.example
@@ -35,6 +35,14 @@ ACI_VECTOR_STORE_VECTOR_SIZE=1024
 ACI_INDEXING_MAX_WORKERS=8
 ACI_INDEXING_MAX_CHUNK_TOKENS=8192
 ACI_INDEXING_CHUNK_OVERLAP_LINES=2
+# Tokenizer strategy for chunk token counting.
+# Use "character" or "simple" for Ollama/BERT-based models (e.g. nomic-embed-text,
+# mxbai-embed-large): tiktoken (BPE) counts tokens differently from WordPiece,
+# which causes "input length exceeds context length" errors.
+#   tiktoken  - OpenAI BPE (default, accurate for OpenAI models)
+#   character - len(text)/4 estimate (conservative, works with any model)
+#   simple    - whitespace split (for generic non-BPE models)
+ACI_TOKENIZER=tiktoken
 # Optional comma-separated lists:
 # ACI_INDEXING_FILE_EXTENSIONS=.py,.js,.ts,.go
 # ACI_INDEXING_IGNORE_PATTERNS=__pycache__,*.pyc,.git,node_modules
diff --git a/tests/unit/test_chunker_core.py b/tests/unit/test_chunker_core.py
index 73870af..8f97e73 100644
--- a/tests/unit/test_chunker_core.py
+++ b/tests/unit/test_chunker_core.py
@@ -17,7 +17,7 @@
     get_import_registry,
 )
 from aci.core.file_scanner import ScannedFile
-from aci.core.tokenizer import get_default_tokenizer
+from aci.core.tokenizer import CharacterTokenizer
 
 
 class TestCodeChunk:
@@ -143,7 +143,7 @@ class TestChunker:
 
     @pytest.fixture
     def tokenizer(self):
-        return get_default_tokenizer()
+        return CharacterTokenizer()
 
     @pytest.fixture
     def chunker(self, tokenizer):
diff --git a/tests/unit/test_chunker_splitter.py b/tests/unit/test_chunker_splitter.py
index 09e0a00..843f8fc 100644
--- a/tests/unit/test_chunker_splitter.py
+++ b/tests/unit/test_chunker_splitter.py
@@ -7,7 +7,7 @@
 from aci.core.ast_parser import ASTNode, TreeSitterParser
 from aci.core.chunker import Chunker, SmartChunkSplitter
 from aci.core.file_scanner import ScannedFile
-from aci.core.tokenizer import get_default_tokenizer
+from aci.core.tokenizer import CharacterTokenizer
 
 
 class TestSmartChunkSplitter:
@@ -15,7 +15,7 @@ class TestSmartChunkSplitter:
 
     @pytest.fixture
     def tokenizer(self):
-        return get_default_tokenizer()
+        return CharacterTokenizer()
 
     @pytest.fixture
     def splitter(self, tokenizer):
@@ -195,7 +195,7 @@ class TestChunkerWithSmartSplitter:
 
     @pytest.fixture
     def tokenizer(self):
-        return get_default_tokenizer()
+        return CharacterTokenizer()
 
     @pytest.fixture
     def parser(self):
diff --git a/tests/unit/test_incremental_update_scopes_to_root.py b/tests/unit/test_incremental_update_scopes_to_root.py
index 1ec005b..0b97a8d 100644
--- a/tests/unit/test_incremental_update_scopes_to_root.py
+++ b/tests/unit/test_incremental_update_scopes_to_root.py
@@ -2,7 +2,9 @@
 import tempfile
 from pathlib import Path
 
+from aci.core.chunker import create_chunker
 from aci.core.file_scanner import FileScanner
+from aci.core.tokenizer import CharacterTokenizer
 from aci.infrastructure.fakes import InMemoryVectorStore, LocalEmbeddingClient
 from aci.infrastructure.metadata_store import IndexMetadataStore
 from aci.services.indexing_service import IndexingService
@@ -47,6 +49,7 @@ def test_update_incremental_does_not_delete_other_repo_metadata():
             vector_store=vector_store,
             metadata_store=metadata_store,
             file_scanner=file_scanner,
+            chunker=create_chunker(tokenizer=CharacterTokenizer()),
             max_workers=1,
         )