diff --git a/pyproject.toml b/pyproject.toml
index 30576ed0..d4c1b924 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ mistralai = ["mistralai>=1.0.0"]
 openai = ["openai>=1.1.0"]
 nltk = ["nltk>=3.8.1,<4"]
 cohere = ["cohere>=4.44"]
-voyageai = ["voyageai>=0.2.2"]
+voyageai = ["voyageai>=0.3.5"]
 sentence-transformers = ["sentence-transformers>=3.4.0,<4"]
 langcache = ["langcache>=0.9.0"]
 vertexai = [
diff --git a/redisvl/utils/vectorize/text/voyageai.py b/redisvl/utils/vectorize/text/voyageai.py
index 1936bd97..f3c26bc5 100644
--- a/redisvl/utils/vectorize/text/voyageai.py
+++ b/redisvl/utils/vectorize/text/voyageai.py
@@ -14,12 +14,33 @@
 # ignore that voyageai isn't imported
 # mypy: disable-error-code="name-defined"

+# Token limits for different VoyageAI models
+VOYAGE_TOTAL_TOKEN_LIMITS = {
+    "voyage-context-3": 32_000,
+    "voyage-3.5-lite": 1_000_000,
+    "voyage-3.5": 320_000,
+    "voyage-2": 320_000,
+    "voyage-3-large": 120_000,
+    "voyage-code-3": 120_000,
+    "voyage-large-2-instruct": 120_000,
+    "voyage-finance-2": 120_000,
+    "voyage-multilingual-2": 120_000,
+    "voyage-law-2": 120_000,
+    "voyage-large-2": 120_000,
+    "voyage-3": 120_000,
+    "voyage-3-lite": 120_000,
+    "voyage-code-2": 120_000,
+    "voyage-3-m-exp": 120_000,
+    "voyage-multimodal-3": 120_000,
+}
+

 class VoyageAITextVectorizer(BaseVectorizer):
     """The VoyageAITextVectorizer class utilizes VoyageAI's API to generate
     embeddings for text data.

-    This vectorizer is designed to interact with VoyageAI's /embed API,
+    This vectorizer is designed to interact with VoyageAI's /embed API and
+    /contextualized_embed API (for context models like voyage-context-3),
     requiring an API key for authentication. The key can be provided
     directly in the `api_config` dictionary or through the `VOYAGE_API_KEY`
     environment variable. User must obtain an API key from VoyageAI's website
@@ -27,10 +48,13 @@ class VoyageAITextVectorizer(BaseVectorizer):
     client must be installed with `pip install voyageai`.

     The vectorizer supports both synchronous and asynchronous operations, allows for batch
-    processing of texts and flexibility in handling preprocessing tasks.
+    processing of texts and flexibility in handling preprocessing tasks. It automatically
+    detects and handles contextualized embedding models (like voyage-context-3) which
+    generate embeddings that are aware of the surrounding context within a document.

     You can optionally enable caching to improve performance when generating
-    embeddings for repeated text inputs.
+    embeddings for repeated text inputs. The vectorizer also provides token counting
+    capabilities to help manage API usage and optimize batching strategies.

    .. code-block:: python

@@ -38,7 +62,7 @@ class VoyageAITextVectorizer(BaseVectorizer):

        # Basic usage
        vectorizer = VoyageAITextVectorizer(
-            model="voyage-large-2",
+            model="voyage-3.5",
            api_config={"api_key": "your-voyageai-api-key"} # OR set VOYAGE_API_KEY in your env
        )
        query_embedding = vectorizer.embed(
@@ -55,7 +79,7 @@
        cache = EmbeddingsCache(name="voyageai_embeddings_cache")

        vectorizer = VoyageAITextVectorizer(
-            model="voyage-large-2",
+            model="voyage-3.5",
            api_config={"api_key": "your-voyageai-api-key"},
            cache=cache
        )
@@ -72,13 +96,30 @@
            input_type="query"
        )

+
+        # Using contextualized embeddings (voyage-context-3)
+        context_vectorizer = VoyageAITextVectorizer(
+            model="voyage-context-3",
+            api_config={"api_key": "your-voyageai-api-key"}
+        )
+        # Context models automatically use contextualized_embed API
+        # which generates context-aware embeddings for document chunks
+        context_embeddings = context_vectorizer.embed_many(
+            texts=["chunk 1 of document", "chunk 2 of document", "chunk 3 of document"],
+            input_type="document"
+        )
+
+        # Token counting for API usage management
+        token_counts = vectorizer.count_tokens(["text one", "text two"])
+        print(f"Token counts: {token_counts}")
+        print(f"Model token limit: {VOYAGE_TOTAL_TOKEN_LIMITS.get(vectorizer.model, 120_000)}")
+
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(
        self,
-        model: str = "voyage-large-2",
+        model: str,
        api_config: Optional[Dict] = None,
        dtype: str = "float32",
        cache: Optional["EmbeddingsCache"] = None,
@@ -89,7 +130,7 @@ def __init__(
        Visit https://docs.voyageai.com/docs/embeddings to learn about embeddings and check the available models.

        Args:
-            model (str): Model to use for embedding. Defaults to "voyage-large-2".
+            model (str): Model to use for embedding (e.g., "voyage-3.5", "voyage-context-3").
            api_config (Optional[Dict], optional): Dictionary containing the API key.
                Defaults to None.
            dtype (str): the default datatype to use when embedding text as byte arrays.
@@ -172,22 +213,6 @@ def _set_model_dims(self) -> int:
            # fall back (TODO get more specific)
            raise ValueError(f"Error setting embedding model dimensions: {str(e)}")

-    def _get_batch_size(self) -> int:
-        """
-        Determine the appropriate batch size based on the model being used.
-
-        Returns:
-            int: Recommended batch size for the current model
-        """
-        if self.model in ["voyage-2", "voyage-02"]:
-            return 72
-        elif self.model in ["voyage-3-lite", "voyage-3.5-lite"]:
-            return 30
-        elif self.model in ["voyage-3", "voyage-3.5"]:
-            return 10
-        else:
-            return 7  # Default for other models
-
    def _validate_input(
        self, texts: List[str], input_type: Optional[str], truncation: Optional[bool]
    ):
@@ -244,10 +269,12 @@
        """
        Generate vector embeddings for a batch of texts using the VoyageAI API.

+        Uses token-aware batching to respect model token limits and optimize API calls.
+
        Args:
            texts: List of texts to embed
-            batch_size: Number of texts to process in each API call
-            **kwargs: Additional parameters to pass to the VoyageAI API
+            batch_size: Deprecated. Token-aware batching is now always used.
+            **kwargs: Additional parameters to pass to the VoyageAI API.

        Returns:
            List[List[float]]: List of vector embeddings as lists of floats
@@ -262,21 +289,35 @@
        # Validate inputs
        self._validate_input(texts, input_type, truncation)

-        # Determine batch size if not provided
-        if batch_size is None:
-            batch_size = self._get_batch_size()
+        # Use token-aware batching
+        batches = self._build_token_aware_batches(texts)

        try:
            embeddings: List = []
-            for batch in self.batchify(texts, batch_size):
-                response = self._client.embed(
-                    texts=batch,
-                    model=self.model,
-                    input_type=input_type,
-                    truncation=truncation,
-                    **kwargs,
-                )
-                embeddings.extend(response.embeddings)
+
+            # Use contextualized embed API for context models
+            if self._is_context_model():
+                for batch in batches:
+                    # Context models expect inputs as a list of lists
+                    response = self._client.contextualized_embed(
+                        inputs=[batch],
+                        model=self.model,
+                        input_type=input_type,
+                        **kwargs,
+                    )
+                    # Extract embeddings from the first (and only) result
+                    embeddings.extend(response.results[0].embeddings)
+            else:
+                # Use regular embed API for standard models
+                for batch in batches:
+                    response = self._client.embed(
+                        texts=batch,
+                        model=self.model,
+                        input_type=input_type,
+                        truncation=truncation,  # type: ignore[assignment]
+                        **kwargs,
+                    )
+                    embeddings.extend(response.embeddings)  # type: ignore[attr-defined]
            return embeddings
        except Exception as e:
            raise ValueError(f"Embedding texts failed: {e}")
@@ -311,10 +352,12 @@
        """
        Asynchronously generate vector embeddings for a batch of texts using the VoyageAI API.

+        Uses token-aware batching to respect model token limits and optimize API calls.
+
        Args:
            texts: List of texts to embed
-            batch_size: Number of texts to process in each API call
-            **kwargs: Additional parameters to pass to the VoyageAI API
+            batch_size: Deprecated. Token-aware batching is now always used.
+            **kwargs: Additional parameters to pass to the VoyageAI API.

        Returns:
            List[List[float]]: List of vector embeddings as lists of floats
@@ -329,25 +372,135 @@
        # Validate inputs
        self._validate_input(texts, input_type, truncation)

-        # Determine batch size if not provided
-        if batch_size is None:
-            batch_size = self._get_batch_size()
+        # Use token-aware batching (synchronous - tokenization is sync-only)
+        batches = self._build_token_aware_batches(texts)

        try:
            embeddings: List = []
-            for batch in self.batchify(texts, batch_size):
-                response = await self._aclient.embed(
-                    texts=batch,
-                    model=self.model,
-                    input_type=input_type,
-                    truncation=truncation,
-                    **kwargs,
-                )
-                embeddings.extend(response.embeddings)
+
+            # Use contextualized embed API for context models
+            if self._is_context_model():
+                for batch in batches:
+                    # Context models expect inputs as a list of lists
+                    response = await self._aclient.contextualized_embed(
+                        inputs=[batch],
+                        model=self.model,
+                        input_type=input_type,
+                        **kwargs,
+                    )
+                    # Extract embeddings from the first (and only) result
+                    embeddings.extend(response.results[0].embeddings)
+            else:
+                # Use regular embed API for standard models
+                for batch in batches:
+                    response = await self._aclient.embed(
+                        texts=batch,
+                        model=self.model,
+                        input_type=input_type,
+                        truncation=truncation,  # type: ignore[assignment]
+                        **kwargs,
+                    )
+                    embeddings.extend(response.embeddings)  # type: ignore[attr-defined]
            return embeddings
        except Exception as e:
            raise ValueError(f"Embedding texts failed: {e}")

+    def count_tokens(self, texts: List[str]) -> List[int]:
+        """
+        Count tokens for the given texts using VoyageAI's tokenization API.
+
+        Args:
+            texts: List of texts to count tokens for.
+
+        Returns:
+            List[int]: List of token counts for each text.
+
+        Raises:
+            ValueError: If tokenization fails.
+
+        Example:
+            >>> vectorizer = VoyageAITextVectorizer(model="voyage-3.5")
+            >>> token_counts = vectorizer.count_tokens(["Hello world", "Another text"])
+            >>> print(token_counts)  # [2, 2]
+        """
+        if not texts:
+            return []
+
+        try:
+            # Use the VoyageAI tokenize API to get token counts
+            token_lists = self._client.tokenize(texts, model=self.model)
+            return [len(token_list) for token_list in token_lists]
+        except Exception as e:
+            raise ValueError(f"Token counting failed: {e}")
+
+    def _is_context_model(self) -> bool:
+        """
+        Check if the current model is a contextualized embedding model.
+
+        Contextualized models (like voyage-context-3) use a different API
+        endpoint and expect inputs formatted differently.
+
+        Returns:
+            bool: True if the model is a context model, False otherwise.
+        """
+        return "context" in self.model
+
+    def _build_token_aware_batches(
+        self, texts: List[str], max_batch_size: int = 1000
+    ) -> List[List[str]]:
+        """
+        Generate batches of texts based on token limits and batch size constraints.
+
+        This method uses VoyageAI's tokenization API to count tokens for all texts
+        in a single call, then creates batches that respect both the model's token
+        limit and a maximum batch size.
+
+        Args:
+            texts: List of texts to batch.
+            max_batch_size: Maximum number of texts per batch (default: 1000).
+
+        Returns:
+            List[List[str]]: List of batches, where each batch is a list of texts.
+
+        Raises:
+            ValueError: If tokenization fails.
+        """
+        if not texts:
+            return []
+
+        max_tokens_per_batch = VOYAGE_TOTAL_TOKEN_LIMITS.get(self.model, 120_000)
+        batches = []
+        current_batch: List[str] = []
+        current_batch_tokens = 0
+
+        # Tokenize all texts in one API call for efficiency
+        try:
+            token_counts = self.count_tokens(texts)
+        except Exception as e:
+            raise ValueError(f"Failed to count tokens for batching: {e}")
+
+        for i, text in enumerate(texts):
+            n_tokens = token_counts[i]
+
+            # Check if adding this text would exceed limits
+            if current_batch and (
+                len(current_batch) >= max_batch_size
+                or (current_batch_tokens + n_tokens > max_tokens_per_batch)
+            ):
+                # Save the current batch and start a new one
+                batches.append(current_batch)
+                current_batch = []
+                current_batch_tokens = 0
+
+            current_batch.append(text)
+            current_batch_tokens += n_tokens
+
+        # Add the last batch if it has any texts
+        if current_batch:
+            batches.append(current_batch)
+
+        return batches
+
    @property
    def type(self) -> str:
        return "voyageai"
diff --git a/tests/integration/test_rerankers.py b/tests/integration/test_rerankers.py
index caee5a47..a6904af2 100644
--- a/tests/integration/test_rerankers.py
+++ b/tests/integration/test_rerankers.py
@@ -13,14 +13,15 @@
 @pytest.fixture(
     params=[
         CohereReranker,
-        VoyageAIReranker,
+        (VoyageAIReranker, "rerank-lite-1"),
+        (VoyageAIReranker, "rerank-2.5"),
     ]
 )
 def reranker(request):
     if request.param == CohereReranker:
         return request.param()
-    elif request.param == VoyageAIReranker:
-        return request.param(model="rerank-lite-1")
+    elif isinstance(request.param, tuple) and request.param[0] == VoyageAIReranker:
+        return request.param[0](model=request.param[1])


 @pytest.fixture
diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py
index d5727664..d35a15f8 100644
--- a/tests/integration/test_vectorizers.py
+++ b/tests/integration/test_vectorizers.py
@@ -62,7 +62,7 @@ def vectorizer(request):
     elif request.param == MistralAITextVectorizer:
         return request.param()
     elif request.param == VoyageAITextVectorizer:
-        return request.param(model="voyage-large-2")
+        return request.param(model="voyage-3.5")
     elif request.param == AzureOpenAITextVectorizer:
         return request.param(
             model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "text-embedding-ada-002")
@@ -437,6 +437,8 @@ def test_default_dtype(vectorizer_):
         vectorizer = vectorizer_(
             model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "text-embedding-ada-002")
         )
+    elif issubclass(vectorizer_, VoyageAITextVectorizer):
+        vectorizer = vectorizer_(model="voyage-3.5")
     else:
         vectorizer = vectorizer_()

@@ -470,6 +472,8 @@ def test_vectorizer_dtype_assignment(vectorizer_):
             ),
             dtype=dtype,
         )
+    elif issubclass(vectorizer_, VoyageAITextVectorizer):
+        vectorizer = vectorizer_(model="voyage-3.5", dtype=dtype)
     else:
         vectorizer = vectorizer_(dtype=dtype)

@@ -491,14 +495,24 @@
     ],
 )
 def test_non_supported_dtypes(vectorizer_):
-    with pytest.raises(ValueError):
-        vectorizer_(dtype="float25")
+    if issubclass(vectorizer_, VoyageAITextVectorizer):
+        with pytest.raises(ValueError):
+            vectorizer_(model="voyage-3.5", dtype="float25")

-    with pytest.raises(ValueError):
-        vectorizer_(dtype=7)
+        with pytest.raises(ValueError):
+            vectorizer_(model="voyage-3.5", dtype=7)

-    with pytest.raises(ValueError):
-        vectorizer_(dtype=None)
+        with pytest.raises(ValueError):
+            vectorizer_(model="voyage-3.5", dtype=2.3)
+    else:
+        with pytest.raises(ValueError):
+            vectorizer_(dtype="float25")
+
+        with pytest.raises(ValueError):
+            vectorizer_(dtype=7)
+
+        with pytest.raises(ValueError):
+            vectorizer_(dtype=None)


 @pytest.mark.requires_api_keys
@@ -623,3 +637,120 @@ def test_cohere_embedding_types_warning():
     )
     assert isinstance(embeddings, list)
     assert len(embeddings) == len(texts)
+
+
+# VoyageAI-specific tests
+@pytest.mark.requires_api_keys
+@pytest.mark.parametrize("model", ["voyage-3.5", "voyage-context-3"])
+def test_voyageai_count_tokens(model):
+    """Test VoyageAI token counting functionality."""
+    vectorizer = VoyageAITextVectorizer(model=model)
+    texts = ["Hello world", "This is a longer test sentence."]
+
+    # Test count_tokens method
+    token_counts = vectorizer.count_tokens(texts)
+    assert isinstance(token_counts, list)
+    assert len(token_counts) == len(texts)
+    assert all(isinstance(count, int) and count > 0 for count in token_counts)
+
+
+def test_voyageai_token_limits():
+    """Test VoyageAI token limit constants for different models."""
+    # Test token limits using the dictionary (no API calls needed)
+    from redisvl.utils.vectorize.text.voyageai import VOYAGE_TOTAL_TOKEN_LIMITS
+
+    # Test that the constants are defined correctly
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-context-3") == 32_000
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-3.5-lite") == 1_000_000
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-3.5") == 320_000
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-2") == 320_000
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-large-2") == 120_000
+
+    # Test that default value is returned for unknown models
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("unknown-model", 120_000) == 120_000
+
+
+def test_voyageai_context_model_detection():
+    """Test detection of contextualized embedding models."""
+    # Test the detection method directly (no API calls needed)
+    vectorizer_regular = VoyageAITextVectorizer(model="voyage-3.5")
+    assert vectorizer_regular._is_context_model() is False
+
+    # Test that the method would detect context models
+    # by checking the logic (model name contains "context")
+    assert "context" not in "voyage-3.5"
+    assert "context" in "voyage-context-3"
+
+
+@pytest.mark.requires_api_keys
+def test_voyageai_context_model_embed():
+    """Test embedding with contextualized model (voyage-context-3)."""
+    vectorizer = VoyageAITextVectorizer(model="voyage-context-3")
+    texts = TEST_TEXTS
+
+    # Test embedding with context model
+    embeddings = vectorizer.embed_many(texts, input_type="document")
+    assert isinstance(embeddings, list)
+    assert len(embeddings) == len(texts)
+    assert all(
+        isinstance(emb, list) and len(emb) == vectorizer.dims for emb in embeddings
+    )
+
+
+@pytest.mark.requires_api_keys
+@pytest.mark.asyncio
+async def test_voyageai_context_model_aembed():
+    """Test async embedding with contextualized model (voyage-context-3)."""
+    vectorizer = VoyageAITextVectorizer(model="voyage-context-3")
+    texts = TEST_TEXTS
+
+    # Test async embedding with context model
+    embeddings = await vectorizer.aembed_many(texts, input_type="document")
+    assert isinstance(embeddings, list)
+    assert len(embeddings) == len(texts)
+    assert all(
+        isinstance(emb, list) and len(emb) == vectorizer.dims for emb in embeddings
+    )
+
+
+@pytest.mark.requires_api_keys
+@pytest.mark.parametrize("model", ["voyage-3.5", "voyage-context-3"])
+def test_voyageai_batching(model):
+    """Test batching with varying text lengths (uses automatic token-aware batching)."""
+    vectorizer = VoyageAITextVectorizer(model=model)
+    # Create texts with varying lengths
+    texts = [
+        "Short text.",
+        "This is a medium length text that has more words.",
+        "This is a much longer text that contains significantly more content and should take up more tokens in the batch.",
+    ] * 3
+
+    # Token-aware batching is now always used
+    embeddings = vectorizer.embed_many(texts, input_type="document")
+    assert isinstance(embeddings, list)
+    assert len(embeddings) == len(texts)
+    assert all(
+        isinstance(emb, list) and len(emb) == vectorizer.dims for emb in embeddings
+    )
+
+
+@pytest.mark.requires_api_keys
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", ["voyage-3.5", "voyage-context-3"])
+async def test_voyageai_batching_async(model):
+    """Test async batching with varying text lengths (uses automatic token-aware batching)."""
+    vectorizer = VoyageAITextVectorizer(model=model)
+    # Create texts with varying lengths
+    texts = [
+        "Short text.",
+        "This is a medium length text that has more words.",
+        "This is a much longer text that contains significantly more content and should take up more tokens in the batch.",
+    ] * 3
+
+    # Token-aware batching is now always used
+    embeddings = await vectorizer.aembed_many(texts, input_type="document")
+    assert isinstance(embeddings, list)
+    assert len(embeddings) == len(texts)
+    assert all(
+        isinstance(emb, list) and len(emb) == vectorizer.dims for emb in embeddings
+    )
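---

Reviewer note: the new `_build_token_aware_batches` helper replaces the removed
`_get_batch_size` heuristics with greedy packing against a per-request token
budget. A minimal standalone sketch of that strategy, with a hypothetical
`count_tokens_fn` standing in for the vectorizer's `count_tokens` call and the
same defaults as the patch:

from typing import Callable, List

def pack_by_tokens(
    texts: List[str],
    count_tokens_fn: Callable[[List[str]], List[int]],  # e.g. one tokenize API call
    max_tokens_per_batch: int = 120_000,  # fallback budget, as in the patch
    max_batch_size: int = 1000,
) -> List[List[str]]:
    """Greedily pack texts into batches under a token budget and a size cap."""
    batches: List[List[str]] = []
    current: List[str] = []
    current_tokens = 0

    # Count tokens for all texts up front (the patch does this in one API call).
    for text, n_tokens in zip(texts, count_tokens_fn(texts)):
        # Flush the current batch when adding this text would break either limit.
        if current and (
            len(current) >= max_batch_size
            or current_tokens + n_tokens > max_tokens_per_batch
        ):
            batches.append(current)
            current, current_tokens = [], 0
        current.append(text)
        current_tokens += n_tokens

    if current:
        batches.append(current)
    return batches

# Toy run with whitespace "tokens" and a 4-token budget:
#   pack_by_tokens(["a b", "c d e", "f"], lambda ts: [len(t.split()) for t in ts], 4)
#   -> [['a b'], ['c d e', 'f']]

One edge case worth noting: a single text whose token count already exceeds
max_tokens_per_batch still lands in a batch of its own. The helper never splits
or truncates a text, so truncation stays the API's responsibility.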
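From the caller's side the model families look identical. A hedged usage sketch
based on the docstring examples above (it assumes VOYAGE_API_KEY is set in the
environment and redisvl is installed with the voyageai extra):

from redisvl.utils.vectorize import VoyageAITextVectorizer

chunks = ["chunk 1 of document", "chunk 2 of document"]

# Standard model: embed_many goes through the /embed endpoint.
flat = VoyageAITextVectorizer(model="voyage-3.5")
flat_vecs = flat.embed_many(chunks, input_type="document")

# Context model: _is_context_model() is True ("context" is in the model name),
# so embed_many routes to contextualized_embed with inputs=[batch]; each
# chunk's vector then reflects the other chunks sent in the same batch.
ctx = VoyageAITextVectorizer(model="voyage-context-3")
ctx_vecs = ctx.embed_many(chunks, input_type="document")

assert len(flat_vecs) == len(ctx_vecs) == len(chunks)

One consequence of this routing worth flagging in review: for context models
each token-aware batch is sent as a single document (inputs=[batch]), so batch
boundaries decide which chunks share context with each other.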
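Callers who manage batching themselves can also pre-check a corpus against the
per-request budget before embedding. A small sketch combining the new
count_tokens method with VOYAGE_TOTAL_TOKEN_LIMITS, using the same 120_000
fallback the helper uses for unknown models:

from redisvl.utils.vectorize import VoyageAITextVectorizer
from redisvl.utils.vectorize.text.voyageai import VOYAGE_TOTAL_TOKEN_LIMITS

vectorizer = VoyageAITextVectorizer(model="voyage-3.5")
texts = ["Hello world", "Another text"]

counts = vectorizer.count_tokens(texts)  # one tokenize API call for all texts
budget = VOYAGE_TOTAL_TOKEN_LIMITS.get(vectorizer.model, 120_000)

if sum(counts) > budget:
    # embed_many would split this corpus across multiple API calls anyway;
    # the check is only useful for estimating how many calls to expect.
    print(f"{sum(counts)} tokens exceeds the {budget}-token per-request limit")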