From 1cb4b0f30643f6d708bfc0861ed9693eaac3178f Mon Sep 17 00:00:00 2001 From: leonardobaggio Date: Thu, 17 Oct 2024 17:07:03 -0300 Subject: [PATCH 1/4] feat: Add AzureOpenAI support --- src/openparse/processing/__init__.py | 3 +- src/openparse/processing/ingest.py | 18 ++++- .../processing/semantic_transforms.py | 81 ++++++++++++++++++- 3 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/openparse/processing/__init__.py b/src/openparse/processing/__init__.py index 8375714..21add97 100644 --- a/src/openparse/processing/__init__.py +++ b/src/openparse/processing/__init__.py @@ -15,7 +15,7 @@ NoOpIngestionPipeline, SemanticIngestionPipeline, ) -from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings +from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings, AzureOpenAIEmbeddings __all__ = [ "ProcessingStep", @@ -33,4 +33,5 @@ "RemoveNodesBelowNTokens", "CombineNodesSemantically", "OpenAIEmbeddings", + "AzureOpenAIEmbeddings", ] diff --git a/src/openparse/processing/ingest.py b/src/openparse/processing/ingest.py index a08f84b..3cc5b5e 100644 --- a/src/openparse/processing/ingest.py +++ b/src/openparse/processing/ingest.py @@ -17,6 +17,7 @@ CombineNodesSemantically, EmbeddingModel, OpenAIEmbeddings, + AzureOpenAIEmbeddings ) from openparse.schemas import Node @@ -97,12 +98,25 @@ class SemanticIngestionPipeline(IngestionPipeline): def __init__( self, - openai_api_key: str, + api_key: str, + api_endpoint: str, + azure_deployment: str, + api_version: str = "2024-02-15-preview", model: EmbeddingModel = "text-embedding-3-large", min_tokens: int = consts.TOKENIZATION_LOWER_LIMIT, max_tokens: int = consts.TOKENIZATION_UPPER_LIMIT, ) -> None: - embedding_client = OpenAIEmbeddings(api_key=openai_api_key, model=model) + # if an api endpoint is provided, use AzureOpenAIEmbeddings + if api_endpoint is not None: + embedding_client = AzureOpenAIEmbeddings( + model=model, + api_key=api_key, + azure_endpoint=api_endpoint, + azure_deployment=azure_deployment, + api_version=api_version + ) + else: + embedding_client = OpenAIEmbeddings(api_key=api_key, model=model) self.transformations = [ RemoveTextInsideTables(), diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py index 8369035..81e4a18 100644 --- a/src/openparse/processing/semantic_transforms.py +++ b/src/openparse/processing/semantic_transforms.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from typing import List, Literal, Union import numpy as np @@ -14,10 +15,27 @@ def cosine_similarity( a: Union[np.ndarray, List[float]], b: Union[np.ndarray, List[float]] ) -> float: + """ + Calculate the cosine similarity between two vectors. + + Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. + + Parameters: + a (Union[np.ndarray, List[float]]): The first vector. + b (Union[np.ndarray, List[float]]): The second vector. + + Returns: + float: The cosine similarity between vector `a` and vector `b`. The value ranges from -1 meaning exactly opposite, to 1 meaning exactly the same, with 0 usually indicating orthogonality (independence), and in-between values indicating intermediate similarity or dissimilarity. + """ return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) -class OpenAIEmbeddings: +class BaseEmbeddings(ABC): + @abstractmethod + def embed_many(self, texts: List[str]) -> List[List[float]]: + pass + +class OpenAIEmbeddings(BaseEmbeddings): def __init__( self, model: EmbeddingModel, @@ -68,7 +86,66 @@ def _create_client(self): ) from err return OpenAI(api_key=self.api_key) +class AzureOpenAIEmbeddings(BaseEmbeddings): + def __init__( + self, + model: EmbeddingModel, + api_key: str, + api_endpoint: str, + api_version: str, + deployment: str, + batch_size: int = 256, + ): + """ + Used to generate embeddings for Nodes. + + Args: + model (str): The embedding model to use. + api_key (str): Your Azure OpenAI API key. + api_endpoint (str): The Azure endpoint to use. + api_version (str): The version of the API to use. + deployment (str): The deployment to use. + batch_size (int): The number of texts to process in each api call. + """ + self.api_key = api_key + self.api_endpoint = api_endpoint + self.api_version = api_version + self.deployment = deployment + self.model = model + self.batch_size = batch_size + self.client = self._create_client() + def embed_many(self, texts: List[str]) -> List[List[float]]: + """ + Generate embeddings for a list of texts in batches. + + Args: + texts (list[str]): The list of texts to embed. + batch_size (int): The number of texts to process in each batch. + + Returns: + List[List[float]]: A list of embeddings. + """ + res = [] + for i in range(0, len(texts), self.batch_size): + batch_texts = texts[i : i + self.batch_size] + api_resp = self.client.embeddings.create( + input=batch_texts, model=self.deployment + ) + batch_res = [val.embedding for val in api_resp.data] + res.extend(batch_res) + + return res + + def _create_client(self): + try: + from openai import AzureOpenAI + except ImportError as err: + raise ImportError( + "You need to install the openai package to use this feature." + ) from err + return AzureOpenAI(api_key=self.api_key, azure_endpoint=self.api_endpoint, azure_deployment=self.deployment, api_version=self.api_version) + class CombineNodesSemantically(ProcessingStep): """ Combines nodes that are semantically related. @@ -76,7 +153,7 @@ class CombineNodesSemantically(ProcessingStep): def __init__( self, - embedding_client: OpenAIEmbeddings, + embedding_client: BaseEmbeddings, min_similarity: float, max_tokens: int, ): From e99dd95cb40052d03ab8fea1017da744768a1dca Mon Sep 17 00:00:00 2001 From: leonardobaggio Date: Thu, 17 Oct 2024 17:17:41 -0300 Subject: [PATCH 2/4] refactor: Remove unused model parameter in AzureOpenAIEmbeddings constructor --- src/openparse/processing/semantic_transforms.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py index 81e4a18..d091c8b 100644 --- a/src/openparse/processing/semantic_transforms.py +++ b/src/openparse/processing/semantic_transforms.py @@ -89,7 +89,6 @@ def _create_client(self): class AzureOpenAIEmbeddings(BaseEmbeddings): def __init__( self, - model: EmbeddingModel, api_key: str, api_endpoint: str, api_version: str, @@ -111,7 +110,6 @@ def __init__( self.api_endpoint = api_endpoint self.api_version = api_version self.deployment = deployment - self.model = model self.batch_size = batch_size self.client = self._create_client() From f2957461f9dbf186fff45eed7f2d696e690a136f Mon Sep 17 00:00:00 2001 From: leonardobaggio Date: Thu, 17 Oct 2024 18:34:09 -0300 Subject: [PATCH 3/4] refactor: removing unused model in SemanticIngestionPipeline and renaming azure_deployment,azure_endpoint => deployment,api_endpoint for consistency sake --- src/openparse/processing/ingest.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/openparse/processing/ingest.py b/src/openparse/processing/ingest.py index 3cc5b5e..2e28161 100644 --- a/src/openparse/processing/ingest.py +++ b/src/openparse/processing/ingest.py @@ -100,7 +100,7 @@ def __init__( self, api_key: str, api_endpoint: str, - azure_deployment: str, + deployment: str, api_version: str = "2024-02-15-preview", model: EmbeddingModel = "text-embedding-3-large", min_tokens: int = consts.TOKENIZATION_LOWER_LIMIT, @@ -109,10 +109,9 @@ def __init__( # if an api endpoint is provided, use AzureOpenAIEmbeddings if api_endpoint is not None: embedding_client = AzureOpenAIEmbeddings( - model=model, api_key=api_key, - azure_endpoint=api_endpoint, - azure_deployment=azure_deployment, + api_endpoint=api_endpoint, + deployment=deployment, api_version=api_version ) else: From e564513b4a10051d455cda7267d654d0ae7dce72 Mon Sep 17 00:00:00 2001 From: leonardobaggio Date: Thu, 17 Oct 2024 18:39:36 -0300 Subject: [PATCH 4/4] fix missing default for api_version on AzureOpenAIEmbeddings --- src/openparse/processing/semantic_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py index d091c8b..d66b7ee 100644 --- a/src/openparse/processing/semantic_transforms.py +++ b/src/openparse/processing/semantic_transforms.py @@ -91,8 +91,8 @@ def __init__( self, api_key: str, api_endpoint: str, - api_version: str, deployment: str, + api_version: str = "2024-02-15-preview", batch_size: int = 256, ): """