diff --git a/README.md b/README.md index edd6f516a..e7363a3cc 100644 --- a/README.md +++ b/README.md @@ -25,40 +25,48 @@ This README provides detailed information on how to set up, develop, and deploy ## Table of Contents -- [Quickstart](#quickstart) -- [About](#about) - - [Plugins](#plugins) - - [Retrieval Plugin](#retrieval-plugin) - - [Memory Feature](#memory-feature) - - [Security](#security) - - [API Endpoints](#api-endpoints) -- [Development](#development) - - [Setup](#setup) - - [General Environment Variables](#general-environment-variables) - - [Choosing a Vector Database](#choosing-a-vector-database) - - [Pinecone](#pinecone) - - [Weaviate](#weaviate) - - [Zilliz](#zilliz) - - [Milvus](#milvus) - - [Qdrant](#qdrant) - - [Redis](#redis) - - [Llama Index](#llamaindex) - - [Chroma](#chroma) - - [Azure Cognitive Search](#azure-cognitive-search) - - [Supabase](#supabase) - - [Postgres](#postgres) - - [AnalyticDB](#analyticdb) - - [Running the API Locally](#running-the-api-locally) - - [Testing a Localhost Plugin in ChatGPT](#testing-a-localhost-plugin-in-chatgpt) - - [Personalization](#personalization) - - [Authentication Methods](#authentication-methods) -- [Deployment](#deployment) -- [Installing a Developer Plugin](#installing-a-developer-plugin) -- [Webhooks](#webhooks) -- [Scripts](#scripts) -- [Limitations](#limitations) -- [Contributors](#contributors) -- [Future Directions](#future-directions) +- [ChatGPT Retrieval Plugin](#chatgpt-retrieval-plugin) + - [Introduction](#introduction) + - [Table of Contents](#table-of-contents) + - [Quickstart](#quickstart) + - [Testing in ChatGPT](#testing-in-chatgpt) + - [About](#about) + - [Plugins](#plugins) + - [Retrieval Plugin](#retrieval-plugin) + - [Memory Feature](#memory-feature) + - [Security](#security) + - [API Endpoints](#api-endpoints) + - [Development](#development) + - [Setup](#setup) + - [General Environment Variables](#general-environment-variables) + - [Using the plugin with Azure OpenAI](#using-the-plugin-with-azure-openai) + - [Choosing a Vector Database](#choosing-a-vector-database) + - [Pinecone](#pinecone) + - [Weaviate](#weaviate) + - [Zilliz](#zilliz) + - [Milvus](#milvus) + - [Qdrant](#qdrant) + - [Redis](#redis) + - [LlamaIndex](#llamaindex) + - [Chroma](#chroma) + - [Azure Cognitive Search](#azure-cognitive-search) + - [Supabase](#supabase) + - [Postgres](#postgres) + - [AnalyticDB](#analyticdb) + - [Azure Cosmos DB](#azure-cosmos-db) + - [Running the API locally](#running-the-api-locally) + - [Testing a Localhost Plugin in ChatGPT](#testing-a-localhost-plugin-in-chatgpt) + - [Personalization](#personalization) + - [Authentication Methods](#authentication-methods) + - [Deployment](#deployment) + - [Installing a Developer Plugin](#installing-a-developer-plugin) + - [Webhooks](#webhooks) + - [Scripts](#scripts) + - [Pull Request (PR) Checklist](#pull-request-pr-checklist) + - [Pull Request Naming Convention](#pull-request-naming-convention) + - [Limitations](#limitations) + - [Future Directions](#future-directions) + - [Contributors](#contributors) ## Quickstart @@ -166,6 +174,12 @@ Follow these steps to quickly set up and run the ChatGPT Retrieval Plugin: export PG_USER= export PG_PASSWORD= export PG_DATABASE= + + # Azure Cosmos DB + export AZCOSMOS_API= + export AZCOSMOS_CONNSTR= + export AZCOSMOS_DATABASE_NAME= + export AZCOSMOS_CONTAINER_NAME= ``` 10. 
Run the API locally: `poetry run start`

@@ -279,7 +293,7 @@ The API requires the following environment variables to work:

 | Name             | Required | Description                                                                                                                                                                                                                                    |
 | ---------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `DATASTORE`      | Yes      | This specifies the vector database provider you want to use to store and query embeddings. You can choose from `chroma`, `pinecone`, `weaviate`, `zilliz`, `milvus`, `qdrant`, `redis`, `azuresearch`, `supabase`, `postgres`, `analyticdb`.  |
+| `DATASTORE`      | Yes      | This specifies the vector database provider you want to use to store and query embeddings. You can choose from `chroma`, `pinecone`, `weaviate`, `zilliz`, `milvus`, `qdrant`, `redis`, `azuresearch`, `supabase`, `postgres`, `analyticdb`, `azurecosmosdb`. |
 | `BEARER_TOKEN`   | Yes      | This is a secret token that you need to authenticate your requests to the API. You can generate one using any tool or method you prefer, such as [jwt.io](https://jwt.io/).                                                                    |
 | `OPENAI_API_KEY` | Yes      | This is your OpenAI API key that you need to generate embeddings using the `text-embedding-ada-002` model. You can get an API key by creating an account on [OpenAI](https://openai.com/).                                                     |

@@ -352,6 +366,10 @@ For detailed setup instructions, refer to [`/docs/providers/llama/setup.md`](/do

 [AnalyticDB](https://www.alibabacloud.com/help/en/analyticdb-for-postgresql/latest/product-introduction-overview) is a distributed cloud-native vector database designed for storing documents and vector embeddings. It is fully compatible with PostgreSQL syntax and managed by Alibaba Cloud. AnalyticDB offers a powerful vector compute engine, processing billions of data vectors and providing features such as indexing algorithms, structured and unstructured data capabilities, real-time updates, distance metrics, scalar filtering, and time travel searches.

 For detailed setup instructions, refer to [`/docs/providers/analyticdb/setup.md`](/docs/providers/analyticdb/setup.md).

+#### Azure Cosmos DB
+
+[Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) is a fully managed NoSQL and relational database for modern app development. Using Azure Cosmos DB for MongoDB vCore, you can store vector embeddings in your documents and perform [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) on a fully managed MongoDB-compatible database service. For detailed setup instructions, refer to [`/docs/providers/azurecosmosdb/setup.md`](/docs/providers/azurecosmosdb/setup.md).
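Before starting the plugin against this new backend, a quick connectivity smoke test can save a debugging round trip. This is a minimal sketch, not part of this PR; it assumes the `AZCOSMOS_*` variables above are exported and uses pymongo's standard `ping` admin command:

```python
# Hypothetical connectivity check for Cosmos DB for MongoDB vCore.
import os

from pymongo import MongoClient

client = MongoClient(os.environ["AZCOSMOS_CONNSTR"])
client.admin.command("ping")  # raises if the endpoint or credentials are wrong
print("Reachable; will use database:", os.environ["AZCOSMOS_DATABASE_NAME"])
```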
+
 ### Running the API locally

 To run the API locally, you first need to set the requisite environment variables with the `export` command:

diff --git a/datastore/factory.py b/datastore/factory.py
index adde49d76..e5243aeae 100644
--- a/datastore/factory.py
+++ b/datastore/factory.py
@@ -36,6 +36,10 @@ async def get_datastore() -> DataStore:
             from datastore.providers.redis_datastore import RedisDataStore

             return await RedisDataStore.init()
+        case "azurecosmosdb":
+            from datastore.providers.azurecosmosdb_datastore import AzureCosmosDBDataStore
+
+            return await AzureCosmosDBDataStore.create()
         case "qdrant":
             from datastore.providers.qdrant_datastore import QdrantDataStore

diff --git a/datastore/providers/azurecosmosdb_datastore.py b/datastore/providers/azurecosmosdb_datastore.py
new file mode 100644
index 000000000..f9d3507d8
--- /dev/null
+++ b/datastore/providers/azurecosmosdb_datastore.py
@@ -0,0 +1,277 @@
+import logging
+import os
+
+from pymongo.mongo_client import MongoClient
+from abc import ABC, abstractmethod
+
+from typing import Any, Dict, List, Optional
+from datetime import datetime
+from datastore.datastore import DataStore
+from models.models import (
+    DocumentChunk,
+    DocumentMetadataFilter,
+    DocumentChunkWithScore,
+    QueryResult,
+    QueryWithEmbedding,
+)
+
+
+# Read environment variables for Cosmos DB Mongo vCore
+AZCOSMOS_API = os.environ.get("AZCOSMOS_API", "mongo-vcore")
+AZCOSMOS_CONNSTR = os.environ.get("AZCOSMOS_CONNSTR")
+AZCOSMOS_DATABASE_NAME = os.environ.get("AZCOSMOS_DATABASE_NAME")
+AZCOSMOS_CONTAINER_NAME = os.environ.get("AZCOSMOS_CONTAINER_NAME")
+assert AZCOSMOS_API is not None
+assert AZCOSMOS_CONNSTR is not None
+assert AZCOSMOS_DATABASE_NAME is not None
+assert AZCOSMOS_CONTAINER_NAME is not None
+
+# OpenAI Ada embeddings dimension
+VECTOR_DIMENSION = 1536
+
+
+# Abstract class, mirroring the top-level DataStore, that allows API-level abstraction
+class AzureCosmosDBStoreApi(ABC):
+    @abstractmethod
+    async def ensure(self, num_lists, similarity):
+        raise NotImplementedError
+
+    @abstractmethod
+    async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -> List[str]:
+        raise NotImplementedError
+
+    @abstractmethod
+    async def query_core(self, query: QueryWithEmbedding) -> List[DocumentChunkWithScore]:
+        raise NotImplementedError
+
+    @abstractmethod
+    async def drop_container(self):
+        raise NotImplementedError
+
+    @abstractmethod
+    async def delete_filter(self, filter: DocumentMetadataFilter):
+        raise NotImplementedError
+
+    @abstractmethod
+    async def delete_ids(self, ids: List[str]):
+        raise NotImplementedError
+
+    @abstractmethod
+    async def delete_document_ids(self, documentIds: List[str]):
+        raise NotImplementedError
+
+
+class MongoStoreApi(AzureCosmosDBStoreApi):
+    def __init__(self, mongoClient: MongoClient):
+        self.mongoClient = mongoClient
+
+    @staticmethod
+    def _get_metadata_filter(filter: DocumentMetadataFilter) -> dict:
+        returnedFilter: dict = {}
+        if filter.document_id is not None:
+            returnedFilter["document_id"] = filter.document_id
+        if filter.author is not None:
+            returnedFilter["metadata.author"] = filter.author
+        # Merge both date bounds into a single range query so that start_date
+        # and end_date can be combined instead of overwriting each other.
+        if filter.start_date is not None:
+            returnedFilter.setdefault("metadata.created_at", {})["$gt"] = datetime.fromisoformat(filter.start_date)
+        if filter.end_date is not None:
+            returnedFilter.setdefault("metadata.created_at", {})["$lt"] = datetime.fromisoformat(filter.end_date)
+        if filter.source is not None:
+            returnedFilter["metadata.source"] = filter.source
+        if filter.source_id is not None:
+            returnedFilter["metadata.source_id"] = filter.source_id
+        return returnedFilter
+
+    async def ensure(self, num_lists, similarity):
+        assert self.mongoClient.is_mongos
+        self.collection = self.mongoClient[AZCOSMOS_DATABASE_NAME][AZCOSMOS_CONTAINER_NAME]
+
+        indexes = self.collection.index_information()
+        if indexes.get("embedding_cosmosSearch") is None:
+            # Ensure the vector index exists.
+            indexDefs: List[Any] = [
+                {
+                    "name": "embedding_cosmosSearch",
+                    "key": {"embedding": "cosmosSearch"},
+                    "cosmosSearchOptions": {
+                        "kind": "vector-ivf",
+                        "numLists": num_lists,
+                        "similarity": similarity,
+                        "dimensions": VECTOR_DIMENSION,
+                    },
+                }
+            ]
+            self.mongoClient[AZCOSMOS_DATABASE_NAME].command(
+                "createIndexes", AZCOSMOS_CONTAINER_NAME, indexes=indexDefs
+            )
+
+    async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -> List[str]:
+        # Until nested doc embedding support is done, treat each chunk as a separate doc.
+        doc_ids: List[str] = []
+        for chunk in chunks:
+            finalDocChunk: dict = {
+                "_id": f"doc:{docId}:chunk:{chunk.id}",
+                "document_id": docId,
+                "embedding": chunk.embedding,
+                "text": chunk.text,
+                "metadata": chunk.metadata.__dict__,
+            }
+            if chunk.metadata.created_at is not None:
+                finalDocChunk["metadata"]["created_at"] = datetime.fromisoformat(chunk.metadata.created_at)
+            self.collection.insert_one(finalDocChunk)
+            doc_ids.append(finalDocChunk["_id"])
+        return doc_ids
+
+    async def query_core(self, query: QueryWithEmbedding) -> List[DocumentChunkWithScore]:
+        pipeline = [
+            {
+                "$search": {
+                    "cosmosSearch": {
+                        "vector": query.embedding,
+                        "path": "embedding",
+                        "k": query.top_k,
+                    },
+                    "returnStoredSource": True,
+                }
+            },
+            {
+                "$project": {
+                    "similarityScore": {"$meta": "searchScore"},
+                    "document": "$$ROOT",
+                }
+            },
+        ]
+
+        # TODO: Add in match filter (once it can be satisfied).
+        # Perform vector search
+        query_results: List[DocumentChunkWithScore] = []
+        for aggResult in self.collection.aggregate(pipeline):
+            finalMetadata = aggResult["document"]["metadata"]
+            if finalMetadata["created_at"] is not None:
+                finalMetadata["created_at"] = datetime.isoformat(finalMetadata["created_at"])
+            result = DocumentChunkWithScore(
+                id=aggResult["_id"],
+                score=aggResult["similarityScore"],
+                text=aggResult["document"]["text"],
+                metadata=finalMetadata,
+            )
+            query_results.append(result)
+        return query_results
+
+    async def drop_container(self):
+        self.collection.drop()
+
+    async def delete_filter(self, filter: DocumentMetadataFilter):
+        delete_filter = self._get_metadata_filter(filter)
+        self.collection.delete_many(delete_filter)
+
+    async def delete_ids(self, ids: List[str]):
+        self.collection.delete_many({"_id": {"$in": ids}})
+
+    async def delete_document_ids(self, documentIds: List[str]):
+        self.collection.delete_many({"document_id": {"$in": documentIds}})
+
+
+# Datastore implementation.
+class AzureCosmosDBDataStore(DataStore):
+    """
+    A memory store backed by Azure Cosmos DB; currently only Mongo vCore is supported.
+    """
+
+    def __init__(self, cosmosStore: AzureCosmosDBStoreApi):
+        self.cosmosStore = cosmosStore
+
+    @staticmethod
+    async def create(num_lists=1, similarity="COS") -> DataStore:
+        """
+        Creates a new datastore based on the Cosmos API provided in the environment
+        variables; only supports Mongo vCore for now.
+
+        Args:
+            num_lists (int): The number of clusters that the inverted file (IVF) index
+                uses to group the vector data.
+                We recommend that num_lists be set to documentCount/1000 for up to
+                1 million documents, and to sqrt(documentCount) for more than 1 million
+                documents. Using a num_lists value of 1 is akin to performing brute-force
+                search, which has limited performance.
+            similarity (str): Similarity metric to use with the IVF index. Possible
+                options are COS (cosine distance), L2 (Euclidean distance), and IP
+                (inner product).
+
+        The defaults let the parameterless call in `datastore/factory.py` work; tune
+        `num_lists` for production data sizes.
+        """
+        # Create the underlying datastore based on the API definition.
+        # Right now this only supports Mongo, but is set up to support more.
+        apiStore: Optional[AzureCosmosDBStoreApi] = None
+        if AZCOSMOS_API == "mongo-vcore":
+            mongoClient = MongoClient(AZCOSMOS_CONNSTR)
+            apiStore = MongoStoreApi(mongoClient)
+        else:
+            raise NotImplementedError
+
+        await apiStore.ensure(num_lists, similarity)
+        store = AzureCosmosDBDataStore(apiStore)
+        return store
+
+    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
+        """
+        Takes in a dict mapping document ids to lists of document chunks and inserts
+        them into the database. Returns a list of document chunk ids.
+        """
+        # Initialize a list of ids to return
+        doc_ids: List[str] = []
+        for doc_id, chunk_list in chunks.items():
+            returnedIds = await self.cosmosStore.upsert_core(doc_id, chunk_list)
+            for returnedId in returnedIds:
+                doc_ids.append(returnedId)
+        return doc_ids
+
+    async def _query(
+        self,
+        queries: List[QueryWithEmbedding],
+    ) -> List[QueryResult]:
+        """
+        Takes in a list of queries with embeddings and filters and
+        returns a list of query results with matching document chunks and scores.
+        """
+        # Prepare query responses and results object
+        results: List[QueryResult] = []
+
+        # Gather query results in a pipeline
+        logging.info(f"Gathering {len(queries)} query results")
+        for query in queries:
+            logging.info(f"Query: {query.query}")
+            query_results = await self.cosmosStore.query_core(query)
+
+            # Add to overall results
+            results.append(QueryResult(query=query.query, results=query_results))
+        return results
+
+    async def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        filter: Optional[DocumentMetadataFilter] = None,
+        delete_all: Optional[bool] = None,
+    ) -> bool:
+        """
+        Removes vectors by ids, filter, or everything in the datastore.
+        Returns whether the operation was successful.
+        """
+        if delete_all:
+            # Fast path: drop the whole container.
+            await self.cosmosStore.drop_container()
+            return True
+
+        if filter:
+            if filter.document_id is not None:
+                await self.cosmosStore.delete_document_ids([filter.document_id])
+            else:
+                await self.cosmosStore.delete_filter(filter)
+
+        if ids:
+            await self.cosmosStore.delete_ids(ids)
+
+        return True

diff --git a/docs/providers/azurecosmosdb/setup.md b/docs/providers/azurecosmosdb/setup.md
new file mode 100644
index 000000000..33d8caa22
--- /dev/null
+++ b/docs/providers/azurecosmosdb/setup.md
@@ -0,0 +1,20 @@
+# Azure Cosmos DB
+
+[Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) is a fully managed NoSQL and relational database for modern app development. Using Azure Cosmos DB for MongoDB vCore, you can store vector embeddings in your documents and perform [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) on a fully managed MongoDB-compatible database service.
+
+Learn more about Azure Cosmos DB for MongoDB vCore [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/).
+If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/).
+
+## Environment variables
+
+| Name                      | Required | Description                                                                      | Default |
+| ------------------------- | -------- | -------------------------------------------------------------------------------- | ------- |
+| `DATASTORE`               | Yes      | Datastore name; set to `azurecosmosdb`                                           |         |
+| `BEARER_TOKEN`            | Yes      | Secret token                                                                     |         |
+| `OPENAI_API_KEY`          | Yes      | OpenAI API key                                                                   |         |
+| `AZCOSMOS_API`            | Yes      | Name of the API you're connecting to; currently only `mongo-vcore` is supported |         |
+| `AZCOSMOS_CONNSTR`        | Yes      | The connection string to your account                                           |         |
+| `AZCOSMOS_DATABASE_NAME`  | Yes      | The database where the data is stored/queried                                   |         |
+| `AZCOSMOS_CONTAINER_NAME` | Yes      | The container where the data is stored/queried                                  |         |
+
+## Indexing
+
+On the first insert, the datastore creates the collection and, if necessary, a vector index on the `embedding` field. Hybrid search is not yet supported.

diff --git a/examples/providers/azurecosmosdb/semantic-search.ipynb b/examples/providers/azurecosmosdb/semantic-search.ipynb
new file mode 100644
index 000000000..4f78aa98d
--- /dev/null
+++ b/examples/providers/azurecosmosdb/semantic-search.ipynb
@@ -0,0 +1,215 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de02cdc9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import requests"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7e5d60e1",
+   "metadata": {},
+   "source": [
+    "# Document retrieval: upsert and basic query usage\n",
+    "\n",
+    "In this walkthrough we will go over the Retrieval API with an Azure Cosmos DB for MongoDB vCore datastore for semantic search.\n",
+    "\n",
+    "Before running the notebook, please initialize the Retrieval API and have it running locally. Please follow the instructions to start the Retrieval API provided [here](https://github.com/openai/chatgpt-retrieval-plugin#quickstart).\n",
+    "\n",
+    "[Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) is a fully managed NoSQL and relational database for modern app development. Using Azure Cosmos DB for MongoDB vCore, you can store vector embeddings in your documents and perform [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) on a fully managed MongoDB-compatible database service.\n",
+    "\n",
+    "Learn more about Azure Cosmos DB for MongoDB vCore [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "80988348",
+   "metadata": {},
+   "source": [
+    "## Document\n",
+    "\n",
+    "First we will create a list of documents. From the perspective of the retrieval plugin, a [document](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/models/models.py) consists of an \"id\", \"text\", \"embedding\" (optional) and a collection of \"metadata\". The \"metadata\" has \"source\", \"source_id\", \"created_at\", \"url\" and \"author\" fields. Query metadata does not expose the \"url\" field.\n",
+    "\n",
+    "For this example we have taken some data about a few dog breeds."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52829ffc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "document_1 = {\n",
+    "    \"id\": \"Siberian Husky\",\n",
+    "    \"text\": \"Siberian Huskies are strikingly beautiful and energetic Arctic breed dogs known for their captivating blue eyes and remarkable endurance in cold climates.\"\n",
+    "}\n",
+    "\n",
+    "document_2 = {\n",
+    "    \"id\": \"Alaskan Malamute\",\n",
+    "    \"text\": \"The Alaskan Malamute is a powerful and friendly Arctic sled dog breed known for its strength, endurance, and affectionate nature.\"\n",
+    "}\n",
+    "\n",
+    "document_3 = {\n",
+    "    \"id\": \"Samoyed\",\n",
+    "    \"text\": \"The Samoyed is a cheerful and fluffy Arctic breed, renowned for its smile and gentle disposition, originally used for herding reindeer and pulling sleds in Siberia.\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6af96f59",
+   "metadata": {},
+   "source": [
+    "## Indexing the Docs\n",
+    "\n",
+    "On the first insert, the datastore creates the collection and, if necessary, a vector index on the `embedding` field. Hybrid search is not yet supported.\n",
+    "\n",
+    "To make these requests to the retrieval app API, we will need to provide authorization in the form of the BEARER_TOKEN we set earlier. We do this below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d68e796e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BEARER_TOKEN_HERE = \"\"\n",
+    "endpoint_url = 'http://0.0.0.0:8000'\n",
+    "headers = {\n",
+    "    \"Authorization\": f\"Bearer {BEARER_TOKEN_HERE}\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "954a09da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(\n",
+    "    f\"{endpoint_url}/upsert\",\n",
+    "    headers=headers,\n",
+    "    json={\"documents\": [document_1, document_2, document_3]}\n",
+    ")\n",
+    "\n",
+    "response.json()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "431a8616",
+   "metadata": {},
+   "source": [
+    "## Querying the datastore\n",
+    "Let's query the datastore for dogs based on their place of origin."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "23441d46",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "queries = [\n",
+    "    {\n",
+    "        \"query\": \"I want dog breeds from Siberia.\",\n",
+    "        \"top_k\": 2\n",
+    "    },\n",
+    "    {\n",
+    "        \"query\": \"I want a dog breed from Alaska.\",\n",
+    "        \"top_k\": 1\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "response = requests.post(\n",
+    "    f\"{endpoint_url}/query\",\n",
+    "    headers=headers,\n",
+    "    json={\"queries\": queries}\n",
+    ")\n",
+    "\n",
+    "response.json()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "705181ee",
+   "metadata": {},
+   "source": [
+    "## Deleting the data from the datastore\n",
+    "You can either delete all the data or provide a list of document ids to delete."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b15513ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.delete(\n",
+    "    f\"{endpoint_url}/delete\",\n",
+    "    headers=headers,\n",
+    "    json={\"ids\": [\"doc:Siberian Husky:chunk:Siberian Husky_0\"]}\n",
+    ")\n",
+    "\n",
+    "response.json()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc748e50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.delete(\n",
+    "    f\"{endpoint_url}/delete\",\n",
+    "    headers=headers,\n",
+    "    json={\"delete_all\": True}\n",
+    ")\n",
+    "\n",
+    "response.json()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19531965",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

diff --git a/poetry.lock b/poetry.lock
index 55cf9e55c..8055ca8f4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2893,6 +2893,101 @@ pandas = ">=1.2.4"
 protobuf = ">=3.20.0"
 ujson = ">=2.0.0"

+[[package]]
+name = "pymongo"
+version = "4.3.3"
+description = "Python driver for MongoDB "
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pymongo-4.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:74731c9e423c93cbe791f60c27030b6af6a948cef67deca079da6cd1bb583a8e"},
+    {file = "pymongo-4.3.3-cp310-cp310-manylinux1_i686.whl", hash = "sha256:66413c50d510e5bcb0afc79880d1693a2185bcea003600ed898ada31338c004e"},
+    {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:9b87b23570565a6ddaa9244d87811c2ee9cffb02a753c8a2da9c077283d85845"},
+    {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_i686.whl", hash = "sha256:695939036a320f4329ccf1627edefbbb67cc7892b8222d297b0dd2313742bfee"},
+    {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_ppc64le.whl", hash = "sha256:ffcc8394123ea8d43fff8e5d000095fe7741ce3f8988366c5c919c4f5eb179d3"},
+    {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_s390x.whl", hash = "sha256:943f208840777f34312c103a2d1caab02d780c4e9be26b3714acf6c4715ba7e1"},
+    {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:01f7cbe88d22440b6594c955e37312d932fd632ffed1a86d0c361503ca82cc9d"},
+    {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdb87309de97c63cb9a69132e1cb16be470e58cffdfbad68fdd1dc292b22a840"},
+    {file =
"pymongo-4.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d86c35d94b5499689354ccbc48438a79f449481ee6300f3e905748edceed78e7"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a966d5304b7d90c45c404914e06bbf02c5bf7e99685c6c12f0047ef2aa837142"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be1d2ce7e269215c3ee9a215e296b7a744aff4f39233486d2c4d77f5f0c561a6"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:55b6163dac53ef1e5d834297810c178050bd0548a4136cd4e0f56402185916ca"}, + {file = "pymongo-4.3.3-cp310-cp310-win32.whl", hash = "sha256:dc0cff74cd36d7e1edba91baa09622c35a8a57025f2f2b7a41e3f83b1db73186"}, + {file = "pymongo-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:cafa52873ae12baa512a8721afc20de67a36886baae6a5f394ddef0ce9391f91"}, + {file = "pymongo-4.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:599d3f6fbef31933b96e2d906b0f169b3371ff79ea6aaf6ecd76c947a3508a3d"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0640b4e9d008e13956b004d1971a23377b3d45491f87082161c92efb1e6c0d6"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:341221e2f2866a5960e6f8610f4cbac0bb13097f3b1a289aa55aba984fc0d969"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e7fac06a539daef4fcf5d8288d0d21b412f9b750454cd5a3cf90484665db442a"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3a51901066696c4af38c6c63a1f0aeffd5e282367ff475de8c191ec9609b56d"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3055510fdfdb1775bc8baa359783022f70bb553f2d46e153c094dfcb08578ff"}, + {file = "pymongo-4.3.3-cp311-cp311-win32.whl", hash = "sha256:524d78673518dcd352a91541ecd2839c65af92dc883321c2109ef6e5cd22ef23"}, + {file = "pymongo-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b8a03af1ce79b902a43f5f694c4ca8d92c2a4195db0966f08f266549e2fc49bc"}, + {file = "pymongo-4.3.3-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:39b03045c71f761aee96a12ebfbc2f4be89e724ff6f5e31c2574c1a0e2add8bd"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6fcfbf435eebf8a1765c6d1f46821740ebe9f54f815a05c8fc30d789ef43cb12"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7d43ac9c7eeda5100fb0a7152fab7099c9cf9e5abd3bb36928eb98c7d7a339c6"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3b93043b14ba7eb08c57afca19751658ece1cfa2f0b7b1fb5c7a41452fbb8482"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:c09956606c08c4a7c6178a04ba2dd9388fcc5db32002ade9c9bc865ab156ab6d"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:b0cfe925610f2fd59555bb7fc37bd739e4b197d33f2a8b2fae7b9c0c6640318c"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:4d00b91c77ceb064c9b0459f0d6ea5bfdbc53ea9e17cf75731e151ef25a830c7"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:c6258a3663780ae47ba73d43eb63c79c40ffddfb764e09b56df33be2f9479837"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c29e758f0e734e1e90357ae01ec9c6daf19ff60a051192fe110d8fb25c62600e"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f3621a46cdc7a9ba8080422262398a91762a581d27e0647746588d3f995c88"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:47f7aa217b25833cd6f0e72b0d224be55393c2692b4f5e0561cb3beeb10296e9"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2fdc855149efe7cdcc2a01ca02bfa24761c640203ea94df467f3baf19078be"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5effd87c7d363890259eac16c56a4e8da307286012c076223997f8cc4a8c435b"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6dd1cf2995fdbd64fc0802313e8323f5fa18994d51af059b5b8862b73b5e53f0"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:bb869707d8e30645ed6766e44098600ca6cdf7989c22a3ea2b7966bb1d98d4b2"}, + {file = "pymongo-4.3.3-cp37-cp37m-win32.whl", hash = "sha256:49210feb0be8051a64d71691f0acbfbedc33e149f0a5d6e271fddf6a12493fed"}, + {file = "pymongo-4.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:54c377893f2cbbffe39abcff5ff2e917b082c364521fa079305f6f064e1a24a9"}, + {file = "pymongo-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c184ec5be465c0319440734491e1aa4709b5f3ba75fdfc9dbbc2ae715a7f6829"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:dca34367a4e77fcab0693e603a959878eaf2351585e7d752cac544bc6b2dee46"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cd6a4afb20fb3c26a7bfd4611a0bbb24d93cbd746f5eb881f114b5e38fd55501"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0c466710871d0026c190fc4141e810cf9d9affbf4935e1d273fbdc7d7cda6143"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:d07d06dba5b5f7d80f9cc45501456e440f759fe79f9895922ed486237ac378a8"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:711bc52cb98e7892c03e9b669bebd89c0a890a90dbc6d5bb2c47f30239bac6e9"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:34b040e095e1671df0c095ec0b04fc4ebb19c4c160f87c2b55c079b16b1a6b00"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:4ed00f96e147f40b565fe7530d1da0b0f3ab803d5dd5b683834500fa5d195ec4"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef888f48eb9203ee1e04b9fb27429017b290fb916f1e7826c2f7808c88798394"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:316498b642c00401370b2156b5233b256f9b33799e0a8d9d0b8a7da217a20fca"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa7e202feb683dad74f00dea066690448d0cfa310f8a277db06ec8eb466601b5"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52896e22115c97f1c829db32aa2760b0d61839cfe08b168c2b1d82f31dbc5f55"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c051fe37c96b9878f37fa58906cb53ecd13dcb7341d3a85f1e2e2f6b10782d9"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5134d33286c045393c7beb51be29754647cec5ebc051cf82799c5ce9820a2ca2"}, + {file = 
"pymongo-4.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a9c2885b4a8e6e39db5662d8b02ca6dcec796a45e48c2de12552841f061692ba"}, + {file = "pymongo-4.3.3-cp38-cp38-win32.whl", hash = "sha256:a6cd6f1db75eb07332bd3710f58f5fce4967eadbf751bad653842750a61bda62"}, + {file = "pymongo-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:d5571b6978750601f783cea07fb6b666837010ca57e5cefa389c1d456f6222e2"}, + {file = "pymongo-4.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:81d1a7303bd02ca1c5be4aacd4db73593f573ba8e0c543c04c6da6275fd7a47e"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:016c412118e1c23fef3a1eada4f83ae6e8844fd91986b2e066fc1b0013cdd9ae"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:8fd6e191b92a10310f5a6cfe10d6f839d79d192fb02480bda325286bd1c7b385"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:e2961b05f9c04a53da8bfc72f1910b6aec7205fcf3ac9c036d24619979bbee4b"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:b38a96b3eed8edc515b38257f03216f382c4389d022a8834667e2bc63c0c0c31"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:c1a70c51da9fa95bd75c167edb2eb3f3c4d27bc4ddd29e588f21649d014ec0b7"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:8a06a0c02f5606330e8f2e2f3b7949877ca7e4024fa2bff5a4506bec66c49ec7"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:6c2216d8b6a6d019c6f4b1ad55f890e5e77eb089309ffc05b6911c09349e7474"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eac0a143ef4f28f49670bf89cb15847eb80b375d55eba401ca2f777cd425f338"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08fc250b5552ee97ceeae0f52d8b04f360291285fc7437f13daa516ce38fdbc6"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704d939656e21b073bfcddd7228b29e0e8a93dd27b54240eaafc0b9a631629a6"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1074f1a6f23e28b983c96142f2d45be03ec55d93035b471c26889a7ad2365db3"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b16250238de8dafca225647608dddc7bbb5dce3dd53b4d8e63c1cc287394c2f"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7761cacb8745093062695b11574effea69db636c2fd0a9269a1f0183712927b4"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fd7bb378d82b88387dc10227cfd964f6273eb083e05299e9b97cbe075da12d11"}, + {file = "pymongo-4.3.3-cp39-cp39-win32.whl", hash = "sha256:dc24d245026a72d9b4953729d31813edd4bd4e5c13622d96e27c284942d33f24"}, + {file = "pymongo-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:fc28e8d85d392a06434e9a934908d97e2cf453d69488d2bcd0bfb881497fd975"}, + {file = "pymongo-4.3.3.tar.gz", hash = "sha256:34e95ffb0a68bffbc3b437f2d1f25fc916fef3df5cdeed0992da5f42fae9b807"}, +] + +[package.dependencies] +dnspython = ">=1.16.0,<3.0.0" + +[package.extras] +aws = ["pymongo-auth-aws (<2.0.0)"] +encryption = ["pymongo-auth-aws (<2.0.0)", "pymongocrypt (>=1.3.0,<2.0.0)"] +gssapi = ["pykerberos"] +ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] +snappy = ["python-snappy"] +zstd = ["zstandard"] + [[package]] name = "pypdf2" version = "3.0.1" 
diff --git a/pyproject.toml b/pyproject.toml
index 628ab44df..2024579d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ redis = "4.5.4"
 supabase = "^1.0.2"
 psycopg2 = "^2.9.5"
 llama-index = "0.5.4"
+pymongo = "4.3.3"
 azure-identity = "^1.12.0"
 azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev"}
 pgvector = "^0.1.7"

diff --git a/tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py b/tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py
new file mode 100644
index 000000000..7b238e4d5
--- /dev/null
+++ b/tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py
@@ -0,0 +1,182 @@
+import pytest
+from typing import Dict, List
+from dotenv import dotenv_values
+
+from datastore.datastore import DataStore
+from datastore.providers.azurecosmosdb_datastore import AzureCosmosDBDataStore
+from models.models import (
+    DocumentChunk,
+    DocumentChunkMetadata,
+    QueryWithEmbedding,
+)
+
+
+num_lists = 1
+similarity = "COS"
+
+
+def create_embedding(non_zero_pos: int) -> List[float]:
+    # Create a 1536-dimensional vector with a single non-zero value.
+    vector = [0.0] * 1536
+    vector[non_zero_pos - 1] = 1.0
+    return vector
+
+
+@pytest.fixture
+def azure_cosmos_db_settings_from_dot_env() -> dict:
+    """
+    Reads the Azure Cosmos DB environment variables from the .env file.
+
+    Returns:
+        dict: The Azure Cosmos DB environment variables
+    """
+    config = dotenv_values(".env")
+    env_variables = {
+        "DATASTORE": "azurecosmosdb",
+        # Right now Cosmos DB only supports vector search in Mongo vCore.
+        "AZCOSMOS_API": config.get("AZCOSMOS_API"),
+        "AZCOSMOS_CONNSTR": config.get("AZCOSMOS_CONNSTR"),
+        "AZCOSMOS_DATABASE_NAME": config.get("AZCOSMOS_DATABASE_NAME"),
+        "AZCOSMOS_CONTAINER_NAME": config.get("AZCOSMOS_CONTAINER_NAME"),
+    }
+
+    return env_variables
+
+
+@pytest.fixture
+def initial_document_chunks() -> Dict[str, List[DocumentChunk]]:
+    first_doc_chunks = [
+        DocumentChunk(
+            id=f"first-doc-{i}",
+            text=f"Lorem ipsum {i}",
+            metadata=DocumentChunkMetadata(),
+            embedding=create_embedding(i),
+        )
+        for i in range(4, 7)
+    ]
+    return {
+        "first-doc": first_doc_chunks,
+    }
+
+
+@pytest.fixture
+def queries() -> List[QueryWithEmbedding]:
+    queries = [
+        QueryWithEmbedding(
+            query="Query 1",
+            top_k=1,
+            embedding=create_embedding(4),
+        ),
+        QueryWithEmbedding(
+            query="Query 2",
+            top_k=2,
+            embedding=create_embedding(5),
+        ),
+    ]
+    return queries
+
+
+@pytest.fixture
+async def azurecosmosdb_datastore() -> DataStore:
+    return await AzureCosmosDBDataStore.create(num_lists=num_lists, similarity=similarity)
+
+
+@pytest.mark.asyncio
+async def test_upsert(
+    azurecosmosdb_datastore: AzureCosmosDBDataStore,
+    initial_document_chunks: Dict[str, List[DocumentChunk]],
+) -> None:
+    """Test basic upsert."""
+    doc_ids = await azurecosmosdb_datastore._upsert(initial_document_chunks)
+    assert doc_ids == [
+        f"doc:{doc_id}:chunk:{chunk.id}"
+        for doc_id, chunk_list in initial_document_chunks.items()
+        for chunk in chunk_list
+    ]
+
+
+@pytest.mark.asyncio
+async def test_query(
+    azurecosmosdb_datastore: AzureCosmosDBDataStore,
+    initial_document_chunks: Dict[str, List[DocumentChunk]],
+    queries: List[QueryWithEmbedding],
+) -> None:
+    """Test basic query."""
+    await azurecosmosdb_datastore.delete(delete_all=True)
+    # Insert to prepare for the test.
+    await azurecosmosdb_datastore._upsert(initial_document_chunks)
+
+    query_results = await azurecosmosdb_datastore._query(queries)
+    assert len(query_results) == len(queries)
+
+    query_0_results = query_results[0].results
+    query_1_results = query_results[1].results
+
+    assert len(query_0_results) == 1
+    assert len(query_1_results) == 2
+
+    # The nearest chunk for each query embedding should be ranked first.
+    assert query_0_results[0].id == "doc:first-doc:chunk:first-doc-4"
+    assert query_1_results[0].id == "doc:first-doc:chunk:first-doc-5"
+    assert query_1_results[1].id == "doc:first-doc:chunk:first-doc-4"
+
+
+@pytest.mark.asyncio
+async def test_delete(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None:
+    await azurecosmosdb_datastore.delete(delete_all=True)
+    chunk1 = DocumentChunk(
+        id="deleteChunk1",
+        text="delete text 1",
+        embedding=[1] * 1536,
+        metadata=DocumentChunkMetadata(),
+    )
+    chunk2 = DocumentChunk(
+        id="deleteChunk2",
+        text="delete text 2",
+        embedding=[1] * 1536,
+        metadata=DocumentChunkMetadata(),
+    )
+    # Insert to prepare for the test.
+    await azurecosmosdb_datastore._upsert({"deleteDoc1": [chunk1], "deleteDoc2": [chunk2]})
+
+    query_embedding = [1] * 1536
+    query = QueryWithEmbedding(
+        query="Query for delete",
+        embedding=query_embedding,
+    )
+    results = await azurecosmosdb_datastore._query([query])
+
+    assert len(results[0].results) == 2
+    assert results[0].results[0].id == "doc:deleteDoc1:chunk:deleteChunk1"
+    assert results[0].results[1].id == "doc:deleteDoc2:chunk:deleteChunk2"
+
+    await azurecosmosdb_datastore.delete(ids=["doc:deleteDoc1:chunk:deleteChunk1"])
+    results_after_delete = await azurecosmosdb_datastore._query([query])
+
+    assert len(results_after_delete[0].results) == 1
+    assert results_after_delete[0].results[0].id == "doc:deleteDoc2:chunk:deleteChunk2"
+
+
+@pytest.mark.asyncio
+async def test_delete_all(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None:
+    await azurecosmosdb_datastore.delete(delete_all=True)
+    chunk = DocumentChunk(
+        id="deleteChunk",
+        text="delete text",
+        embedding=[1] * 1536,
+        metadata=DocumentChunkMetadata(),
+    )
+    await azurecosmosdb_datastore._upsert({"deleteDoc": [chunk]})
+
+    query_embedding = [1] * 1536
+    query = QueryWithEmbedding(
+        query="delete query",
+        embedding=query_embedding,
+        top_k=1,
+    )
+    results = await azurecosmosdb_datastore._query([query])
+
+    assert len(results) == 1
+    assert len(results[0].results) == 1
+    assert results[0].results[0].id == "doc:deleteDoc:chunk:deleteChunk"
+
+    await azurecosmosdb_datastore.delete(delete_all=True)
+    results_after_delete = await azurecosmosdb_datastore._query([query])
+
+    assert len(results_after_delete[0].results) == 0
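Beyond the pytest suite, an end-to-end smoke run of the new datastore might look like the following. This is a sketch under the same assumptions as the tests: the `AZCOSMOS_*` variables are set, embeddings are 1536-dimensional, and the `create`/`_upsert`/`_query`/`delete` surface matches this diff.

```python
# Hypothetical standalone exercise of AzureCosmosDBDataStore (not part of this PR).
import asyncio

from datastore.providers.azurecosmosdb_datastore import AzureCosmosDBDataStore
from models.models import DocumentChunk, DocumentChunkMetadata, QueryWithEmbedding


async def main() -> None:
    store = await AzureCosmosDBDataStore.create(num_lists=1, similarity="COS")

    # Insert one chunk with a placeholder 1536-dim embedding.
    chunk = DocumentChunk(
        id="demo-chunk",
        text="hello world",
        embedding=[0.0] * 1535 + [1.0],
        metadata=DocumentChunkMetadata(),
    )
    inserted = await store._upsert({"demo-doc": [chunk]})
    print("inserted:", inserted)  # expected: ["doc:demo-doc:chunk:demo-chunk"]

    # Query with the same embedding; the inserted chunk should rank first.
    results = await store._query(
        [QueryWithEmbedding(query="hello", top_k=1, embedding=[0.0] * 1535 + [1.0])]
    )
    print("top match:", results[0].results[0].id)

    await store.delete(delete_all=True)  # clean up


asyncio.run(main())
```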