From cf08c870e8dc84adbad3876af87d71c90c341e2c Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Mon, 8 Dec 2025 23:52:02 +0530 Subject: [PATCH 01/13] renamed directory and fixed frontmatter --- crewai/gsi/frontmatter.md | 22 ----------------- crewai/{gsi => query_based}/.env.sample | 0 .../RAG_with_Couchbase_and_CrewAI.ipynb | 0 crewai/query_based/frontmatter.md | 24 +++++++++++++++++++ crewai/{fts => search_based}/.env.sample | 0 .../RAG_with_Couchbase_and_CrewAI.ipynb | 0 crewai/{fts => search_based}/crew_index.json | 0 crewai/{fts => search_based}/frontmatter.md | 8 +++---- 8 files changed, 28 insertions(+), 26 deletions(-) delete mode 100644 crewai/gsi/frontmatter.md rename crewai/{gsi => query_based}/.env.sample (100%) rename crewai/{gsi => query_based}/RAG_with_Couchbase_and_CrewAI.ipynb (100%) create mode 100644 crewai/query_based/frontmatter.md rename crewai/{fts => search_based}/.env.sample (100%) rename crewai/{fts => search_based}/RAG_with_Couchbase_and_CrewAI.ipynb (100%) rename crewai/{fts => search_based}/crew_index.json (100%) rename crewai/{fts => search_based}/frontmatter.md (67%) diff --git a/crewai/gsi/frontmatter.md b/crewai/gsi/frontmatter.md deleted file mode 100644 index 450d7f41..00000000 --- a/crewai/gsi/frontmatter.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -# frontmatter -path: "/tutorial-crewai-couchbase-rag-with-global-secondary-index" -title: Retrieval-Augmented Generation (RAG) with Couchbase and CrewAI with GSI -short_title: RAG with Couchbase and CrewAI with GSI -description: - - Learn how to build a semantic search engine using Couchbase and CrewAI. - - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with CrewAI's agent-based approach. - - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain, CrewAI, and Couchbase with GSI. -content_type: tutorial -filter: sdk -technology: - - vector search -tags: - - GSI - - Artificial Intelligence - - LangChain - - CrewAI -sdk_language: - - python -length: 60 Mins ---- diff --git a/crewai/gsi/.env.sample b/crewai/query_based/.env.sample similarity index 100% rename from crewai/gsi/.env.sample rename to crewai/query_based/.env.sample diff --git a/crewai/gsi/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb similarity index 100% rename from crewai/gsi/RAG_with_Couchbase_and_CrewAI.ipynb rename to crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb diff --git a/crewai/query_based/frontmatter.md b/crewai/query_based/frontmatter.md new file mode 100644 index 00000000..eb914cea --- /dev/null +++ b/crewai/query_based/frontmatter.md @@ -0,0 +1,24 @@ +--- +# frontmatter +path: "/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index" +title: Retrieval-Augmented Generation (RAG) with CrewAI using Couchbase Hyperscale and Composite Vector Index +short_title: RAG with CrewAI using Couchbase Hyperscale and Composite Vector Index +description: + - Learn how to build a semantic search engine using Couchbase and CrewAI. + - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with CrewAI's agent-based approach. + - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain, CrewAI, Couchbase Hyperscale and Composite Vector Index. 
+content_type: tutorial +filter: sdk +technology: + - vector search +tags: + - Hyperscale Vector Index + - Composite Vector Index + - Artificial Intelligence + - LangChain + - CrewAI +sdk_language: + - python +length: 60 Mins +alt_paths: ["/tutorial-crewai-couchbase-rag-with-hyperscale-vector-index", "/tutorial-crewai-couchbase-rag-with-composite-vector-index"] +--- diff --git a/crewai/fts/.env.sample b/crewai/search_based/.env.sample similarity index 100% rename from crewai/fts/.env.sample rename to crewai/search_based/.env.sample diff --git a/crewai/fts/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb similarity index 100% rename from crewai/fts/RAG_with_Couchbase_and_CrewAI.ipynb rename to crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb diff --git a/crewai/fts/crew_index.json b/crewai/search_based/crew_index.json similarity index 100% rename from crewai/fts/crew_index.json rename to crewai/search_based/crew_index.json diff --git a/crewai/fts/frontmatter.md b/crewai/search_based/frontmatter.md similarity index 67% rename from crewai/fts/frontmatter.md rename to crewai/search_based/frontmatter.md index 0f64bf06..0a447a37 100644 --- a/crewai/fts/frontmatter.md +++ b/crewai/search_based/frontmatter.md @@ -1,8 +1,8 @@ --- # frontmatter -path: "/tutorial-crewai-couchbase-rag-with-fts" -title: Retrieval-Augmented Generation (RAG) with Couchbase and CrewAI using FTS Service -short_title: RAG with Couchbase and CrewAI using FTS +path: "/tutorial-crewai-couchbase-rag-with-search-vector-index" +title: Retrieval-Augmented Generation (RAG) with CrewAI using Couchbase Search Vector Index +short_title: RAG with CrewAI using Couchbase Search Vector Index description: - Learn how to build a semantic search engine using Couchbase and CrewAI. - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with CrewAI's agent-based approach. @@ -12,7 +12,7 @@ filter: sdk technology: - vector search tags: - - FTS + - Search Vector Index - Artificial Intelligence - LangChain - CrewAI From cd9b2f03f58104ef2e1f1c8d919706f5a73884b3 Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Tue, 9 Dec 2025 00:22:50 +0530 Subject: [PATCH 02/13] updated notebook to use Hyperscale and Composite index terminology --- .../RAG_with_Couchbase_and_CrewAI.ipynb | 216 +++++++++--------- 1 file changed, 104 insertions(+), 112 deletions(-) diff --git a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb index ddd66fe4..be40794b 100644 --- a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb +++ b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb @@ -4,14 +4,6 @@ "cell_type": "markdown", "id": "82d610e0", "metadata": {}, - "source": [ - "# Agent-Based RAG with Couchbase GSI Vector Search and CrewAI" - ] - }, - { - "cell_type": "markdown", - "id": "a3073978", - "metadata": {}, "source": [ "## Overview" ] @@ -21,7 +13,7 @@ "id": "7e91202c", "metadata": {}, "source": [ - "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. 
This tutorial uses Couchbase's **Global Secondary Index (GSI)** vector search capabilities, which offer high-performance vector search optimized for large-scale applications. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the FTS index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-fts/)" + "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial uses Couchbase's **Hyperscale and Composite Vector Index** capabilities, which offer high-performance vector search optimized for large-scale applications. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/)" ] }, { @@ -37,7 +29,7 @@ "id": "4e84bba4", "metadata": {}, "source": [ - "This tutorial is available as a Jupyter Notebook (.ipynb file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/crewai/gsi/RAG_with_Couchbase_and_CrewAI.ipynb).\n", + "This tutorial is available as a Jupyter Notebook (.ipynb file) that you can run interactively. 
You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb).\n", "\n", "You can either:\n", "- Download the notebook file and run it on [Google Colab](https://colab.research.google.com)\n", @@ -69,7 +61,7 @@ " - To get started with [Couchbase Capella](https://cloud.couchbase.com), create an account and use it to deploy a free tier operational cluster\n", " - This account provides you with an environment where you can explore and learn about Capella\n", " - To learn more, please follow the [Getting Started Guide](https://docs.couchbase.com/cloud/get-started/create-account.html)\n", - " - **Important**: This tutorial requires Couchbase Server **8.0+** for GSI vector search capabilities" + " - **Important**: This tutorial requires Couchbase Server **8.0+** for Hyperscale and Composite vector index capabilities" ] }, { @@ -113,12 +105,12 @@ "source": [ "We'll install the following key libraries:\n", "- `datasets`: For loading and managing our training data\n", - "- `langchain-couchbase`: To integrate Couchbase with LangChain for GSI vector storage and caching\n", + "- `langchain-couchbase`: To integrate Couchbase with LangChain for Hyperscale and Composite vector storage and caching\n", "- `langchain-openai`: For accessing OpenAI's embedding and chat models\n", "- `crewai`: To create and orchestrate our AI agents for RAG operations\n", "- `python-dotenv`: For securely managing environment variables and API keys\n", "\n", - "These libraries provide the foundation for building a semantic search engine with GSI vector embeddings, database integration, and agent-based RAG capabilities." + "These libraries provide the foundation for building a semantic search engine with Hyperscale and Composite vector embeddings, database integration, and agent-based RAG capabilities." ] }, { @@ -458,7 +450,7 @@ "id": "fa4faf3f", "metadata": {}, "source": [ - "## Understanding GSI Vector Search" + "## Understanding Hyperscale and Composite Vector Indexes" ] }, { @@ -466,7 +458,7 @@ "id": "69c7d28f", "metadata": {}, "source": [ - "### GSI Vector Index Configuration" + "### Hyperscale and Composite Vector Index Configuration" ] }, { @@ -474,7 +466,7 @@ "id": "90080454", "metadata": {}, "source": [ - "Semantic search with GSI requires creating a Global Secondary Index optimized for vector operations. Unlike FTS-based vector search, GSI vector indexes offer two distinct types optimized for different use cases:" + "Semantic search with Hyperscale and Composite Vector Indexes requires creating indexes optimized for vector operations. Unlike Search Vector Index-based vector search, Hyperscale and Composite vector indexes offer two distinct types optimized for different use cases. Learn more about these index types in the [Couchbase Vector Index Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html)." 
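The tutorial creates these indexes programmatically through the LangChain vector store, but it can help to see roughly what the two index types correspond to as SQL++ DDL. The sketch below is illustrative only: the keyspace, field names, and WITH options are assumptions, so verify the exact syntax against the vector index documentation linked above before using it.

```python
from datetime import timedelta

from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import ClusterOptions

# Placeholder connection details; in practice reuse the `cluster` object created in the tutorial.
auth = PasswordAuthenticator("Administrator", "password")
cluster = Cluster("couchbases://your-cluster.cloud.couchbase.com", ClusterOptions(auth))
cluster.wait_until_ready(timedelta(seconds=5))

# Bucket.scope.collection used in this tutorial; adjust to your own keyspace.
keyspace = "`vector-search-testing`.`shared`.`crew`"

# Hyperscale vector index: a dedicated index over the vector field only,
# suited to pure nearest-neighbour searches. (Approximate syntax.)
hyperscale_ddl = f"""
CREATE VECTOR INDEX idx_crew_hyperscale
ON {keyspace}(embedding VECTOR)
WITH {{"dimension": 1536, "similarity": "cosine", "description": "IVF,SQ8"}}
"""

# Composite vector index: scalar keys plus a vector key in a single index,
# so vector similarity can be combined with attribute filters. (Approximate syntax.)
composite_ddl = f"""
CREATE INDEX idx_crew_composite
ON {keyspace}(metadata.category, embedding VECTOR)
WITH {{"dimension": 1536, "similarity": "cosine", "description": "IVF,SQ8"}}
"""

for ddl in (hyperscale_ddl, composite_ddl):
    cluster.query(ddl).execute()
```

The practical difference is that the Hyperscale index covers only the vector field for pure similarity search, while the Composite index adds scalar keys so a single index can filter on attributes and rank by vector distance.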
] }, { @@ -482,7 +474,7 @@ "id": "72154198", "metadata": {}, "source": [ - "#### GSI Vector Index Types" + "#### Vector Index Types" ] }, { @@ -490,7 +482,7 @@ "id": "b55cb1f4", "metadata": {}, "source": [ - "##### Hyperscale Vector Indexes (BHIVE)" + "##### Hyperscale Vector Indexes" ] }, { @@ -558,12 +550,12 @@ "\n", "For detailed configuration options, see the [Quantization & Centroid Settings](https://docs.couchbase.com/cloud/vector-index/hyperscale-vector-index.html#algo_settings).\n", "\n", - "For more information on GSI vector indexes, see [Couchbase GSI Vector Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html)." + "For more information on Hyperscale and Composite vector indexes, see [Couchbase Vector Index Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html)." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "88e4c207", "metadata": {}, "outputs": [ @@ -571,24 +563,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "GSI vector index configuration prepared\n" + "Hyperscale and Composite vector index configuration prepared\n" ] } ], "source": [ - "# GSI Vector Index Configuration\n", - "# Unlike FTS indexes, GSI vector indexes are created programmatically through the vector store\n", + "# Hyperscale and Composite Vector Index Configuration\n", + "# Unlike Search Vector Index, Hyperscale and Composite vector indexes are created programmatically through the vector store\n", "# We'll configure the parameters that will be used for index creation\n", "\n", "# Vector configuration\n", "DISTANCE_STRATEGY = DistanceStrategy.COSINE # Cosine similarity\n", - "INDEX_TYPE = IndexType.BHIVE # Using BHIVE for high-performance vector \n", + "INDEX_TYPE = IndexType.HYPERSCALE # Using HYPERSCALE for high-performance vector \n", "INDEX_DESCRIPTION = \"IVF,SQ8\" # Auto-selected centroids with 8-bit scalar quantization\n", "\n", "# To create a Composite Index instead, use the following:\n", "# INDEX_TYPE = IndexType.COMPOSITE # Combines vector search with scalar filtering\n", "\n", - "print(\"GSI vector index configuration prepared\")" + "print(\"Hyperscale and Composite vector index configuration prepared\")" ] }, { @@ -608,7 +600,7 @@ "\n", "```python\n", "# Alternative configuration for Composite index\n", - "INDEX_TYPE = IndexType.COMPOSITE # Instead of IndexType.BHIVE\n", + "INDEX_TYPE = IndexType.COMPOSITE # Instead of IndexType.HYPERSCALE\n", "INDEX_DESCRIPTION = \"IVF,SQ8\" # Same quantization settings\n", "DISTANCE_STRATEGY = DistanceStrategy.COSINE # Same distance metric\n", "\n", @@ -703,7 +695,7 @@ "id": "7340c7ce", "metadata": {}, "source": [ - "### Create Couchbase GSI Vector Store" + "### Create Couchbase Hyperscale Vector Store" ] }, { @@ -711,12 +703,12 @@ "id": "2d202628", "metadata": {}, "source": [ - "Set up the GSI vector store where we'll store document embeddings for high-performance semantic search." + "Set up the Hyperscale vector store where we'll store document embeddings for high-performance semantic search." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "a877a51d", "metadata": {}, "outputs": [ @@ -724,19 +716,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-10-06 10:18:05 [INFO] GSI Vector store setup completed\n" + "2025-10-06 10:18:05 [INFO] Hyperscale Vector store setup completed\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "GSI Vector store initialized successfully\n" + "Hyperscale Vector store initialized successfully\n" ] } ], "source": [ - "# Setup GSI vector store with OpenAI embeddings\n", + "# Setup Hyperscale vector store with OpenAI embeddings\n", "try:\n", " vector_store = CouchbaseQueryVectorStore(\n", " cluster=cluster,\n", @@ -746,11 +738,11 @@ " embedding=embeddings,\n", " distance_metric=DISTANCE_STRATEGY\n", " )\n", - " print(\"GSI Vector store initialized successfully\")\n", - " logging.info(\"GSI Vector store setup completed\")\n", + " print(\"Hyperscale Vector store initialized successfully\")\n", + " logging.info(\"Hyperscale Vector store setup completed\")\n", "except Exception as e:\n", - " logging.error(f\"Failed to initialize GSI vector store: {str(e)}\")\n", - " raise RuntimeError(f\"GSI Vector store initialization failed: {str(e)}\")" + " logging.error(f\"Failed to initialize Hyperscale vector store: {str(e)}\")\n", + " raise RuntimeError(f\"Hyperscale Vector store initialization failed: {str(e)}\")" ] }, { @@ -912,13 +904,13 @@ "id": "51df07c1", "metadata": {}, "source": [ - "Now let's demonstrate the performance benefits of GSI optimization by testing pure vector search performance. We'll compare three optimization levels:\n", + "Now let's demonstrate the performance benefits of Hyperscale vector index optimization by testing pure vector search performance. We'll compare three optimization levels:\n", "\n", - "1. **Baseline Performance**: Vector search without GSI optimization\n", - "2. **GSI-Optimized Performance**: Same search with BHIVE GSI index\n", - "3. **Cache Benefits**: Show how caching can be applied on top of GSI for repeated queries\n", + "1. **Baseline Performance**: Vector search without Hyperscale vector index optimization\n", + "2. **Hyperscale-Optimized Performance**: Same search with Hyperscale vector index\n", + "3. **Cache Benefits**: Show how caching can be applied on top of Hyperscale vector index for repeated queries\n", "\n", - "**Important**: This testing focuses on pure vector search performance, isolating the GSI improvements from other workflow overhead." + "**Important**: This testing focuses on pure vector search performance, isolating the Hyperscale vector index improvements from other workflow overhead." ] }, { @@ -931,14 +923,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "c43f62fa", "metadata": {}, "outputs": [], "source": [ "import time\n", "\n", - "# Create GSI vector retriever optimized for high-performance searches\n", + "# Create Hyperscale vector retriever optimized for high-performance searches\n", "retriever = vector_store.as_retriever(\n", " search_type=\"similarity\",\n", " search_kwargs={\"k\": 4} # Return top 4 most similar documents\n", @@ -976,7 +968,7 @@ "id": "f939b9e1", "metadata": {}, "source": [ - "### Test 1: Baseline Performance (No GSI Index)" + "### Test 1: Baseline Performance (No Hyperscale Vector Index)" ] }, { @@ -984,12 +976,12 @@ "id": "e20d10ad", "metadata": {}, "source": [ - "Test pure vector search performance without GSI optimization." 
+ "Test pure vector search performance without Hyperscale vector index optimization." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "71ceaa56", "metadata": {}, "outputs": [ @@ -997,7 +989,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing baseline vector search performance without GSI optimization...\n", + "Testing baseline vector search performance without Hyperscale vector index optimization...\n", "\n", "[Baseline Search] Testing vector search performance\n", "[Baseline Search] Query: 'What are the latest developments in football transfers?'\n", @@ -1005,17 +997,17 @@ "[Baseline Search] Found 4 relevant documents\n", "[Baseline Search] Top result preview: The latest updates and analysis from the BBC.\n", "\n", - "Baseline vector search time (without GSI): 1.3999 seconds\n", + "Baseline vector search time (without Hyperscale vector index): 1.3999 seconds\n", "\n" ] } ], "source": [ - "# Test baseline vector search performance without GSI index\n", + "# Test baseline vector search performance without Hyperscale vector index\n", "test_query = \"What are the latest developments in football transfers?\"\n", - "print(\"Testing baseline vector search performance without GSI optimization...\")\n", + "print(\"Testing baseline vector search performance without Hyperscale vector index optimization...\")\n", "baseline_time = test_vector_search_performance(test_query, \"Baseline Search\")\n", - "print(f\"\\nBaseline vector search time (without GSI): {baseline_time:.4f} seconds\\n\")" + "print(f\"\\nBaseline vector search time (without Hyperscale vector index): {baseline_time:.4f} seconds\\n\")" ] }, { @@ -1023,7 +1015,7 @@ "id": "90d304e9", "metadata": {}, "source": [ - "### Create BHIVE GSI Index" + "### Create Hyperscale Vector Index" ] }, { @@ -1031,12 +1023,12 @@ "id": "6ae9cef0", "metadata": {}, "source": [ - "Now let's create a BHIVE GSI vector index to enable high-performance vector searches. The index creation is done programmatically through the vector store, which will optimize the index settings based on our data and requirements." + "Now let's create a Hyperscale vector index to enable high-performance vector searches. The index creation is done programmatically through the vector store, which will optimize the index settings based on our data and requirements." 
] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "389d1358", "metadata": {}, "outputs": [ @@ -1044,36 +1036,36 @@ "name": "stdout", "output_type": "stream", "text": [ - "Creating BHIVE GSI vector index...\n" + "Creating Hyperscale vector index...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2025-10-06 10:20:15 [INFO] BHIVE index created with description 'IVF,SQ8'\n" + "2025-10-06 10:20:15 [INFO] Hyperscale index created with description 'IVF,SQ8'\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "GSI Vector index created successfully\n", + "Hyperscale Vector index created successfully\n", "Waiting for index to become available...\n" ] } ], "source": [ - "# Create GSI Vector Index for high-performance searches\n", - "print(\"Creating BHIVE GSI vector index...\")\n", + "# Create Hyperscale Vector Index for high-performance searches\n", + "print(\"Creating Hyperscale vector index...\")\n", "try:\n", - " # Create a BHIVE index optimized for pure vector searches\n", + " # Create a Hyperscale index optimized for pure vector searches\n", " vector_store.create_index(\n", - " index_type=INDEX_TYPE, # BHIVE index type\n", + " index_type=INDEX_TYPE, # Hyperscale index type\n", " index_description=INDEX_DESCRIPTION # IVF,SQ8 for optimized performance\n", " )\n", - " print(f\"GSI Vector index created successfully\")\n", - " logging.info(f\"BHIVE index created with description '{INDEX_DESCRIPTION}'\")\n", + " print(f\"Hyperscale Vector index created successfully\")\n", + " logging.info(f\"Hyperscale index created with description '{INDEX_DESCRIPTION}'\")\n", " \n", " # Wait a moment for index to be available\n", " print(\"Waiting for index to become available...\")\n", @@ -1082,11 +1074,11 @@ "except Exception as e:\n", " # Index might already exist, which is fine\n", " if \"already exists\" in str(e).lower():\n", - " print(f\"GSI Vector index already exists, proceeding...\")\n", + " print(f\"Hyperscale Vector index already exists, proceeding...\")\n", " logging.info(f\"Index already exists\")\n", " else:\n", - " logging.error(f\"Failed to create GSI index: {str(e)}\")\n", - " raise RuntimeError(f\"GSI index creation failed: {str(e)}\")" + " logging.error(f\"Failed to create Hyperscale vector index: {str(e)}\")\n", + " raise RuntimeError(f\"Hyperscale vector index creation failed: {str(e)}\")" ] }, { @@ -1094,7 +1086,7 @@ "id": "6b9e5763", "metadata": {}, "source": [ - "### Test 2: GSI-Optimized Performance" + "### Test 2: Hyperscale-Optimized Performance" ] }, { @@ -1102,12 +1094,12 @@ "id": "8388f41b", "metadata": {}, "source": [ - "Test the same vector search with BHIVE GSI optimization." + "Test the same vector search with Hyperscale vector index optimization." 
] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "b1b89f5b", "metadata": {}, "outputs": [ @@ -1115,22 +1107,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing vector search performance with BHIVE GSI optimization...\n", + "Testing vector search performance with Hyperscale vector index optimization...\n", "\n", - "[GSI-Optimized Search] Testing vector search performance\n", - "[GSI-Optimized Search] Query: 'What are the latest developments in football transfers?'\n", - "[GSI-Optimized Search] Vector search completed in 0.5885 seconds\n", - "[GSI-Optimized Search] Found 4 relevant documents\n", - "[GSI-Optimized Search] Top result preview: Four key areas for Everton's new owners to address\n", + "[Hyperscale-Optimized Search] Testing vector search performance\n", + "[Hyperscale-Optimized Search] Query: 'What are the latest developments in football transfers?'\n", + "[Hyperscale-Optimized Search] Vector search completed in 0.5885 seconds\n", + "[Hyperscale-Optimized Search] Found 4 relevant documents\n", + "[Hyperscale-Optimized Search] Top result preview: Four key areas for Everton's new owners to address\n", "\n", "Everton fans last saw silverware in 1995 when th...\n" ] } ], "source": [ - "# Test vector search performance with GSI index\n", - "print(\"Testing vector search performance with BHIVE GSI optimization...\")\n", - "gsi_search_time = test_vector_search_performance(test_query, \"GSI-Optimized Search\")" + "# Test vector search performance with Hyperscale vector index\n", + "print(\"Testing vector search performance with Hyperscale vector index optimization...\")\n", + "hyperscale_search_time = test_vector_search_performance(test_query, \"Hyperscale-Optimized Search\")" ] }, { @@ -1146,7 +1138,7 @@ "id": "1cc73249", "metadata": {}, "source": [ - "Now let's demonstrate how caching can improve performance for repeated queries. **Note**: Caching benefits apply to both baseline and GSI-optimized searches." + "Now let's demonstrate how caching can improve performance for repeated queries. **Note**: Caching benefits apply to both baseline and Hyperscale-optimized searches." 
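As a reference point for the caching discussion, here is a minimal sketch of one way to wire up Couchbase-backed caching with langchain-couchbase, using its LLM response cache. The collection name and connection details below are assumptions for illustration, not necessarily the notebook's exact configuration.

```python
from datetime import timedelta

from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import ClusterOptions
from langchain_core.globals import set_llm_cache
from langchain_couchbase.cache import CouchbaseCache

# Placeholder connection details; in the notebook these come from environment variables.
auth = PasswordAuthenticator("Administrator", "password")
cluster = Cluster("couchbases://your-cluster.cloud.couchbase.com", ClusterOptions(auth))
cluster.wait_until_ready(timedelta(seconds=5))

# Route every LLM call through a Couchbase-backed cache so a repeated prompt is
# answered from the cache instead of triggering another model call.
set_llm_cache(
    CouchbaseCache(
        cluster=cluster,
        bucket_name="vector-search-testing",  # bucket used in this tutorial
        scope_name="shared",                  # scope used in this tutorial
        collection_name="cache",              # assumed name for a dedicated cache collection
    )
)
# After this, any ChatOpenAI (or other LangChain LLM) invocation with an identical
# prompt is served from the cache, which is where the "cache hit" timings come from.
```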
] }, { @@ -1212,7 +1204,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "388ca617", "metadata": {}, "outputs": [ @@ -1224,8 +1216,8 @@ "================================================================================\n", "VECTOR SEARCH PERFORMANCE OPTIMIZATION SUMMARY\n", "================================================================================\n", - "Phase 1 - Baseline Search (No GSI): 1.3999 seconds\n", - "Phase 2 - GSI-Optimized Search: 0.5885 seconds\n", + "Phase 1 - Baseline Search (No Hyperscale): 1.3999 seconds\n", + "Phase 2 - Hyperscale-Optimized Search: 0.5885 seconds\n", "Phase 3 - Cache Benefits:\n", " First execution (cache miss): 0.6450 seconds\n", " Second execution (cache hit): 0.4306 seconds\n", @@ -1233,14 +1225,14 @@ "--------------------------------------------------------------------------------\n", "VECTOR SEARCH OPTIMIZATION IMPACT:\n", "--------------------------------------------------------------------------------\n", - "GSI Index Benefit: 2.38x faster (58.0% improvement)\n", + "Hyperscale Index Benefit: 2.38x faster (58.0% improvement)\n", "Cache Benefit: 1.50x faster (33.2% improvement)\n", "\n", "Key Insights for Vector Search Performance:\n", - "• GSI BHIVE indexes provide significant performance improvements for vector similarity search\n", + "• Hyperscale indexes provide significant performance improvements for vector similarity search\n", "• Performance gains are most dramatic for complex semantic queries\n", - "• BHIVE optimization is particularly effective for high-dimensional embeddings\n", - "• Combined with proper quantization (SQ8), GSI delivers production-ready performance\n", + "• Hyperscale optimization is particularly effective for high-dimensional embeddings\n", + "• Combined with proper quantization (SQ8), Hyperscale delivers production-ready performance\n", "• These performance improvements directly benefit any application using the vector store\n" ] } @@ -1250,8 +1242,8 @@ "print(\"VECTOR SEARCH PERFORMANCE OPTIMIZATION SUMMARY\")\n", "print(\"=\"*80)\n", "\n", - "print(f\"Phase 1 - Baseline Search (No GSI): {baseline_time:.4f} seconds\")\n", - "print(f\"Phase 2 - GSI-Optimized Search: {gsi_search_time:.4f} seconds\")\n", + "print(f\"Phase 1 - Baseline Search (No Hyperscale): {baseline_time:.4f} seconds\")\n", + "print(f\"Phase 2 - Hyperscale-Optimized Search: {hyperscale_search_time:.4f} seconds\")\n", "if cache_time_1 and cache_time_2:\n", " print(f\"Phase 3 - Cache Benefits:\")\n", " print(f\" First execution (cache miss): {cache_time_1:.4f} seconds\")\n", @@ -1261,12 +1253,12 @@ "print(\"VECTOR SEARCH OPTIMIZATION IMPACT:\")\n", "print(\"-\"*80)\n", "\n", - "# GSI improvement analysis\n", - "if baseline_time and gsi_search_time:\n", - " speedup = baseline_time / gsi_search_time if gsi_search_time > 0 else float('inf')\n", - " time_saved = baseline_time - gsi_search_time\n", + "# Hyperscale improvement analysis\n", + "if baseline_time and hyperscale_search_time:\n", + " speedup = baseline_time / hyperscale_search_time if hyperscale_search_time > 0 else float('inf')\n", + " time_saved = baseline_time - hyperscale_search_time\n", " percent_improvement = (time_saved / baseline_time) * 100\n", - " print(f\"GSI Index Benefit: {speedup:.2f}x faster ({percent_improvement:.1f}% improvement)\")\n", + " print(f\"Hyperscale Index Benefit: {speedup:.2f}x faster ({percent_improvement:.1f}% improvement)\")\n", "\n", "# Cache improvement analysis\n", "if cache_time_1 and cache_time_2 and cache_time_2 < 
cache_time_1:\n", @@ -1277,10 +1269,10 @@ " print(f\"Cache Benefit: Variable (depends on query complexity and caching mechanism)\")\n", "\n", "print(f\"\\nKey Insights for Vector Search Performance:\")\n", - "print(f\"• GSI BHIVE indexes provide significant performance improvements for vector similarity search\")\n", + "print(f\"• Hyperscale indexes provide significant performance improvements for vector similarity search\")\n", "print(f\"• Performance gains are most dramatic for complex semantic queries\")\n", - "print(f\"• BHIVE optimization is particularly effective for high-dimensional embeddings\")\n", - "print(f\"• Combined with proper quantization (SQ8), GSI delivers production-ready performance\")\n", + "print(f\"• Hyperscale optimization is particularly effective for high-dimensional embeddings\")\n", + "print(f\"• Combined with proper quantization (SQ8), Hyperscale vector indexes deliver production-ready performance\")\n", "print(f\"• These performance improvements directly benefit any application using the vector store\")" ] }, @@ -1311,7 +1303,7 @@ "- **Writer Agent**: Takes research findings and creates polished, structured responses\n", "- **Collaborative Workflow**: Agents work together, with the writer building on the researcher's findings\n", "\n", - "This multi-agent approach produces higher-quality responses than single-agent systems by separating research and writing expertise, while benefiting from the GSI performance improvements we just demonstrated." + "This multi-agent approach produces higher-quality responses than single-agent systems by separating research and writing expertise, while benefiting from the Hyperscale vector index performance improvements we just demonstrated." ] }, { @@ -1324,20 +1316,20 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "c7b379d0", "metadata": {}, "outputs": [], "source": [ - "# Define the GSI vector search tool using the @tool decorator\n", - "@tool(\"gsi_vector_search\")\n", + "# Define the Hyperscale vector search tool using the @tool decorator\n", + "@tool(\"hyperscale_vector_search\")\n", "def search_tool(query: str) -> str:\n", - " \"\"\"Search for relevant documents using GSI vector similarity.\n", + " \"\"\"Search for relevant documents using Hyperscale vector similarity.\n", " Input should be a simple text query string.\n", - " Returns a list of relevant document contents from GSI vector search.\n", - " Use this tool to find detailed information about topics using high-performance GSI indexes.\"\"\"\n", + " Returns a list of relevant document contents from Hyperscale vector search.\n", + " Use this tool to find detailed information about topics using high-performance Hyperscale indexes.\"\"\"\n", " \n", - " # Invoke the GSI vector retriever (now optimized with BHIVE index)\n", + " # Invoke the Hyperscale vector retriever (now optimized with HYPERSCALE index)\n", " docs = retriever.invoke(query)\n", "\n", " # Format the results with distance information\n", @@ -1358,7 +1350,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "73c44437", "metadata": {}, "outputs": [ @@ -1366,7 +1358,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "CrewAI agents created successfully with optimized GSI vector search\n" + "CrewAI agents created successfully with optimized Hyperscale vector search\n" ] } ], @@ -1403,7 +1395,7 @@ " allow_delegation=False\n", ")\n", "\n", - "print(\"CrewAI agents created successfully with optimized GSI vector search\")" + 
"print(\"CrewAI agents created successfully with optimized Hyperscale vector search\")" ] }, { @@ -1421,7 +1413,7 @@ "source": [ "The complete optimized RAG process:\n", "1. **User Query** → Research Agent\n", - "2. **Vector Search** → GSI BHIVE index finds similar documents (now with proven performance improvements)\n", + "2. **Vector Search** → Hyperscale index finds similar documents (now with proven performance improvements)\n", "3. **Document Analysis** → Research Agent analyzes and synthesizes findings\n", "4. **Response Writing** → Writer Agent creates polished, structured response\n", "5. **Final Output** → User receives comprehensive, well-formatted answer\n", @@ -1442,7 +1434,7 @@ "id": "3e7d956a", "metadata": {}, "source": [ - "Now let's demonstrate the complete optimized agent-based RAG system in action, benefiting from the GSI performance improvements we validated earlier." + "Now let's demonstrate the complete optimized agent-based RAG system in action, benefiting from the Hyperscale vector index performance improvements we validated earlier." ] }, { @@ -1455,13 +1447,13 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "2176b29d", "metadata": {}, "outputs": [], "source": [ "def process_interactive_query(query, researcher, writer):\n", - " \"\"\"Run complete RAG workflow with CrewAI agents using optimized GSI vector search\"\"\"\n", + " \"\"\"Run complete RAG workflow with CrewAI agents using optimized Hyperscale vector search\"\"\"\n", " print(f\"\\nProcessing Query: {query}\")\n", " print(\"=\" * 80)\n", " \n", @@ -1545,7 +1537,7 @@ "id": "82ad950f", "metadata": {}, "source": [ - "You have successfully built a powerful agent-based RAG system that combines Couchbase's high-performance GSI vector storage capabilities with CrewAI's multi-agent architecture. This tutorial demonstrated the complete pipeline from data ingestion to intelligent response generation, with real performance benchmarks showing the dramatic improvements GSI indexing provides." + "You have successfully built a powerful agent-based RAG system that combines Couchbase's high-performance Hyperscale and Composite vector storage capabilities with CrewAI's multi-agent architecture. This tutorial demonstrated the complete pipeline from data ingestion to intelligent response generation, with real performance benchmarks showing the dramatic improvements Hyperscale vector indexing provides." 
] } ], From ca0014a46e9808a82ca5b7cecbfd4d84328f7e7d Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Tue, 9 Dec 2025 08:58:28 +0530 Subject: [PATCH 03/13] updated search vector index terminology and udpated frontmatter --- .../RAG_with_Couchbase_and_CrewAI.ipynb | 16 -- crewai/query_based/frontmatter.md | 4 +- .../RAG_with_Couchbase_and_CrewAI.ipynb | 179 +++++++++--------- crewai/search_based/frontmatter.md | 8 +- 4 files changed, 97 insertions(+), 110 deletions(-) diff --git a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb index be40794b..fea49136 100644 --- a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb +++ b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb @@ -1,21 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "82d610e0", - "metadata": {}, - "source": [ - "## Overview" - ] - }, - { - "cell_type": "markdown", - "id": "7e91202c", - "metadata": {}, - "source": [ - "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial uses Couchbase's **Hyperscale and Composite Vector Index** capabilities, which offer high-performance vector search optimized for large-scale applications. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/)" - ] - }, { "cell_type": "markdown", "id": "255f3178", diff --git a/crewai/query_based/frontmatter.md b/crewai/query_based/frontmatter.md index eb914cea..9ea01d76 100644 --- a/crewai/query_based/frontmatter.md +++ b/crewai/query_based/frontmatter.md @@ -4,9 +4,7 @@ path: "/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index" title: Retrieval-Augmented Generation (RAG) with CrewAI using Couchbase Hyperscale and Composite Vector Index short_title: RAG with CrewAI using Couchbase Hyperscale and Composite Vector Index description: - - Learn how to build a semantic search engine using Couchbase and CrewAI. - - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with CrewAI's agent-based approach. - - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain, CrewAI, Couchbase Hyperscale and Composite Vector Index. +In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. 
This tutorial uses Couchbase's **Hyperscale and Composite Vector Index** capabilities, which offer high-performance vector search optimized for large-scale applications. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/) content_type: tutorial filter: sdk technology: diff --git a/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb index ad9c4e61..ef08c6c0 100644 --- a/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb +++ b/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb @@ -4,21 +4,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Introduction\n", + "## How to Run This Tutorial\n", "\n", - "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the GSI index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-global-secondary-index)\n", - "\n", - "How to run this tutorial\n", - "----------------------\n", "This tutorial is available as a Jupyter Notebook (.ipynb file) that you can run \n", - "interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/crewai/fts/RAG_with_Couchbase_and_CrewAI.ipynb).\n", + "interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb).\n", "\n", "You can either:\n", "- Download the notebook file and run it on [Google Colab](https://colab.research.google.com)\n", "- Run it on your system by setting up the Python environment\n", "\n", - "Before you start\n", - "---------------\n", + "## Prerequisites\n", + "\n", + "### Couchbase Requirements\n", "\n", "1. Create and Deploy Your Free Tier Operational cluster on [Capella](https://cloud.couchbase.com/sign-up)\n", " - To get started with [Couchbase Capella](https://cloud.couchbase.com), create an account and use it to deploy \n", @@ -27,8 +24,9 @@ " about Capella with no time constraint\n", " - To learn more, please follow the [Getting Started Guide](https://docs.couchbase.com/cloud/get-started/create-account.html)\n", "\n", - "2. 
Couchbase Capella Configuration\n", - " When running Couchbase using Capella, the following prerequisites need to be met:\n", + "### Couchbase Capella Configuration\n", + "\n", + "When running Couchbase using Capella, the following prerequisites need to be met:\n", " - Create the database credentials to access the required bucket (Read and Write) used in the application\n", " - Allow access to the Cluster from the IP on which the application is running by following the [Network Security documentation](https://docs.couchbase.com/cloud/security/security.html#public-access)" ] @@ -37,16 +35,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Setting the Stage: Installing Necessary Libraries\n", + "## Setup and Installation\n", + "\n", + "### Installing Necessary Libraries\n", "\n", "We'll install the following key libraries:\n", "- `datasets`: For loading and managing our training data\n", - "- `langchain-couchbase`: To integrate Couchbase with LangChain for vector storage and caching\n", + "- `langchain-couchbase`: To integrate Couchbase with LangChain for Search Vector Index storage and caching\n", "- `langchain-openai`: For accessing OpenAI's embedding and chat models\n", "- `crewai`: To create and orchestrate our AI agents for RAG operations\n", "- `python-dotenv`: For securely managing environment variables and API keys\n", "\n", - "These libraries provide the foundation for building a semantic search engine with vector embeddings, \n", + "These libraries provide the foundation for building a semantic search engine with Search Vector Index embeddings, \n", "database integration, and agent-based RAG capabilities." ] }, @@ -71,7 +71,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Importing Necessary Libraries\n", + "### Import Required Modules\n", + "\n", "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading." ] }, @@ -110,7 +111,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Setup Logging\n", + "### Configure Logging\n", + "\n", "Logging is configured to track the progress of the script and capture any errors or warnings." ] }, @@ -134,7 +136,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Loading Sensitive Information\n", + "### Load Environment Configuration\n", + "\n", "In this section, we prompt the user to input essential configuration settings needed. These settings include sensitive information like database credentials, and specific configuration names. Instead of hardcoding these details into the script, we request the user to provide them at runtime, ensuring flexibility and security.\n", "\n", "The script uses environment variables to store sensitive information, enhancing the overall security and maintainability of your code by avoiding hardcoded values." @@ -177,7 +180,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Connecting to the Couchbase Cluster\n", + "## Couchbase Connection Setup\n", + "\n", + "### Connect to Cluster\n", + "\n", "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. 
This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount." ] }, @@ -211,13 +217,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Verifying Search Service Availability\n", - " In this section, we verify that the Couchbase Search (FTS) service is available and responding correctly. This is a crucial check because our vector search functionality depends on it. If any issues are detected with the Search service, the function will raise an exception, allowing us to catch and handle problems early before attempting vector operations.\n" + "### Verify Search Service Availability\n", + "\n", + "In this section, we verify that the Couchbase Search service is available and responding correctly. This is a crucial check because our Search Vector Index functionality depends on it. If any issues are detected with the Search service, the function will raise an exception, allowing us to catch and handle problems early before attempting vector operations." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -248,7 +255,7 @@ " break\n", "\n", " if not search_available:\n", - " raise RuntimeError(\"Search/FTS service not found or not responding\")\n", + " raise RuntimeError(\"Search service not found or not responding\")\n", " \n", " print(\"Search service check passed successfully\")\n", " except Exception as e:\n", @@ -265,32 +272,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setting Up Collections in Couchbase\n", + "### Setup Collections\n", "\n", - "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", + "Create and configure Couchbase bucket, scope, and collection for storing our vector data.\n", "\n", - "1. Bucket Creation:\n", + "1. **Bucket Creation:**\n", " - Checks if specified bucket exists, creates it if not\n", " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", - " - Note: If you are using Capella, create a bucket manually called vector-search-testing(or any name you prefer) with the same properties.\n", + " - Note: If you are using Capella, create a bucket manually called vector-search-testing (or any name you prefer) with the same properties.\n", "\n", - "2. Scope Management: \n", + "2. **Scope Management:** \n", " - Verifies if requested scope exists within bucket\n", " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", "\n", - "3. Collection Setup:\n", + "3. **Collection Setup:**\n", " - Checks for collection existence within scope\n", " - Creates collection if it doesn't exist\n", " - Waits 2 seconds for collection to be ready\n", "\n", - "Additional Tasks:\n", + "**Additional Tasks:**\n", "- Creates primary index on collection for query performance\n", "- Clears any existing documents for clean state\n", "- Implements comprehensive error handling and logging\n", "\n", - "The function is called twice to set up:\n", - "1. Main collection for vector embeddings\n", - "2. Cache collection for storing results\n" + "The function is called to set up the main collection for vector embeddings." 
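For reference, a condensed sketch of what such a setup helper can look like with the Couchbase Python SDK is shown below. This is an illustrative sketch rather than the notebook's exact code, so the names, settings, and error handling are assumptions.

```python
import time
from datetime import timedelta

from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.exceptions import (
    BucketAlreadyExistsException,
    CollectionAlreadyExistsException,
    ScopeAlreadyExistsException,
)
from couchbase.management.buckets import CreateBucketSettings
from couchbase.options import ClusterOptions

# Placeholder connection details; in the notebook these come from environment variables.
auth = PasswordAuthenticator("Administrator", "password")
cluster = Cluster("couchbases://your-cluster.cloud.couchbase.com", ClusterOptions(auth))
cluster.wait_until_ready(timedelta(seconds=5))


def setup_collection(cluster, bucket_name, scope_name, collection_name):
    # 1. Create the bucket if it does not exist (Capella users create it in the UI instead).
    try:
        cluster.buckets().create_bucket(
            CreateBucketSettings(name=bucket_name, ram_quota_mb=1024, num_replicas=0)
        )
    except BucketAlreadyExistsException:
        pass

    bucket = cluster.bucket(bucket_name)
    collections = bucket.collections()

    # 2. Create the scope and collection if they are missing.
    try:
        collections.create_scope(scope_name)
    except ScopeAlreadyExistsException:
        pass
    try:
        # Older SDK versions use create_collection(CollectionSpec(...)) instead.
        collections.create_collection(scope_name, collection_name)
    except CollectionAlreadyExistsException:
        pass
    time.sleep(2)  # give the new collection a moment to become available

    # 3. Primary index for ad-hoc queries, then clear any leftover documents for a clean state.
    keyspace = f"`{bucket_name}`.`{scope_name}`.`{collection_name}`"
    cluster.query(f"CREATE PRIMARY INDEX IF NOT EXISTS ON {keyspace}").execute()
    cluster.query(f"DELETE FROM {keyspace}").execute()
    return bucket.scope(scope_name).collection(collection_name)


collection = setup_collection(cluster, "vector-search-testing", "shared", "crew")
```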
] }, { @@ -398,13 +403,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Configuring and Initializing Couchbase Vector Search Index for Semantic Document Retrieval\n", + "## Understanding Search Vector Index\n", + "\n", + "### Configuring and Initializing Couchbase Search Vector Index for Semantic Document Retrieval\n", "\n", - "Semantic search requires an efficient way to retrieve relevant documents based on a user's query. This is where the Couchbase Vector Search Index comes into play. In this step, we load the Vector Search Index definition from a JSON file, which specifies how the index should be structured. This includes the fields to be indexed, the dimensions of the vectors, and other parameters that determine how the search engine processes queries based on vector similarity.\n", + "Semantic search requires an efficient way to retrieve relevant documents based on a user's query. This is where the Couchbase Search Vector Index comes into play. In this step, we load the Search Vector Index definition from a JSON file, which specifies how the index should be structured. This includes the fields to be indexed, the dimensions of the vectors, and other parameters that determine how the search engine processes queries based on vector similarity.\n", "\n", - "This CrewAI vector search index configuration requires specific default settings to function properly. This tutorial uses the bucket named `vector-search-testing` with the scope `shared` and collection `crew`. The configuration is set up for vectors with exactly `1536 dimensions`, using `dot product` similarity and optimized for `recall`. If you want to use a different bucket, scope, or collection, you will need to modify the index configuration accordingly.\n", + "This CrewAI Search Vector Index configuration requires specific default settings to function properly. This tutorial uses the bucket named `vector-search-testing` with the scope `shared` and collection `crew`. The configuration is set up for vectors with exactly `1536 dimensions`, using `dot product` similarity and optimized for `recall`. If you want to use a different bucket, scope, or collection, you will need to modify the index configuration accordingly.\n", "\n", - "For more information on creating a vector search index, please follow the instructions at [Couchbase Vector Search Documentation](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html)." + "For more information on creating a Search Vector Index, please follow the instructions at [Couchbase Vector Search Documentation](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html)." ] }, { @@ -432,9 +439,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Creating or Updating Search Indexes\n", + "### Creating or Updating Search Vector Index\n", "\n", - "With the index definition loaded, the next step is to create or update the **Vector Search Index** in Couchbase. This step is crucial because it optimizes our database for vector similarity search operations, allowing us to perform searches based on the semantic content of documents rather than just keywords. By creating or updating a Vector Search Index, we enable our search engine to handle complex queries that involve finding semantically similar documents using vector embeddings, which is essential for a robust semantic search engine." + "With the index definition loaded, the next step is to create or update the **Search Vector Index** in Couchbase. 
This step is crucial because it optimizes our database for vector similarity search operations, allowing us to perform searches based on the semantic content of documents rather than just keywords. By creating or updating a Search Vector Index, we enable our search engine to handle complex queries that involve finding semantically similar documents using vector embeddings, which is essential for a robust semantic search engine." ] }, { @@ -484,17 +491,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setting Up OpenAI Components\n", + "## OpenAI Configuration\n", "\n", "This section initializes two key OpenAI components needed for our RAG system:\n", "\n", - "1. OpenAI Embeddings:\n", + "1. **OpenAI Embeddings:**\n", " - Uses the 'text-embedding-3-small' model\n", " - Converts text into high-dimensional vector representations (embeddings)\n", " - These embeddings enable semantic search by capturing the meaning of text\n", " - Required for vector similarity search in Couchbase\n", "\n", - "2. ChatOpenAI Language Model:\n", + "2. **ChatOpenAI Language Model:**\n", " - Uses the 'gpt-4o' model\n", " - Temperature set to 0.2 for balanced creativity and focus\n", " - Serves as the cognitive engine for CrewAI agents\n", @@ -504,13 +511,10 @@ " - Generate thoughtful responses based on that context\n", " - Follow instructions defined in agent roles and goals\n", " - Collaborate with other agents in the crew\n", - " - The relatively low temperature (0.2) ensures agents produce reliable,\n", - " consistent outputs while maintaining some creative problem-solving ability\n", + " - The relatively low temperature (0.2) ensures agents produce reliable, consistent outputs while maintaining some creative problem-solving ability\n", "\n", "Both components require a valid OpenAI API key (OPENAI_API_KEY) for authentication.\n", - "In the CrewAI framework, the LLM acts as the \"brain\" for each agent, allowing them\n", - "to interpret tasks, retrieve relevant information via the RAG system, and generate\n", - "appropriate outputs based on their specialized roles and expertise." + "In the CrewAI framework, the LLM acts as the \"brain\" for each agent, allowing them to interpret tasks, retrieve relevant information via the RAG system, and generate appropriate outputs based on their specialized roles and expertise." ] }, { @@ -546,8 +550,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Setting Up the Couchbase Vector Store\n", - "A vector store is where we'll keep our embeddings. Unlike the FTS index, which is used for text-based search, the vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the vector store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." + "## Document Processing and Vector Store Setup\n", + "\n", + "### Create Couchbase Search Vector Store\n", + "\n", + "A vector store is where we'll keep our embeddings. Unlike traditional text-based search, the Search Vector Store is specifically designed to handle embeddings and perform similarity searches. 
When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the Search Vector Store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." ] }, { @@ -580,7 +587,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Load the BBC News Dataset\n", + "### Load BBC News Dataset\n", + "\n", "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", "\n", "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." @@ -621,7 +629,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Cleaning up the Data\n", + "#### Data Cleaning\n", + "\n", "We will use the content of the news articles for our RAG system.\n", "\n", "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." @@ -654,25 +663,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Saving Data to the Vector Store\n", + "#### Save Data to Vector Store\n", + "\n", "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", "\n", "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", "\n", "This approach offers several benefits:\n", - "1. Memory Efficiency: Processing in smaller batches prevents memory overload\n", - "2. Error Handling: If an error occurs, only the current batch is affected\n", - "3. Progress Tracking: Easier to monitor and track the ingestion progress\n", - "4. Resource Management: Better control over CPU and network resource utilization\n", - "\n", - "We use a conservative batch size of 50 to ensure reliable operation.\n", - "The optimal batch size depends on many factors including:\n", - "- Document sizes being inserted\n", - "- Available system resources\n", - "- Network conditions\n", - "- Concurrent workload\n", - "\n", - "Consider measuring performance with your specific workload before adjusting.\n" + "1. **Memory Efficiency**: Processing in smaller batches prevents memory overload\n", + "2. **Error Handling**: If an error occurs, only the current batch is affected\n", + "3. 
**Progress Tracking**: Easier to monitor and track the ingestion progress\n", + "4. **Resource Management**: Better control over CPU and network resource utilization\n", + "\n", + "We use a conservative batch size of 50 to ensure reliable operation. The optimal batch size depends on many factors including document sizes, available system resources, network conditions, and concurrent workload." ] }, { @@ -708,20 +711,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Creating a Vector Search Tool\n", + "## CrewAI Agent Setup\n", + "\n", + "### Create Vector Search Tool\n", + "\n", "After loading our data into the vector store, we need to create a tool that can efficiently search through these vector embeddings. This involves two key components:\n", "\n", - "### Vector Retriever\n", + "**Vector Retriever:**\n", "The vector retriever is configured to perform similarity searches. This creates a retriever that performs semantic similarity searches against our vector database. The similarity search finds documents whose vector embeddings are closest to the query's embedding in the vector space.\n", "\n", - "### Search Tool\n", + "**Search Tool:**\n", "The search tool wraps the retriever in a user-friendly interface that:\n", "- Takes a query string as input\n", "- Passes the query to the retriever to find relevant documents\n", "- Formats the results with clear document separation using document numbers and dividers\n", "- Returns the formatted results as a single string with each document clearly delineated\n", "\n", - "The tool is designed to integrate seamlessly with our AI agents, providing them with reliable access to our knowledge base through vector similarity search. The lambda function in the tool handles both direct string queries and structured query objects, ensuring flexibility in how the tool can be invoked.\n" + "The tool is designed to integrate seamlessly with our AI agents, providing them with reliable access to our knowledge base through vector similarity search." ] }, { @@ -762,19 +768,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Creating CrewAI Agents\n", + "### Create CrewAI Agents\n", "\n", "We'll create two specialized AI agents using the CrewAI framework to handle different aspects of our information retrieval and analysis system:\n", "\n", - "## Research Expert Agent\n", + "**Research Expert Agent:**\n", "This agent is designed to:\n", - "- Execute semantic searches using our vector store\n", + "- Execute semantic searches using our Search Vector Store\n", "- Analyze and evaluate search results \n", "- Identify key information and insights\n", "- Verify facts across multiple sources\n", "- Synthesize findings into comprehensive research summaries\n", "\n", - "## Technical Writer Agent \n", + "**Technical Writer Agent:**\n", "This agent is responsible for:\n", "- Taking research findings and structuring them logically\n", "- Converting technical concepts into clear explanations\n", @@ -782,6 +788,7 @@ "- Maintaining engaging yet informative tone\n", "- Producing well-formatted final outputs\n", "\n", + "**Agent Workflow:**\n", "The agents work together in a coordinated way:\n", "1. Research agent finds and analyzes relevant documents\n", "2. 
Writer agent takes those findings and crafts polished responses\n", @@ -869,15 +876,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## How CrewAI Agents Work in this RAG System\n", - "\n", - "### Agent-Based RAG Architecture\n", + "### How the RAG Workflow Works\n", "\n", "This system uses a two-agent approach to implement Retrieval-Augmented Generation (RAG):\n", "\n", "1. **Research Expert Agent**:\n", " - Receives the user query\n", - " - Uses the vector search tool to retrieve relevant documents from Couchbase\n", + " - Uses the vector search tool to retrieve relevant documents from Couchbase Search Vector Store\n", " - Analyzes and synthesizes information from retrieved documents\n", " - Produces a comprehensive research summary with key findings\n", "\n", @@ -887,41 +892,42 @@ " - Creates a polished, user-friendly response\n", " - Ensures proper attribution and citation\n", "\n", - "#### How the Process Works:\n", + "**The Complete RAG Process:**\n", "\n", "1. **Query Processing**: User query is passed to the Research Agent\n", "2. **Vector Search**: Query is converted to embeddings and matched against document vectors\n", - "3. **Document Retrieval**: Most similar documents are retrieved from Couchbase\n", + "3. **Document Retrieval**: Most similar documents are retrieved from Couchbase Search Vector Store\n", "4. **Analysis**: Research Agent analyzes documents for relevance and extracts key information\n", "5. **Synthesis**: Research Agent combines findings into a coherent summary\n", "6. **Refinement**: Writer Agent restructures and enhances the content\n", "7. **Response Generation**: Final polished response is returned to the user\n", "\n", - "This multi-agent approach separates concerns (research vs. writing) and leverages\n", - "specialized expertise for each task, resulting in higher quality responses.\n" + "This multi-agent approach separates concerns (research vs. writing) and leverages specialized expertise for each task, resulting in higher quality responses." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Testing the Search System\n", + "## CrewAI Agent Demo\n", "\n", - "Test the system with some example queries." + "### Demo Function\n", + "\n", + "Test the system with example queries." ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process_query(query, researcher, writer):\n", " \"\"\"\n", - " Test the complete RAG system with a user query.\n", + " Run complete RAG workflow with CrewAI agents using Search Vector Store.\n", " \n", " This function tests both the vector search capability and the agent-based processing:\n", - " 1. Vector search: Retrieves relevant documents from Couchbase\n", + " 1. Vector search: Retrieves relevant documents from Couchbase Search Vector Store\n", " 2. Agent processing: Uses CrewAI agents to analyze and format the response\n", " \n", " The function measures performance and displays detailed outputs from each step.\n", @@ -1428,10 +1434,11 @@ "metadata": {}, "source": [ "## Conclusion\n", - "By following these steps, you've built a powerful RAG system that combines Couchbase's vector storage capabilities with CrewAI's agent-based architecture. 
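Before wrapping up, here is a compact sketch of how the pieces described in this notebook (retriever, search tool, agents, tasks, and crew) might be wired together. It is illustrative rather than the notebook's exact cells: the `k=4` retriever setting, the tool name, the task prompts, and the backstories are assumptions, and the exact way a tool or LLM is attached to an `Agent` can vary between CrewAI versions.

```python
from crewai import Agent, Task, Crew
from crewai.tools import tool

# Retriever over the Couchbase Search vector store (k=4 is an illustrative choice)
retriever = vector_store.as_retriever(search_kwargs={"k": 4})

@tool("couchbase_vector_search")
def couchbase_vector_search(query: str) -> str:
    """Return the documents most semantically similar to the query."""
    docs = retriever.invoke(query)
    return "\n\n".join(
        f"Document {i + 1}:\n{doc.page_content}\n{'-' * 40}" for i, doc in enumerate(docs)
    )

researcher = Agent(
    role="Research Expert",
    goal="Find and analyze the most relevant documents for the user's question",
    backstory="A meticulous analyst who verifies facts across sources before summarizing.",
    llm=llm,  # the ChatOpenAI model configured earlier
    tools=[couchbase_vector_search],
    verbose=True,
)

writer = Agent(
    role="Technical Writer",
    goal="Turn research findings into a clear, well-structured answer",
    backstory="A writer who explains technical topics accurately and engagingly.",
    llm=llm,
    verbose=True,
)

def answer(query: str) -> str:
    research_task = Task(
        description=f"Research the following question using the vector search tool: {query}",
        expected_output="A synthesized summary of key findings with sources noted.",
        agent=researcher,
    )
    writing_task = Task(
        description="Rewrite the research summary as a polished answer for the end user.",
        expected_output="A clear, well-structured response grounded in the research.",
        agent=writer,
    )
    crew = Crew(agents=[researcher, writer], tasks=[research_task, writing_task], verbose=True)
    return str(crew.kickoff())
```

Calling `answer("What were the major technology stories in December 2024?")` would run the full retrieve, research, and write pipeline described above.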
This multi-agent approach separates research and writing concerns, resulting in higher quality responses to user queries.\n", + "\n", + "By following these steps, you've built a powerful RAG system that combines Couchbase's Search Vector Index storage capabilities with CrewAI's agent-based architecture. This multi-agent approach separates research and writing concerns, resulting in higher quality responses to user queries.\n", "\n", "The system demonstrates several key advantages:\n", - "1. Efficient vector search using Couchbase's vector store\n", + "1. Efficient vector search using Couchbase's Search Vector Index\n", "2. Specialized AI agents that focus on different aspects of the RAG pipeline\n", "3. Collaborative workflow between agents to produce comprehensive, well-structured responses\n", "4. Scalable architecture that can be extended with additional agents for more complex tasks\n", diff --git a/crewai/search_based/frontmatter.md b/crewai/search_based/frontmatter.md index 0a447a37..0065eb9b 100644 --- a/crewai/search_based/frontmatter.md +++ b/crewai/search_based/frontmatter.md @@ -1,12 +1,10 @@ --- # frontmatter path: "/tutorial-crewai-couchbase-rag-with-search-vector-index" -title: Retrieval-Augmented Generation (RAG) with CrewAI using Couchbase Search Vector Index -short_title: RAG with CrewAI using Couchbase Search Vector Index +title: Retrieval-Augmented Generation (RAG) with CrewAI and Couchbase Search Vector Index +short_title: RAG with CrewAI and Couchbase Search Vector Index description: - - Learn how to build a semantic search engine using Couchbase and CrewAI. - - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with CrewAI's agent-based approach. - - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain, CrewAI, and Couchbase. +In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. 
Alternatively if you want to perform semantic search using Hyperscale and Composite Vector Indexes, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index/) content_type: tutorial filter: sdk technology: From 58e89ecfe2371c4c55ba8a5d7959d4b182d818ff Mon Sep 17 00:00:00 2001 From: Giriraj Singh Date: Tue, 9 Dec 2025 09:03:18 +0530 Subject: [PATCH 04/13] Update crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb fixed a comment Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb index fea49136..bcfbae16 100644 --- a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb +++ b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb @@ -558,7 +558,7 @@ "\n", "# Vector configuration\n", "DISTANCE_STRATEGY = DistanceStrategy.COSINE # Cosine similarity\n", - "INDEX_TYPE = IndexType.HYPERSCALE # Using HYPERSCALE for high-performance vector \n", + "INDEX_TYPE = IndexType.HYPERSCALE # Using HYPERSCALE for high-performance vector search\n", "INDEX_DESCRIPTION = \"IVF,SQ8\" # Auto-selected centroids with 8-bit scalar quantization\n", "\n", "# To create a Composite Index instead, use the following:\n", From e04b6026b8157b7ce232db342ca0f24ba6dc5c76 Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Tue, 9 Dec 2025 09:09:19 +0530 Subject: [PATCH 05/13] fixed frontmatter syntax --- crewai/query_based/frontmatter.md | 4 ++-- crewai/search_based/frontmatter.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crewai/query_based/frontmatter.md b/crewai/query_based/frontmatter.md index 9ea01d76..94fb0cd5 100644 --- a/crewai/query_based/frontmatter.md +++ b/crewai/query_based/frontmatter.md @@ -3,8 +3,8 @@ path: "/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index" title: Retrieval-Augmented Generation (RAG) with CrewAI using Couchbase Hyperscale and Composite Vector Index short_title: RAG with CrewAI using Couchbase Hyperscale and Composite Vector Index -description: -In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial uses Couchbase's **Hyperscale and Composite Vector Index** capabilities, which offer high-performance vector search optimized for large-scale applications. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. 
Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/) +description: | + In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial uses Couchbase's **Hyperscale and Composite Vector Index** capabilities, which offer high-performance vector search optimized for large-scale applications. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/) content_type: tutorial filter: sdk technology: diff --git a/crewai/search_based/frontmatter.md b/crewai/search_based/frontmatter.md index 0065eb9b..ec9ce204 100644 --- a/crewai/search_based/frontmatter.md +++ b/crewai/search_based/frontmatter.md @@ -3,8 +3,8 @@ path: "/tutorial-crewai-couchbase-rag-with-search-vector-index" title: Retrieval-Augmented Generation (RAG) with CrewAI and Couchbase Search Vector Index short_title: RAG with CrewAI and Couchbase Search Vector Index -description: -In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using Hyperscale and Composite Vector Indexes, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index/) +description: | + In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. 
Alternatively if you want to perform semantic search using Hyperscale and Composite Vector Indexes, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index/) content_type: tutorial filter: sdk technology: From 432879091f4af37f7f386576b206e9cdae938b8b Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Tue, 9 Dec 2025 09:16:48 +0530 Subject: [PATCH 06/13] added bullets in frontmatter --- crewai/query_based/frontmatter.md | 5 +++-- crewai/search_based/frontmatter.md | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/crewai/query_based/frontmatter.md b/crewai/query_based/frontmatter.md index 94fb0cd5..56223cff 100644 --- a/crewai/query_based/frontmatter.md +++ b/crewai/query_based/frontmatter.md @@ -3,8 +3,9 @@ path: "/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index" title: Retrieval-Augmented Generation (RAG) with CrewAI using Couchbase Hyperscale and Composite Vector Index short_title: RAG with CrewAI using Couchbase Hyperscale and Composite Vector Index -description: | - In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial uses Couchbase's **Hyperscale and Composite Vector Index** capabilities, which offer high-performance vector search optimized for large-scale applications. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/) +description: + - In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. + - This tutorial uses Couchbase's **Hyperscale and Composite Vector Index** capabilities, which offer high-performance vector search optimized for large-scale applications. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. 
Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/) content_type: tutorial filter: sdk technology: diff --git a/crewai/search_based/frontmatter.md b/crewai/search_based/frontmatter.md index ec9ce204..cda0e322 100644 --- a/crewai/search_based/frontmatter.md +++ b/crewai/search_based/frontmatter.md @@ -3,8 +3,9 @@ path: "/tutorial-crewai-couchbase-rag-with-search-vector-index" title: Retrieval-Augmented Generation (RAG) with CrewAI and Couchbase Search Vector Index short_title: RAG with CrewAI and Couchbase Search Vector Index -description: | - In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using Hyperscale and Composite Vector Indexes, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index/) +description: + - In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. + - This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using Hyperscale and Composite Vector Indexes, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index/) content_type: tutorial filter: sdk technology: From 189ab346c9419e58591c122b5769d631de4ef063 Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Tue, 9 Dec 2025 09:21:21 +0530 Subject: [PATCH 07/13] updated description for 200 length --- crewai/query_based/frontmatter.md | 7 +++++-- crewai/search_based/frontmatter.md | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/crewai/query_based/frontmatter.md b/crewai/query_based/frontmatter.md index 56223cff..c6cf053b 100644 --- a/crewai/query_based/frontmatter.md +++ b/crewai/query_based/frontmatter.md @@ -4,8 +4,11 @@ path: "/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index" title: Retrieval-Augmented Generation (RAG) with CrewAI using Couchbase Hyperscale and Composite Vector Index short_title: RAG with CrewAI using Couchbase Hyperscale and Composite Vector Index description: - - In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. 
CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. - - This tutorial uses Couchbase's **Hyperscale and Composite Vector Index** capabilities, which offer high-performance vector search optimized for large-scale applications. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/) + - Learn to build a semantic search engine using [Couchbase](https://www.couchbase.com) and agent-based RAG workflows powered by [CrewAI](https://github.com/crewAIInc/crewAI). + - Explore Couchbase **Hyperscale** and **Composite Vector Indexes** for high-performance vector search, including pure vector and filtered similarity queries. + - Follow beginner-friendly, step-by-step instructions to build a fully functional semantic search system from scratch. + - For RAG using the Search Vector Index, see this guide: https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/ + content_type: tutorial filter: sdk technology: diff --git a/crewai/search_based/frontmatter.md b/crewai/search_based/frontmatter.md index cda0e322..9ae580a1 100644 --- a/crewai/search_based/frontmatter.md +++ b/crewai/search_based/frontmatter.md @@ -4,8 +4,11 @@ path: "/tutorial-crewai-couchbase-rag-with-search-vector-index" title: Retrieval-Augmented Generation (RAG) with CrewAI and Couchbase Search Vector Index short_title: RAG with CrewAI and Couchbase Search Vector Index description: - - In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. - - This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using Hyperscale and Composite Vector Indexes, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index/) +description: + - Learn to build a semantic search engine using [Couchbase](https://www.couchbase.com) and agent-based RAG workflows powered by [CrewAI](https://github.com/crewAIInc/crewAI). + - Use CrewAI to create specialized agents that collaborate across the RAG pipeline, from retrieving documents to generating responses. + - Follow beginner-friendly, step-by-step instructions to build a complete semantic search system from scratch. 
+ - For RAG using Hyperscale or Composite Vector Indexes, see: https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index/ content_type: tutorial filter: sdk technology: From 3b59fa4c2ec7a8b079a615f1949ca56fcff10c06 Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Tue, 9 Dec 2025 09:22:51 +0530 Subject: [PATCH 08/13] updated title length --- crewai/query_based/frontmatter.md | 2 +- crewai/search_based/frontmatter.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crewai/query_based/frontmatter.md b/crewai/query_based/frontmatter.md index c6cf053b..14c796f9 100644 --- a/crewai/query_based/frontmatter.md +++ b/crewai/query_based/frontmatter.md @@ -1,7 +1,7 @@ --- # frontmatter path: "/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index" -title: Retrieval-Augmented Generation (RAG) with CrewAI using Couchbase Hyperscale and Composite Vector Index +title: RAG with CrewAI using Couchbase Hyperscale and Composite Vector Index short_title: RAG with CrewAI using Couchbase Hyperscale and Composite Vector Index description: - Learn to build a semantic search engine using [Couchbase](https://www.couchbase.com) and agent-based RAG workflows powered by [CrewAI](https://github.com/crewAIInc/crewAI). diff --git a/crewai/search_based/frontmatter.md b/crewai/search_based/frontmatter.md index 9ae580a1..6fcbab42 100644 --- a/crewai/search_based/frontmatter.md +++ b/crewai/search_based/frontmatter.md @@ -1,7 +1,7 @@ --- # frontmatter path: "/tutorial-crewai-couchbase-rag-with-search-vector-index" -title: Retrieval-Augmented Generation (RAG) with CrewAI and Couchbase Search Vector Index +title: RAG with CrewAI and Couchbase Search Vector Index short_title: RAG with CrewAI and Couchbase Search Vector Index description: description: From bfe48784d364011769289d9d762b8addd1a99bab Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Tue, 9 Dec 2025 09:24:54 +0530 Subject: [PATCH 09/13] removed extra descr field --- crewai/search_based/frontmatter.md | 1 - 1 file changed, 1 deletion(-) diff --git a/crewai/search_based/frontmatter.md b/crewai/search_based/frontmatter.md index 6fcbab42..341aa960 100644 --- a/crewai/search_based/frontmatter.md +++ b/crewai/search_based/frontmatter.md @@ -3,7 +3,6 @@ path: "/tutorial-crewai-couchbase-rag-with-search-vector-index" title: RAG with CrewAI and Couchbase Search Vector Index short_title: RAG with CrewAI and Couchbase Search Vector Index -description: description: - Learn to build a semantic search engine using [Couchbase](https://www.couchbase.com) and agent-based RAG workflows powered by [CrewAI](https://github.com/crewAIInc/crewAI). - Use CrewAI to create specialized agents that collaborate across the RAG pipeline, from retrieving documents to generating responses. 
From 2d89e11769f05685b21000d120b08210a1b1f497 Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Thu, 11 Dec 2025 13:15:20 +0530 Subject: [PATCH 10/13] added Introduction and fixed frontmatter with the received comments --- crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb | 9 +++++++++ crewai/query_based/frontmatter.md | 3 +-- crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb | 8 ++++++++ crewai/search_based/frontmatter.md | 1 - 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb index bcfbae16..8791011f 100644 --- a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb +++ b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "e16d35e8", + "metadata": {}, + "source": [ + "## Introduction\n", + "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial uses Couchbase's **Hyperscale Index** vector search capabilities, which offer high-performance vector search optimized for large-scale applications. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the **Search Vector Index**, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/)" + ] + }, { "cell_type": "markdown", "id": "255f3178", diff --git a/crewai/query_based/frontmatter.md b/crewai/query_based/frontmatter.md index 14c796f9..139299ef 100644 --- a/crewai/query_based/frontmatter.md +++ b/crewai/query_based/frontmatter.md @@ -5,9 +5,8 @@ title: RAG with CrewAI using Couchbase Hyperscale and Composite Vector Index short_title: RAG with CrewAI using Couchbase Hyperscale and Composite Vector Index description: - Learn to build a semantic search engine using [Couchbase](https://www.couchbase.com) and agent-based RAG workflows powered by [CrewAI](https://github.com/crewAIInc/crewAI). - - Explore Couchbase **Hyperscale** and **Composite Vector Indexes** for high-performance vector search, including pure vector and filtered similarity queries. + - Explore Couchbase Hyperscale and Composite Vector Indexes for high-performance vector search, including pure vector and filtered similarity queries. - Follow beginner-friendly, step-by-step instructions to build a fully functional semantic search system from scratch. 
- - For RAG using the Search Vector Index, see this guide: https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/ content_type: tutorial filter: sdk diff --git a/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb index ef08c6c0..be3d7624 100644 --- a/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb +++ b/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the Hyperscale Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index)" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/crewai/search_based/frontmatter.md b/crewai/search_based/frontmatter.md index 341aa960..9ba219cf 100644 --- a/crewai/search_based/frontmatter.md +++ b/crewai/search_based/frontmatter.md @@ -7,7 +7,6 @@ description: - Learn to build a semantic search engine using [Couchbase](https://www.couchbase.com) and agent-based RAG workflows powered by [CrewAI](https://github.com/crewAIInc/crewAI). - Use CrewAI to create specialized agents that collaborate across the RAG pipeline, from retrieving documents to generating responses. - Follow beginner-friendly, step-by-step instructions to build a complete semantic search system from scratch. 
- - For RAG using Hyperscale or Composite Vector Indexes, see: https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index/ content_type: tutorial filter: sdk technology: From f2d82d8d03c0b3dc6fd778c342523a2d4c3a585e Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Thu, 11 Dec 2025 13:30:00 +0530 Subject: [PATCH 11/13] added composite vector index --- .../RAG_with_Couchbase_and_CrewAI.ipynb | 2 +- .../RAG_with_Couchbase_and_CrewAI.ipynb | 2 +- openrouter-deepseek/fts/frontmatter.md | 23 - ...th_Couchbase_and_Openrouter_Deepseek.ipynb | 1089 ----------------- openrouter-deepseek/gsi/frontmatter.md | 23 - .../{gsi => query_based}/.env.sample | 0 ...th_Couchbase_and_Openrouter_Deepseek.ipynb | 1081 ++++++++++++++++ .../query_based/frontmatter.md | 26 + .../{fts => search_based}/.env.sample | 0 ...th_Couchbase_and_Openrouter_Deepseek.ipynb | 160 +-- .../{fts => search_based}/deepseek_index.json | 0 .../search_based/frontmatter.md | 23 + 12 files changed, 1217 insertions(+), 1212 deletions(-) delete mode 100644 openrouter-deepseek/fts/frontmatter.md delete mode 100644 openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb delete mode 100644 openrouter-deepseek/gsi/frontmatter.md rename openrouter-deepseek/{gsi => query_based}/.env.sample (100%) create mode 100644 openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb create mode 100644 openrouter-deepseek/query_based/frontmatter.md rename openrouter-deepseek/{fts => search_based}/.env.sample (100%) rename openrouter-deepseek/{fts => search_based}/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb (93%) rename openrouter-deepseek/{fts => search_based}/deepseek_index.json (100%) create mode 100644 openrouter-deepseek/search_based/frontmatter.md diff --git a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb index 8791011f..5247102b 100644 --- a/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb +++ b/crewai/query_based/RAG_with_Couchbase_and_CrewAI.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "source": [ "## Introduction\n", - "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial uses Couchbase's **Hyperscale Index** vector search capabilities, which offer high-performance vector search optimized for large-scale applications. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the **Search Vector Index**, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/)" + "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. 
This tutorial uses Couchbase's **Hyperscale or Composite Index** vector search capabilities, which offer high-performance vector search optimized for large-scale applications. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the **Search Vector Index**, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-search-vector-index/)" ] }, { diff --git a/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb b/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb index be3d7624..824d7f0e 100644 --- a/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb +++ b/crewai/search_based/RAG_with_Couchbase_and_CrewAI.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "## Introduction\n", - "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the Hyperscale Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index)" + "In this guide, we will walk you through building a powerful semantic search engine using [Couchbase](https://www.couchbase.com) as the backend database and [CrewAI](https://github.com/crewAIInc/crewAI) for agent-based RAG operations. CrewAI allows us to create specialized agents that can work together to handle different aspects of the RAG workflow, from document retrieval to response generation. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch. Alternatively if you want to perform semantic search using the **Hyperscale or Composite Vector Index**, please take a look at [this.](https://developer.couchbase.com/tutorial-crewai-couchbase-rag-with-hyperscale-or-composite-vector-index)" ] }, { diff --git a/openrouter-deepseek/fts/frontmatter.md b/openrouter-deepseek/fts/frontmatter.md deleted file mode 100644 index 3c33a3b9..00000000 --- a/openrouter-deepseek/fts/frontmatter.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -# frontmatter -path: "/tutorial-openrouter-deepseek-with-fts" -title: Retrieval-Augmented Generation with Couchbase and OpenRouter Deepseek using FTS service -short_title: RAG with Couchbase and OpenRouter Deepseek using FTS service -description: - - Learn how to build a semantic search engine using Couchbase and OpenRouter with Deepseek using FTS service. - - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with OpenRouter Deepseek as both embeddings and language model provider. - - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase. 
-content_type: tutorial -filter: sdk -technology: - - vector search -tags: - - FTS - - Artificial Intelligence - - LangChain - - Deepseek - - OpenRouter -sdk_language: - - python -length: 60 Mins ---- diff --git a/openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb b/openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb deleted file mode 100644 index cbde7f8c..00000000 --- a/openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb +++ /dev/null @@ -1,1089 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introduction \n", - "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Deepseek V3 as the language model provider (via OpenRouter or direct API)](https://deepseek.ai/) and OpenAI for embeddings. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using GSI( Global Secondary Index) from scratch. Alternatively if you want to perform semantic search using the FTS index, please take a look at [this.](https://developer.couchbase.com/tutorial-openrouter-deepseek-with-fts/)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# How to run this tutorial\n", - "\n", - "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb).\n", - "\n", - "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Before you start\n", - "\n", - "## Get Credentials for OpenRouter and Deepseek\n", - "* Sign up for an account at [OpenRouter](https://openrouter.ai/) to get your API key\n", - "* OpenRouter provides access to Deepseek models, so no separate Deepseek credentials are needed\n", - "* Store your OpenRouter API key securely as it will be used to access the models\n", - "* For [Deepseek](https://deepseek.ai/) models, you can use the default models provided by OpenRouter\n", - "\n", - "## Create and Deploy Your Free Tier Operational cluster on Capella\n", - "\n", - "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. 
This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", - "\n", - "To learn more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", - "\n", - "Note: To run this this tutorial, you will need Capella with Couchbase Server version 8.0 or above as GSI vector search is supported only from version 8.0\n", - "\n", - "### Couchbase Capella Configuration\n", - "\n", - "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n", - "\n", - "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the required bucket (Read and Write) used in the application.\n", - "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting the Stage: Installing Necessary Libraries\n", - "\n", - "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install --quiet datasets==3.5.0 langchain-couchbase==0.5.0 langchain-deepseek==0.1.3 langchain-openai==0.3.13 python-dotenv==1.1.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Importing Necessary Libraries\n", - "\n", - "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "import getpass\n", - "import json\n", - "import logging\n", - "import os\n", - "import time\n", - "from datetime import timedelta\n", - "\n", - "from couchbase.auth import PasswordAuthenticator\n", - "from couchbase.cluster import Cluster\n", - "from couchbase.exceptions import (CouchbaseException,\n", - " InternalServerFailureException,\n", - " QueryIndexAlreadyExistsException,ServiceUnavailableException)\n", - "from couchbase.management.buckets import CreateBucketSettings\n", - "from couchbase.management.search import SearchIndex\n", - "from couchbase.options import ClusterOptions\n", - "from datasets import load_dataset\n", - "from dotenv import load_dotenv\n", - "from langchain_core.globals import set_llm_cache\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_core.prompts.chat import ChatPromptTemplate\n", - "from langchain_core.runnables import RunnablePassthrough\n", - "from langchain_couchbase.cache import CouchbaseCache\n", - "from langchain_couchbase.vectorstores import CouchbaseQueryVectorStore\n", - "from langchain_couchbase.vectorstores import DistanceStrategy\n", - "from langchain_couchbase.vectorstores import IndexType\n", - "from langchain_openai import OpenAIEmbeddings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup Logging\n", - "Logging is configured to track the progress of the script and capture any errors or warnings." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)\n", - "\n", - "# Suppress httpx logging\n", - "logging.getLogger('httpx').setLevel(logging.CRITICAL)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Environment Variables and Configuration\n", - "\n", - "This section handles loading and validating environment variables and configuration settings:\n", - "#\n", - "1. API Keys:\n", - " - Supports either direct Deepseek API or OpenRouter API access\n", - " - Prompts for API key input if not found in environment\n", - " - Requires OpenAI API key for embeddings\n", - "#\n", - "2. Couchbase Settings:\n", - " - Connection details (host, username, password)\n", - " - Bucket, scope and collection names\n", - " - Vector search index configuration\n", - " - Cache collection settings\n", - "#\n", - "The code validates that all required credentials are present before proceeding.\n", - "It allows flexible configuration through environment variables or interactive prompts,\n", - "with sensible defaults for local development.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# Load environment variables from .env file if it exists\n", - "load_dotenv(override= True)\n", - "\n", - "# API Keys\n", - "# Allow either Deepseek API directly or via OpenRouter\n", - "DEEPSEEK_API_KEY = os.getenv('DEEPSEEK_API_KEY')\n", - "OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')\n", - "\n", - "if not DEEPSEEK_API_KEY and not OPENROUTER_API_KEY:\n", - " api_choice = input('Choose API (1 for Deepseek direct, 2 for OpenRouter): ')\n", - " if api_choice == '1':\n", - " DEEPSEEK_API_KEY = getpass.getpass('Enter your Deepseek API Key: ')\n", - " else:\n", - " OPENROUTER_API_KEY = getpass.getpass('Enter your OpenRouter API Key: ')\n", - "\n", - "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or getpass.getpass('Enter your OpenAI API Key: ')\n", - "\n", - "# Couchbase Settings\n", - "CB_HOST = os.getenv('CB_HOST') or input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", - "CB_USERNAME = os.getenv('CB_USERNAME') or input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", - "CB_PASSWORD = os.getenv('CB_PASSWORD') or getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", - "CB_BUCKET_NAME = os.getenv('CB_BUCKET_NAME') or input('Enter your Couchbase bucket name (default: query-vector-search-testing): ') or 'query-vector-search-testing'\n", - "SCOPE_NAME = os.getenv('SCOPE_NAME') or input('Enter your scope name (default: shared): ') or 'shared'\n", - "COLLECTION_NAME = os.getenv('COLLECTION_NAME') or input('Enter your collection name (default: deepseek): ') or 'deepseek'\n", - "CACHE_COLLECTION = os.getenv('CACHE_COLLECTION') or input('Enter your cache collection name (default: cache): ') or 'cache'\n", - "\n", - "# Check if required credentials are set\n", - "required_creds = {\n", - " 'OPENAI_API_KEY': OPENAI_API_KEY,\n", - " 'CB_HOST': CB_HOST,\n", - " 'CB_USERNAME': CB_USERNAME,\n", - " 'CB_PASSWORD': CB_PASSWORD,\n", - " 'CB_BUCKET_NAME': CB_BUCKET_NAME\n", - "}\n", - "\n", - "# Add the API key that was chosen\n", - "if DEEPSEEK_API_KEY:\n", - " required_creds['DEEPSEEK_API_KEY'] = DEEPSEEK_API_KEY\n", - "elif OPENROUTER_API_KEY:\n", - " 
required_creds['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n", - "else:\n", - " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")\n", - "\n", - "for cred_name, cred_value in required_creds.items():\n", - " if not cred_value:\n", - " raise ValueError(f\"{cred_name} is not set\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Connecting to the Couchbase Cluster\n", - "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:40:27,133 - INFO - Successfully connected to Couchbase\n" - ] - } - ], - "source": [ - "try:\n", - " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", - " options = ClusterOptions(auth)\n", - " cluster = Cluster(CB_HOST, options)\n", - " cluster.wait_until_ready(timedelta(seconds=5))\n", - " logging.info(\"Successfully connected to Couchbase\")\n", - "except Exception as e:\n", - " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting Up Collections in Couchbase\n", - "\n", - "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", - "\n", - "1. Bucket Creation:\n", - " - Checks if specified bucket exists, creates it if not\n", - " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", - " - Note: If you are using Capella, create a bucket manually called vector-search-testing(or any name you prefer) with the same properties.\n", - "\n", - "2. Scope Management: \n", - " - Verifies if requested scope exists within bucket\n", - " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", - "\n", - "3. Collection Setup:\n", - " - Checks for collection existence within scope\n", - " - Creates collection if it doesn't exist\n", - " - Waits 2 seconds for collection to be ready\n", - "\n", - "Additional Tasks:\n", - "- Clears any existing documents for clean state\n", - "- Implements comprehensive error handling and logging\n", - "\n", - "The function is called twice to set up:\n", - "1. Main collection for vector embeddings\n", - "2. Cache collection for storing results\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:41:01,398 - INFO - Bucket 'query-vector-search-testing' exists.\n", - "2025-09-17 15:41:01,410 - INFO - Collection 'deepseek' does not exist. Creating it...\n", - "2025-09-17 15:41:01,453 - INFO - Collection 'deepseek' created successfully.\n", - "2025-09-17 15:41:03,712 - INFO - All documents cleared from the collection.\n", - "2025-09-17 15:41:03,713 - INFO - Bucket 'query-vector-search-testing' exists.\n", - "2025-09-17 15:41:03,728 - INFO - Collection 'cache' already exists. 
Skipping creation.\n", - "2025-09-17 15:41:05,821 - INFO - All documents cleared from the collection.\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", - " try:\n", - " # Check if bucket exists, create if it doesn't\n", - " try:\n", - " bucket = cluster.bucket(bucket_name)\n", - " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", - " except Exception as e:\n", - " logging.info(f\"Bucket '{bucket_name}' does not exist. Creating it...\")\n", - " bucket_settings = CreateBucketSettings(\n", - " name=bucket_name,\n", - " bucket_type='couchbase',\n", - " ram_quota_mb=1024,\n", - " flush_enabled=True,\n", - " num_replicas=0\n", - " )\n", - " cluster.buckets().create_bucket(bucket_settings)\n", - " time.sleep(2) # Wait for bucket creation to complete and become available\n", - " bucket = cluster.bucket(bucket_name)\n", - " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", - "\n", - " bucket_manager = bucket.collections()\n", - "\n", - " # Check if scope exists, create if it doesn't\n", - " scopes = bucket_manager.get_all_scopes()\n", - " scope_exists = any(scope.name == scope_name for scope in scopes)\n", - " \n", - " if not scope_exists and scope_name != \"_default\":\n", - " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", - " bucket_manager.create_scope(scope_name)\n", - " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", - "\n", - " # Check if collection exists, create if it doesn't\n", - " collections = bucket_manager.get_all_scopes()\n", - " collection_exists = any(\n", - " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", - " for scope in collections\n", - " )\n", - "\n", - " if not collection_exists:\n", - " logging.info(f\"Collection '{collection_name}' does not exist. Creating it...\")\n", - " bucket_manager.create_collection(scope_name, collection_name)\n", - " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", - " else:\n", - " logging.info(f\"Collection '{collection_name}' already exists. Skipping creation.\")\n", - "\n", - " # Wait for collection to be ready\n", - " collection = bucket.scope(scope_name).collection(collection_name)\n", - " time.sleep(2) # Give the collection time to be ready for queries\n", - "\n", - " # Clear all documents in the collection\n", - " try:\n", - " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", - " cluster.query(query).execute()\n", - " logging.info(\"All documents cleared from the collection.\")\n", - " except Exception as e:\n", - " logging.warning(f\"Error while clearing documents: {str(e)}. 
The collection might be empty.\")\n", - "\n", - " return collection\n", - " except Exception as e:\n", - " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", - " \n", - "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", - "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating the Embeddings client\n", - "This section creates an OpenAI embeddings client using the OpenAI API key.\n", - "The embeddings client is configured to use the \"text-embedding-3-small\" model,\n", - "which converts text into numerical vector representations.\n", - "These vector embeddings are essential for semantic search and similarity matching.\n", - "The client will be used by the vector store to generate embeddings for documents." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:41:27,149 - INFO - Successfully created OpenAI embeddings client\n" - ] - } - ], - "source": [ - "try:\n", - " embeddings = OpenAIEmbeddings(\n", - " api_key=OPENAI_API_KEY,\n", - " model=\"text-embedding-3-small\"\n", - " )\n", - " logging.info(\"Successfully created OpenAI embeddings client\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Error creating OpenAI embeddings client: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting Up the Couchbase Vector Store\n", - "A vector store is where we'll keep our embeddings. Unlike the FTS index, which is used for text-based search, the vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the vector store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:41:55,394 - INFO - Successfully created vector store\n" - ] - } - ], - "source": [ - "try:\n", - " vector_store = CouchbaseQueryVectorStore(\n", - " cluster=cluster,\n", - " bucket_name=CB_BUCKET_NAME,\n", - " scope_name=SCOPE_NAME,\n", - " collection_name=COLLECTION_NAME,\n", - " embedding = embeddings,\n", - " distance_metric=DistanceStrategy.COSINE\n", - " )\n", - " logging.info(\"Successfully created vector store\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to create vector store: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load the BBC News Dataset\n", - "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. 
The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", - "\n", - "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:42:04,530 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded the BBC News dataset with 2687 rows\n" - ] - } - ], - "source": [ - "try:\n", - " news_dataset = load_dataset(\n", - " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", - " )\n", - " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", - " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cleaning up the Data\n", - "We will use the content of the news articles for our RAG system.\n", - "\n", - "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "We have 1749 unique articles in our database.\n" - ] - } - ], - "source": [ - "news_articles = news_dataset[\"content\"]\n", - "unique_articles = set()\n", - "for article in news_articles:\n", - " if article:\n", - " unique_articles.add(article)\n", - "unique_news_articles = list(unique_articles)\n", - "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Saving Data to the Vector Store\n", - "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", - "\n", - "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", - "\n", - "This approach offers several benefits:\n", - "1. Memory Efficiency: Processing in smaller batches prevents memory overload\n", - "2. Progress Tracking: Easier to monitor and track the ingestion progress\n", - "3. 
Resource Management: Better control over CPU and network resource utilization\n", - "\n", - "We use a conservative batch size of 50 to ensure reliable operation.\n", - "The optimal batch size depends on many factors including:\n", - "- Document sizes being inserted\n", - "- Available system resources\n", - "- Network conditions\n", - "- Concurrent workload\n", - "\n", - "Consider measuring performance with your specific workload before adjusting.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 16:08:51,054 - INFO - Document ingestion completed successfully.\n" - ] - } - ], - "source": [ - "batch_size = 50\n", - "\n", - "# Automatic Batch Processing\n", - "articles = [article for article in unique_news_articles if article and len(article) <= 50000]\n", - "\n", - "try:\n", - " vector_store.add_texts(\n", - " texts=articles,\n", - " batch_size=batch_size\n", - " )\n", - " logging.info(\"Document ingestion completed successfully.\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to save documents to vector store: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting Up the LLM Model\n", - "In this section, we set up the Large Language Model (LLM) for our RAG system. We're using the Deepseek model, which can be accessed through two different methods:\n", - "\n", - "1. **Deepseek API Key**: This is obtained directly from Deepseek's platform (https://deepseek.ai) by creating an account and subscribing to their API services. With this key, you can access Deepseek's models directly using the `ChatDeepSeek` class from the `langchain_deepseek` package.\n", - "\n", - "2. **OpenRouter API Key**: OpenRouter (https://openrouter.ai) is a service that provides unified access to multiple LLM providers, including Deepseek. You can obtain an API key by creating an account on OpenRouter's website. This approach uses the `ChatOpenAI` class from `langchain_openai` but with a custom base URL pointing to OpenRouter's API endpoint.\n", - "\n", - "The key difference is that OpenRouter acts as an intermediary service that can route your requests to various LLM providers, while the Deepseek API gives you direct access to only Deepseek's models. OpenRouter can be useful if you want to switch between different LLM providers without changing your code significantly.\n", - "\n", - "In our implementation, we check for both keys and prioritize using the Deepseek API directly if available, falling back to OpenRouter if not. 
The model is configured with temperature=0 to ensure deterministic, focused responses suitable for RAG applications.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-18 11:18:25,192 - INFO - Successfully created Deepseek LLM client through OpenRouter\n" - ] - } - ], - "source": [ - "from langchain_deepseek import ChatDeepSeek\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "if DEEPSEEK_API_KEY:\n", - " try:\n", - " llm = ChatDeepSeek(\n", - " api_key=DEEPSEEK_API_KEY,\n", - " model_name=\"deepseek-chat\",\n", - " temperature=0\n", - " )\n", - " logging.info(\"Successfully created Deepseek LLM client\")\n", - " except Exception as e:\n", - " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", - "elif OPENROUTER_API_KEY:\n", - " try:\n", - " llm = ChatOpenAI(\n", - " api_key=OPENROUTER_API_KEY,\n", - " base_url=\"https://openrouter.ai/api/v1\",\n", - " model=\"deepseek/deepseek-chat-v3.1\", \n", - " temperature=0,\n", - " )\n", - " logging.info(\"Successfully created Deepseek LLM client through OpenRouter\")\n", - " except Exception as e:\n", - " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", - "else:\n", - " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Perform Semantic Search\n", - "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined. Common metrics include cosine similarity, Euclidean distance, or dot product, but other metrics can be implemented based on specific use cases. Different embedding models like BERT, Word2Vec, or GloVe can also be used depending on the application's needs, with the vectors generated by these models stored and searched within Couchbase itself.\n", - "\n", - "In the provided code, the search process begins by recording the start time, followed by executing the `similarity_search_with_score` method of the `CouchbaseQueryVectorStore`. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and the distance that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison." 
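- "\n",
- "If it helps to make the distance convention concrete, here is a small self-contained sketch (illustrative only: the sample values and the 0.5 cutoff are invented, but with the cosine metric configured above a lower distance means a closer match):\n",
- "\n",
- "```python\n",
- "# Illustrative only: interpreting cosine distances like the ones printed below.\n",
- "def cosine_distance_to_similarity(distance: float) -> float:\n",
- "    # A common reading of cosine distance: similarity = 1 - distance.\n",
- "    return 1.0 - distance\n",
- "\n",
- "sample_distances = [0.37, 0.39, 0.62]  # stand-in values, not real results\n",
- "cutoff = 0.5  # arbitrary threshold for this sketch\n",
- "for d in sample_distances:\n",
- "    verdict = \"keep\" if d <= cutoff else \"drop\"\n",
- "    print(f\"distance={d:.2f}  similarity={cosine_distance_to_similarity(d):.2f}  -> {verdict}\")\n",
- "```"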
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 16:11:07,177 - INFO - Semantic search completed in 2.46 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Semantic Search Results (completed in 2.46 seconds):\n", - "--------------------------------------------------------------------------------\n", - "Distance: 0.3693, Text: The Littler effect - how darts hit the bullseye\n", - "\n", - "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", - "\n", - "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", - "\n", - "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", - "\n", - "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. 
\"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", - "\n", - "Littler beat Luke Humphries to win the Premier League title in May\n", - "\n", - "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", - "\n", - "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. 
They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", - "• None Know a lot about Littler? Take our quiz\n", - "--------------------------------------------------------------------------------\n", - "Distance: 0.3900, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", - "\n", - "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", - "\n", - "Littler was hugged by his parents after victory over Meikle\n", - "\n", - "... 
(output truncated for brevity)\n" - ] - } - ], - "source": [ - "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", - "\n", - "try:\n", - " # Perform the semantic search\n", - " start_time = time.time()\n", - " search_results = vector_store.similarity_search_with_score(query, k=10)\n", - " search_elapsed_time = time.time() - start_time\n", - "\n", - " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", - "\n", - " # Display search results\n", - " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", - " print(\"-\" * 80)\n", - "\n", - " for doc, score in search_results:\n", - " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", - " print(\"-\" * 80)\n", - "\n", - "except CouchbaseException as e:\n", - " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", - "except Exception as e:\n", - " raise RuntimeError(f\"Unexpected error: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Optimizing Vector Search with Global Secondary Index (GSI)\n", - "\n", - "While the above semantic search using similarity_search_with_score works effectively, we can significantly improve query performance by leveraging Global Secondary Index (GSI) in Couchbase.\n", - "\n", - "Couchbase offers three types of vector indexes, but for GSI-based vector search we focus on two main types:\n", - "\n", - "Hyperscale Vector Indexes (BHIVE)\n", - "- Best for pure vector searches - content discovery, recommendations, semantic search\n", - "- High performance with low memory footprint - designed to scale to billions of vectors\n", - "- Optimized for concurrent operations - supports simultaneous searches and inserts\n", - "- Use when: You primarily perform vector-only queries without complex scalar filtering\n", - "- Ideal for: Large-scale semantic search, recommendation systems, content discovery\n", - "\n", - "Composite Vector Indexes \n", - "- Best for filtered vector searches - combines vector search with scalar value filtering\n", - "- Efficient pre-filtering - scalar attributes reduce the vector comparison scope\n", - "- Use when: Your queries combine vector similarity with scalar filters that eliminate large portions of data\n", - "- Ideal for: Compliance-based filtering, user-specific searches, time-bounded queries\n", - "\n", - "Choosing the Right Index Type\n", - "- Start with Hyperscale Vector Index for pure vector searches and large datasets\n", - "- Use Composite Vector Index when scalar filters significantly reduce your search space\n", - "- Consider your dataset size: Hyperscale scales to billions, Composite works well for tens of millions to billions\n", - "\n", - "For more information on GSI vector indexes, see [Couchbase GSI Vector Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html).\n", - "\n", - "\n", - "## Understanding Index Configuration (Couchbase 8.0 Feature)\n", - "\n", - "The index_description parameter controls how Couchbase optimizes vector storage and search performance through centroids and quantization:\n", - "\n", - "Format: `'IVF[],{PQ|SQ}'`\n", - "\n", - "Centroids (IVF - Inverted File):\n", - "- Controls how the dataset is subdivided for faster searches\n", - "- More centroids = faster search, slower training \n", - "- Fewer centroids = slower search, faster training\n", - "- If omitted (like IVF,SQ8), Couchbase auto-selects based on dataset size\n", - 
"\n", - "Quantization Options:\n", - "- SQ (Scalar Quantization): SQ4, SQ6, SQ8 (4, 6, or 8 bits per dimension)\n", - "- PQ (Product Quantization): PQx (e.g., PQ32x8)\n", - "- Higher values = better accuracy, larger index size\n", - "\n", - "Common Examples:\n", - "- IVF,SQ8 - Auto centroids, 8-bit scalar quantization (good default)\n", - "- IVF1000,SQ6 - 1000 centroids, 6-bit scalar quantization \n", - "- IVF,PQ32x8 - Auto centroids, 32 subquantizers with 8 bits\n", - "\n", - "For detailed configuration options, see the [Quantization & Centroid Settings](https://docs.couchbase.com/cloud/vector-index/hyperscale-vector-index.html#algo_settings).\n", - "\n", - "In the code below, we demonstrate creating a BHIVE index. This method takes an index type (BHIVE or COMPOSITE) and description parameter for optimization settings. Alternatively, GSI indexes can be created manually from the Couchbase UI." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "vector_store.create_index(index_type=IndexType.BHIVE, index_name=\"openrouterdeepseek_bhive_index\",index_description=\"IVF,SQ8\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The example below shows running the same similarity search, but now using the BHIVE GSI index we created above. You'll notice improved performance as the index efficiently retrieves data.\n", - "\n", - "**Important**: When using Composite indexes, scalar filters take precedence over vector similarity, which can improve performance for filtered searches but may miss some semantically relevant results that don't match the scalar criteria.\n", - "\n", - "Note: In GSI vector search, the distance represents the vector distance between the query and document embeddings. Lower distance indicate higher similarity, while higher distance indicate lower similarity." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-18 11:17:19,626 - INFO - Semantic search completed in 0.88 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Semantic Search Results (completed in 0.88 seconds):\n", - "--------------------------------------------------------------------------------\n", - "Distance: 0.3694, Text: The Littler effect - how darts hit the bullseye\n", - "\n", - "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", - "\n", - "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. 
His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", - "\n", - "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", - "\n", - "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. \"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", - "\n", - "Littler beat Luke Humphries to win the Premier League title in May\n", - "\n", - "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. 
That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", - "\n", - "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", - "• None Know a lot about Littler? Take our quiz\n", - "--------------------------------------------------------------------------------\n", - "Distance: 0.3901, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", - "\n", - "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. 
I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", - "\n", - "Littler was hugged by his parents after victory over Meikle\n", - "\n", - "... (output truncated for brevity)\n" - ] - } - ], - "source": [ - "\n", - "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", - "\n", - "try:\n", - " # Perform the semantic search\n", - " start_time = time.time()\n", - " search_results = vector_store.similarity_search_with_score(query, k=10)\n", - " search_elapsed_time = time.time() - start_time\n", - "\n", - " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", - "\n", - " # Display search results\n", - " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", - " print(\"-\" * 80)\n", - "\n", - " for doc, score in search_results:\n", - " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", - " print(\"-\" * 80)\n", - "\n", - "except CouchbaseException as e:\n", - " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", - "except Exception as e:\n", - " raise RuntimeError(f\"Unexpected error: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: To create a COMPOSITE index, the below code can be used.\n", - "Choose based on your specific use case and query patterns. For this tutorial's news search scenario, either index type would work, but BHIVE might be more efficient for pure semantic search across news articles." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vector_store.create_index(index_type=IndexType.COMPOSITE, index_name=\"openrouterdeepseek_composite_index\", index_description=\"IVF,SQ8\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting Up a Couchbase Cache\n", - "To further optimize our system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. 
In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", - "\n", - "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 16:10:11,473 - INFO - Successfully created cache\n" - ] - } - ], - "source": [ - "try:\n", - " cache = CouchbaseCache(\n", - " cluster=cluster,\n", - " bucket_name=CB_BUCKET_NAME,\n", - " scope_name=SCOPE_NAME,\n", - " collection_name=CACHE_COLLECTION,\n", - " )\n", - " logging.info(\"Successfully created cache\")\n", - " set_llm_cache(cache)\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to create cache: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", - "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query’s embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", - "\n", - "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase’s efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." 
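- "\n",
- "As a quick illustration of the retrieval step, one common pattern is to collapse the retrieved documents into a single context string before it reaches the prompt. The chain in the next cell passes the retriever output through directly, but a helper along these lines (a sketch, not part of this tutorial's chain) shows what that step typically looks like:\n",
- "\n",
- "```python\n",
- "# Illustrative helper: join retrieved documents into one context string.\n",
- "def format_docs(docs) -> str:\n",
- "    return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
- "\n",
- "# Stand-in objects shaped like LangChain Documents, just to show the output:\n",
- "class FakeDoc:\n",
- "    def __init__(self, text: str):\n",
- "        self.page_content = text\n",
- "\n",
- "print(format_docs([FakeDoc(\"First retrieved article...\"), FakeDoc(\"Second retrieved article...\")]))\n",
- "```"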
- ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-18 11:18:34,032 - INFO - Successfully created RAG chain\n" - ] - } - ], - "source": [ - "# Create RAG prompt template\n", - "rag_prompt = ChatPromptTemplate.from_messages([\n", - " (\"system\", \"You are a helpful assistant that answers questions based on the provided context.\"),\n", - " (\"human\", \"Context: {context}\\n\\nQuestion: {question}\")\n", - "])\n", - "\n", - "# Create RAG chain\n", - "rag_chain = (\n", - " {\"context\": vector_store.as_retriever(), \"question\": RunnablePassthrough()}\n", - " | rag_prompt\n", - " | llm\n", - " | StrOutputParser()\n", - ")\n", - "logging.info(\"Successfully created RAG chain\")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "RAG Response: Based on the provided context, Luke Littler's key achievements and records in his recent PDC World Championship match (second-round win against Ryan Meikle) were:\n", - "\n", - "* **Tournament Record Set Average:** He hit a tournament record 140.91 set average during the match.\n", - "* **Near Nine-Darter:** He was \"millimetres away from a nine-darter\" when he missed double 12.\n", - "* **Dominant Final Set:** He won the fourth and final set in just 32 darts (the minimum possible is 27), which included hitting four maximum 180s and clinching three straight legs in 11, 10, and 11 darts.\n", - "* **Overall High Average:** He maintained a high overall match average of 100.85.\n", - "RAG response generated in 0.49 seconds\n" - ] - } - ], - "source": [ - "try:\n", - " start_time = time.time()\n", - " rag_response = rag_chain.invoke(query)\n", - " rag_elapsed_time = time.time() - start_time\n", - "\n", - " print(f\"RAG Response: {rag_response}\")\n", - " print(f\"RAG response generated in {rag_elapsed_time:.2f} seconds\")\n", - "except InternalServerFailureException as e:\n", - " if \"query request rejected\" in str(e):\n", - " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", - " else:\n", - " print(f\"Internal server error occurred: {str(e)}\")\n", - "except Exception as e:\n", - " print(f\"Unexpected error occurred: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using Couchbase as a caching mechanism\n", - "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", - "\n", - "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. 
Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Query 1: What happened in the match between Fullham and Liverpool?\n", - "Response: In the match between Fulham and Liverpool, Liverpool played the majority of the game with 10 men after Andy Robertson received a red card in the 17th minute. Despite being a player down, Liverpool came from behind twice to secure a 2-2 draw. Diogo Jota scored an 86th-minute equalizer to earn Liverpool a point. The performance was praised for its resilience, with Fulham's Antonee Robinson noting that Liverpool \"didn't feel like they had 10 men at all.\" Liverpool maintained over 60% possession and led in attacking metrics such as shots and chances. Both managers acknowledged the strong efforts of their teams in what was described as an enthralling encounter.\n", - "Time taken: 4.65 seconds\n", - "\n", - "Query 2: What were Luke Littler's key achievements and records in his recent PDC World Championship match?\n", - "Response: Based on the provided context, Luke Littler's key achievements and records in his recent PDC World Championship match (second-round win against Ryan Meikle) were:\n", - "\n", - "* **Tournament Record Set Average:** He hit a tournament record 140.91 set average during the match.\n", - "* **Near Nine-Darter:** He was \"millimetres away from a nine-darter\" when he missed double 12.\n", - "* **Dominant Final Set:** He won the fourth and final set in just 32 darts (the minimum possible is 27), which included hitting four maximum 180s and clinching three straight legs in 11, 10, and 11 darts.\n", - "* **Overall High Average:** He maintained a high overall match average of 100.85.\n", - "Time taken: 0.45 seconds\n", - "\n", - "Query 3: What happened in the match between Fullham and Liverpool?\n", - "Response: In the match between Fulham and Liverpool, Liverpool played the majority of the game with 10 men after Andy Robertson received a red card in the 17th minute. Despite being a player down, Liverpool came from behind twice to secure a 2-2 draw. Diogo Jota scored an 86th-minute equalizer to earn Liverpool a point. The performance was praised for its resilience, with Fulham's Antonee Robinson noting that Liverpool \"didn't feel like they had 10 men at all.\" Liverpool maintained over 60% possession and led in attacking metrics such as shots and chances. 
Both managers acknowledged the strong efforts of their teams in what was described as an enthralling encounter.\n", - "Time taken: 1.15 seconds\n" - ] - } - ], - "source": [ - "try:\n", - " queries = [\n", - " \"What happened in the match between Fullham and Liverpool?\",\n", - " \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\", # Repeated query\n", - " \"What happened in the match between Fullham and Liverpool?\", # Repeated query\n", - " ]\n", - "\n", - " for i, query in enumerate(queries, 1):\n", - " print(f\"\\nQuery {i}: {query}\")\n", - " start_time = time.time()\n", - "\n", - " response = rag_chain.invoke(query)\n", - " elapsed_time = time.time() - start_time\n", - " print(f\"Response: {response}\")\n", - " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", - "\n", - "except InternalServerFailureException as e:\n", - " if \"query request rejected\" in str(e):\n", - " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", - " else:\n", - " print(f\"Internal server error occurred: {str(e)}\")\n", - "except Exception as e:\n", - " print(f\"Unexpected error occurred: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "By following these steps, you'll have a fully functional semantic search engine that leverages the strengths of Couchbase and Deepseek(via Openrouter). This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/openrouter-deepseek/gsi/frontmatter.md b/openrouter-deepseek/gsi/frontmatter.md deleted file mode 100644 index 2cd04153..00000000 --- a/openrouter-deepseek/gsi/frontmatter.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -# frontmatter -path: "/tutorial-openrouter-deepseek-with-global-secondary-index" -title: Retrieval-Augmented Generation with Couchbase and OpenRouter Deepseek using GSI index -short_title: RAG with Couchbase and OpenRouter Deepseek using GSI index -description: - - Learn how to build a semantic search engine using Couchbase and OpenRouter with Deepseek using GSI index. - - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with OpenRouter Deepseek as both embeddings and language model provider. - - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase. 
-content_type: tutorial -filter: sdk -technology: - - vector search -tags: - - GSI - - Artificial Intelligence - - LangChain - - Deepseek - - OpenRouter -sdk_language: - - python -length: 60 Mins ---- diff --git a/openrouter-deepseek/gsi/.env.sample b/openrouter-deepseek/query_based/.env.sample similarity index 100% rename from openrouter-deepseek/gsi/.env.sample rename to openrouter-deepseek/query_based/.env.sample diff --git a/openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb b/openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb new file mode 100644 index 00000000..d3c64b16 --- /dev/null +++ b/openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb @@ -0,0 +1,1081 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Deepseek V3 as the language model provider (via OpenRouter or direct API)](https://deepseek.ai/) and OpenAI for embeddings. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using Couchbase's **Hyperscale and Composite Vector Index** capabilities from scratch. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. Learn more about these index types in the [Couchbase Vector Index Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to Run This Tutorial\n", + "\n", + "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb).\n", + "\n", + "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### Get Credentials for OpenRouter and Deepseek\n", + "\n", + "* Sign up for an account at [OpenRouter](https://openrouter.ai/) to get your API key\n", + "* OpenRouter provides access to Deepseek models, so no separate Deepseek credentials are needed\n", + "* Store your OpenRouter API key securely as it will be used to access the models\n", + "* For [Deepseek](https://deepseek.ai/) models, you can use the default models provided by OpenRouter\n", + "\n", + "### Couchbase Requirements\n", + "\n", + "Create and Deploy Your Free Tier Operational cluster on [Capella](https://cloud.couchbase.com/sign-up)\n", + "\n", + "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. 
This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", + "\n", + "To learn more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", + "\n", + "**Note**: To run this tutorial, you will need Capella with Couchbase Server version 8.0 or above as Hyperscale and Composite vector search is supported only from version 8.0\n", + "\n", + "### Couchbase Capella Configuration\n", + "\n", + "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n", + "\n", + "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the required bucket (Read and Write) used in the application.\n", + "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup and Installation\n", + "\n", + "### Installing Necessary Libraries\n", + "\n", + "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install --quiet datasets==3.5.0 langchain-couchbase==0.5.0 langchain-deepseek==0.1.3 langchain-openai==0.3.13 python-dotenv==1.1.1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import Required Modules\n", + "\n", + "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import json\n", + "import logging\n", + "import os\n", + "import time\n", + "from datetime import timedelta\n", + "\n", + "from couchbase.auth import PasswordAuthenticator\n", + "from couchbase.cluster import Cluster\n", + "from couchbase.exceptions import (CouchbaseException,\n", + " InternalServerFailureException,\n", + " QueryIndexAlreadyExistsException,ServiceUnavailableException)\n", + "from couchbase.management.buckets import CreateBucketSettings\n", + "from couchbase.management.search import SearchIndex\n", + "from couchbase.options import ClusterOptions\n", + "from datasets import load_dataset\n", + "from dotenv import load_dotenv\n", + "from langchain_core.globals import set_llm_cache\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts.chat import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from langchain_couchbase.cache import CouchbaseCache\n", + "from langchain_couchbase.vectorstores import CouchbaseQueryVectorStore\n", + "from langchain_couchbase.vectorstores import DistanceStrategy\n", + "from langchain_couchbase.vectorstores import IndexType\n", + "from langchain_openai import OpenAIEmbeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure Logging\n", + "\n", + "Logging is configured to track the progress of the script and capture any errors or warnings." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)\n", + "\n", + "# Suppress httpx logging\n", + "logging.getLogger('httpx').setLevel(logging.CRITICAL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Environment Configuration\n", + "\n", + "This section handles loading and validating environment variables and configuration settings:\n", + "\n", + "1. **API Keys:**\n", + " - Supports either direct Deepseek API or OpenRouter API access\n", + " - Prompts for API key input if not found in environment\n", + " - Requires OpenAI API key for embeddings\n", + "\n", + "2. **Couchbase Settings:**\n", + " - Connection details (host, username, password)\n", + " - Bucket, scope and collection names\n", + " - Vector search index configuration\n", + " - Cache collection settings\n", + "\n", + "The code validates that all required credentials are present before proceeding. It allows flexible configuration through environment variables or interactive prompts, with sensible defaults for local development." 
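+ "\n",
+ "Every setting in the next cell follows the same lookup order: environment variable first, then an interactive prompt, then a hard-coded default. A stripped-down sketch of that pattern (the helper name and the example call are illustrative, not part of the tutorial code):\n",
+ "\n",
+ "```python\n",
+ "# Sketch of the lookup pattern used below (illustrative only).\n",
+ "import os\n",
+ "\n",
+ "def setting(env_var: str, prompt: str, default: str) -> str:\n",
+ "    # Prefer the environment, fall back to an interactive prompt, then a default.\n",
+ "    return os.getenv(env_var) or input(f\"{prompt} (default: {default}): \") or default\n",
+ "\n",
+ "# Example: bucket = setting(\"CB_BUCKET_NAME\", \"Enter your Couchbase bucket name\", \"query-vector-search-testing\")\n",
+ "```"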
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables from .env file if it exists\n", + "load_dotenv(override= True)\n", + "\n", + "# API Keys\n", + "# Allow either Deepseek API directly or via OpenRouter\n", + "DEEPSEEK_API_KEY = os.getenv('DEEPSEEK_API_KEY')\n", + "OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')\n", + "\n", + "if not DEEPSEEK_API_KEY and not OPENROUTER_API_KEY:\n", + " api_choice = input('Choose API (1 for Deepseek direct, 2 for OpenRouter): ')\n", + " if api_choice == '1':\n", + " DEEPSEEK_API_KEY = getpass.getpass('Enter your Deepseek API Key: ')\n", + " else:\n", + " OPENROUTER_API_KEY = getpass.getpass('Enter your OpenRouter API Key: ')\n", + "\n", + "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or getpass.getpass('Enter your OpenAI API Key: ')\n", + "\n", + "# Couchbase Settings\n", + "CB_HOST = os.getenv('CB_HOST') or input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", + "CB_USERNAME = os.getenv('CB_USERNAME') or input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", + "CB_PASSWORD = os.getenv('CB_PASSWORD') or getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", + "CB_BUCKET_NAME = os.getenv('CB_BUCKET_NAME') or input('Enter your Couchbase bucket name (default: query-vector-search-testing): ') or 'query-vector-search-testing'\n", + "SCOPE_NAME = os.getenv('SCOPE_NAME') or input('Enter your scope name (default: shared): ') or 'shared'\n", + "COLLECTION_NAME = os.getenv('COLLECTION_NAME') or input('Enter your collection name (default: deepseek): ') or 'deepseek'\n", + "CACHE_COLLECTION = os.getenv('CACHE_COLLECTION') or input('Enter your cache collection name (default: cache): ') or 'cache'\n", + "\n", + "# Check if required credentials are set\n", + "required_creds = {\n", + " 'OPENAI_API_KEY': OPENAI_API_KEY,\n", + " 'CB_HOST': CB_HOST,\n", + " 'CB_USERNAME': CB_USERNAME,\n", + " 'CB_PASSWORD': CB_PASSWORD,\n", + " 'CB_BUCKET_NAME': CB_BUCKET_NAME\n", + "}\n", + "\n", + "# Add the API key that was chosen\n", + "if DEEPSEEK_API_KEY:\n", + " required_creds['DEEPSEEK_API_KEY'] = DEEPSEEK_API_KEY\n", + "elif OPENROUTER_API_KEY:\n", + " required_creds['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n", + "else:\n", + " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")\n", + "\n", + "for cred_name, cred_value in required_creds.items():\n", + " if not cred_value:\n", + " raise ValueError(f\"{cred_name} is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Couchbase Connection Setup\n", + "\n", + "### Connect to Cluster\n", + "\n", + "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount." 
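+ "\n",
+ "If you are connecting to Capella rather than a local cluster, the connection string typically uses the TLS couchbases:// scheme, and applying the SDK's wan_development profile gives more forgiving timeouts over a WAN link. A hedged sketch (the hostname is a placeholder, and apply_profile assumes a recent 4.x Python SDK):\n",
+ "\n",
+ "```python\n",
+ "# Sketch of a Capella-style connection (placeholder hostname; illustrative only).\n",
+ "from datetime import timedelta\n",
+ "\n",
+ "from couchbase.auth import PasswordAuthenticator\n",
+ "from couchbase.cluster import Cluster\n",
+ "from couchbase.options import ClusterOptions\n",
+ "\n",
+ "def connect_capella(conn_str: str, username: str, password: str) -> Cluster:\n",
+ "    opts = ClusterOptions(PasswordAuthenticator(username, password))\n",
+ "    opts.apply_profile(\"wan_development\")  # longer timeouts suited to WAN links\n",
+ "    cluster = Cluster(conn_str, opts)\n",
+ "    cluster.wait_until_ready(timedelta(seconds=10))\n",
+ "    return cluster\n",
+ "\n",
+ "# cluster = connect_capella(\"couchbases://cb.example.cloud.couchbase.com\", CB_USERNAME, CB_PASSWORD)\n",
+ "```"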
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:40:27,133 - INFO - Successfully connected to Couchbase\n" + ] + } + ], + "source": [ + "try:\n", + " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", + " options = ClusterOptions(auth)\n", + " cluster = Cluster(CB_HOST, options)\n", + " cluster.wait_until_ready(timedelta(seconds=5))\n", + " logging.info(\"Successfully connected to Couchbase\")\n", + "except Exception as e:\n", + " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup Collections\n", + "\n", + "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", + "\n", + "1. **Bucket Creation:**\n", + " - Checks if specified bucket exists, creates it if not\n", + " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", + " - Note: If you are using Capella, create a bucket manually called vector-search-testing (or any name you prefer) with the same properties.\n", + "\n", + "2. **Scope Management:** \n", + " - Verifies if requested scope exists within bucket\n", + " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", + "\n", + "3. **Collection Setup:**\n", + " - Checks for collection existence within scope\n", + " - Creates collection if it doesn't exist\n", + " - Waits 2 seconds for collection to be ready\n", + "\n", + "**Additional Tasks:**\n", + "- Clears any existing documents for clean state\n", + "- Implements comprehensive error handling and logging\n", + "\n", + "The function is called twice to set up:\n", + "1. Main collection for vector embeddings\n", + "2. Cache collection for storing results" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:41:01,398 - INFO - Bucket 'query-vector-search-testing' exists.\n", + "2025-09-17 15:41:01,410 - INFO - Collection 'deepseek' does not exist. Creating it...\n", + "2025-09-17 15:41:01,453 - INFO - Collection 'deepseek' created successfully.\n", + "2025-09-17 15:41:03,712 - INFO - All documents cleared from the collection.\n", + "2025-09-17 15:41:03,713 - INFO - Bucket 'query-vector-search-testing' exists.\n", + "2025-09-17 15:41:03,728 - INFO - Collection 'cache' already exists. Skipping creation.\n", + "2025-09-17 15:41:05,821 - INFO - All documents cleared from the collection.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", + " try:\n", + " # Check if bucket exists, create if it doesn't\n", + " try:\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", + " except Exception as e:\n", + " logging.info(f\"Bucket '{bucket_name}' does not exist. 
Creating it...\")\n", + " bucket_settings = CreateBucketSettings(\n", + " name=bucket_name,\n", + " bucket_type='couchbase',\n", + " ram_quota_mb=1024,\n", + " flush_enabled=True,\n", + " num_replicas=0\n", + " )\n", + " cluster.buckets().create_bucket(bucket_settings)\n", + " time.sleep(2) # Wait for bucket creation to complete and become available\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", + "\n", + " bucket_manager = bucket.collections()\n", + "\n", + " # Check if scope exists, create if it doesn't\n", + " scopes = bucket_manager.get_all_scopes()\n", + " scope_exists = any(scope.name == scope_name for scope in scopes)\n", + " \n", + " if not scope_exists and scope_name != \"_default\":\n", + " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_scope(scope_name)\n", + " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", + "\n", + " # Check if collection exists, create if it doesn't\n", + " collections = bucket_manager.get_all_scopes()\n", + " collection_exists = any(\n", + " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", + " for scope in collections\n", + " )\n", + "\n", + " if not collection_exists:\n", + " logging.info(f\"Collection '{collection_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_collection(scope_name, collection_name)\n", + " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", + " else:\n", + " logging.info(f\"Collection '{collection_name}' already exists. Skipping creation.\")\n", + "\n", + " # Wait for collection to be ready\n", + " collection = bucket.scope(scope_name).collection(collection_name)\n", + " time.sleep(2) # Give the collection time to be ready for queries\n", + "\n", + " # Clear all documents in the collection\n", + " try:\n", + " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", + " cluster.query(query).execute()\n", + " logging.info(\"All documents cleared from the collection.\")\n", + " except Exception as e:\n", + " logging.warning(f\"Error while clearing documents: {str(e)}. The collection might be empty.\")\n", + "\n", + " return collection\n", + " except Exception as e:\n", + " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", + " \n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## OpenAI Configuration\n", + "\n", + "### Creating the Embeddings Client\n", + "\n", + "This section creates an OpenAI embeddings client using the OpenAI API key. The embeddings client is configured to use the \"text-embedding-3-small\" model, which converts text into numerical vector representations. These vector embeddings are essential for semantic search and similarity matching. The client will be used by the vector store to generate embeddings for documents." 
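As a quick sanity check once the client below exists, embedding a short string confirms the model is reachable and shows the vector dimensionality; `text-embedding-3-small` returns 1536-dimensional vectors, which is what the vector store and any index built on it will assume. This is an illustrative snippet, not a required step.

```python
# Illustrative sanity check, run after the embeddings client below is created
sample_vector = embeddings.embed_query("Couchbase vector search")
print(len(sample_vector))  # text-embedding-3-small produces 1536-dimensional vectors
```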
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:41:27,149 - INFO - Successfully created OpenAI embeddings client\n" + ] + } + ], + "source": [ + "try:\n", + " embeddings = OpenAIEmbeddings(\n", + " api_key=OPENAI_API_KEY,\n", + " model=\"text-embedding-3-small\"\n", + " )\n", + " logging.info(\"Successfully created OpenAI embeddings client\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error creating OpenAI embeddings client: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Document Processing and Vector Store Setup\n", + "\n", + "### Create Couchbase Hyperscale Vector Store\n", + "\n", + "A vector store is where we'll keep our embeddings. Unlike traditional text-based search, the Hyperscale Vector Store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the Hyperscale Vector Store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:41:55,394 - INFO - Successfully created vector store\n" + ] + } + ], + "source": [ + "try:\n", + " vector_store = CouchbaseQueryVectorStore(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=COLLECTION_NAME,\n", + " embedding = embeddings,\n", + " distance_metric=DistanceStrategy.COSINE\n", + " )\n", + " logging.info(\"Successfully created vector store\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create vector store: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load BBC News Dataset\n", + "\n", + "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", + "\n", + "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." 
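After the dataset loads in the next cell, a quick peek at a single record shows the available columns. Only `content` is used in this tutorial; the other column names depend on the dataset version, so treat this as an illustrative inspection step.

```python
# Illustrative: inspect one record of the Hugging Face dataset (run after loading below)
sample = news_dataset[0]        # indexing a Dataset returns a plain dict
print(list(sample.keys()))      # available columns; only 'content' is used later
print(sample["content"][:300])  # preview the article text
```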
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:42:04,530 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded the BBC News dataset with 2687 rows\n" + ] + } + ], + "source": [ + "try:\n", + " news_dataset = load_dataset(\n", + " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", + " )\n", + " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", + " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Data Cleaning\n", + "\n", + "We will use the content of the news articles for our RAG system.\n", + "\n", + "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We have 1749 unique articles in our database.\n" + ] + } + ], + "source": [ + "news_articles = news_dataset[\"content\"]\n", + "unique_articles = set()\n", + "for article in news_articles:\n", + " if article:\n", + " unique_articles.add(article)\n", + "unique_news_articles = list(unique_articles)\n", + "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Save Data to Vector Store\n", + "\n", + "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", + "\n", + "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", + "\n", + "This approach offers several benefits:\n", + "1. **Memory Efficiency**: Processing in smaller batches prevents memory overload\n", + "2. **Progress Tracking**: Easier to monitor and track the ingestion progress\n", + "3. **Resource Management**: Better control over CPU and network resource utilization\n", + "\n", + "We use a conservative batch size of 50 to ensure reliable operation. The optimal batch size depends on many factors including document sizes, available system resources, network conditions, and concurrent workload." 
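The next cell lets `add_texts` handle the batching internally. An equivalent manual loop, sketched here using the same `articles` and `batch_size` variables that cell defines, makes the progress-tracking benefit explicit; it is shown only for illustration.

```python
# Illustrative manual batching with progress logging; equivalent to add_texts(batch_size=...)
for start in range(0, len(articles), batch_size):
    batch = articles[start:start + batch_size]
    vector_store.add_texts(texts=batch)
    logging.info(f"Ingested {min(start + batch_size, len(articles))}/{len(articles)} articles")
```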
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 16:08:51,054 - INFO - Document ingestion completed successfully.\n" + ] + } + ], + "source": [ + "batch_size = 50\n", + "\n", + "# Automatic Batch Processing\n", + "articles = [article for article in unique_news_articles if article and len(article) <= 50000]\n", + "\n", + "try:\n", + " vector_store.add_texts(\n", + " texts=articles,\n", + " batch_size=batch_size\n", + " )\n", + " logging.info(\"Document ingestion completed successfully.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to save documents to vector store: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deepseek LLM Configuration\n", + "\n", + "### Setting Up the LLM Model\n", + "\n", + "In this section, we set up the Large Language Model (LLM) for our RAG system. We're using the Deepseek model, which can be accessed through two different methods:\n", + "\n", + "1. **Deepseek API Key**: This is obtained directly from Deepseek's platform (https://deepseek.ai) by creating an account and subscribing to their API services. With this key, you can access Deepseek's models directly using the `ChatDeepSeek` class from the `langchain_deepseek` package.\n", + "\n", + "2. **OpenRouter API Key**: OpenRouter (https://openrouter.ai) is a service that provides unified access to multiple LLM providers, including Deepseek. You can obtain an API key by creating an account on OpenRouter's website. This approach uses the `ChatOpenAI` class from `langchain_openai` but with a custom base URL pointing to OpenRouter's API endpoint.\n", + "\n", + "The key difference is that OpenRouter acts as an intermediary service that can route your requests to various LLM providers, while the Deepseek API gives you direct access to only Deepseek's models. OpenRouter can be useful if you want to switch between different LLM providers without changing your code significantly.\n", + "\n", + "In our implementation, we check for both keys and prioritize using the Deepseek API directly if available, falling back to OpenRouter if not. The model is configured with temperature=0 to ensure deterministic, focused responses suitable for RAG applications." 
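Once the client is created in the next cell, a small smoke test (illustrative, with an arbitrary prompt) confirms that the chosen API key and model name are valid before the LLM is wired into the RAG chain.

```python
# Illustrative smoke test, run after the llm client below is created
reply = llm.invoke("Reply with the single word: ready")
print(reply.content)
```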
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-18 11:18:25,192 - INFO - Successfully created Deepseek LLM client through OpenRouter\n" + ] + } + ], + "source": [ + "from langchain_deepseek import ChatDeepSeek\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "if DEEPSEEK_API_KEY:\n", + " try:\n", + " llm = ChatDeepSeek(\n", + " api_key=DEEPSEEK_API_KEY,\n", + " model_name=\"deepseek-chat\",\n", + " temperature=0\n", + " )\n", + " logging.info(\"Successfully created Deepseek LLM client\")\n", + " except Exception as e:\n", + " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", + "elif OPENROUTER_API_KEY:\n", + " try:\n", + " llm = ChatOpenAI(\n", + " api_key=OPENROUTER_API_KEY,\n", + " base_url=\"https://openrouter.ai/api/v1\",\n", + " model=\"deepseek/deepseek-chat-v3.1\", \n", + " temperature=0,\n", + " )\n", + " logging.info(\"Successfully created Deepseek LLM client through OpenRouter\")\n", + " except Exception as e:\n", + " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", + "else:\n", + " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Search Demo\n", + "\n", + "### Perform Semantic Search\n", + "\n", + "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined. Common metrics include cosine similarity, Euclidean distance, or dot product, but other metrics can be implemented based on specific use cases. Different embedding models like BERT, Word2Vec, or GloVe can also be used depending on the application's needs, with the vectors generated by these models stored and searched within Couchbase itself.\n", + "\n", + "In the provided code, the search process begins by recording the start time, followed by executing the `similarity_search_with_score` method of the `CouchbaseQueryVectorStore`. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and the distance that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison." 
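Because the vector store was created with `DistanceStrategy.COSINE`, the scores reported below are cosine distances, so lower values mean closer matches. A small hand-computed sketch of that same quantity (assuming NumPy is available, with example query strings) makes the numbers easier to interpret.

```python
# Illustrative: cosine distance between two query embeddings, the same kind of
# quantity that similarity_search_with_score reports for query/document pairs
import numpy as np

def cosine_distance(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

v1 = embeddings.embed_query("darts world championship")
v2 = embeddings.embed_query("PDC world darts final")
print(f"{cosine_distance(v1, v2):.4f}")  # lower distance means more similar
```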
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 16:11:07,177 - INFO - Semantic search completed in 2.46 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 2.46 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3693, Text: The Littler effect - how darts hit the bullseye\n", + "\n", + "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", + "\n", + "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", + "\n", + "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", + "\n", + "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. 
\"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", + "\n", + "Littler beat Luke Humphries to win the Premier League title in May\n", + "\n", + "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", + "\n", + "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. 
They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", + "• None Know a lot about Littler? Take our quiz\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3900, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", + "\n", + "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", + "\n", + "Littler was hugged by his parents after victory over Meikle\n", + "\n", + "... 
(output truncated for brevity)\n" + ] + } + ], + "source": [ + "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", + "\n", + "try:\n", + " # Perform the semantic search\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=10)\n", + " search_elapsed_time = time.time() - start_time\n", + "\n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + "\n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\" * 80)\n", + "\n", + " for doc, score in search_results:\n", + " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\" * 80)\n", + "\n", + "except CouchbaseException as e:\n", + " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", + "except Exception as e:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Optimizing Vector Search with Hyperscale and Composite Vector Indexes\n", + "\n", + "While the above semantic search using similarity_search_with_score works effectively, we can significantly improve query performance by leveraging Hyperscale and Composite Vector Indexes in Couchbase.\n", + "\n", + "Couchbase offers different types of vector indexes. For Hyperscale and Composite vector search we focus on two main types as detailed in the [Couchbase Vector Index Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html):\n", + "\n", + "### Vector Index Types\n", + "\n", + "**1. Hyperscale Vector Indexes (BHIVE)**\n", + "- Optimized for pure vector searches like content discovery, recommendations, and semantic search\n", + "- High performance with low memory footprint, optimized for concurrent operations\n", + "- Designed to scale to billions of vectors\n", + "- Use when you primarily perform vector-only queries without complex scalar filtering\n", + "\n", + "**2. Composite Vector Indexes**\n", + "- Best for filtered vector searches that combine vector search with scalar value filtering\n", + "- Efficient pre-filtering where scalar attributes reduce the vector comparison scope\n", + "- Use when your queries combine vector similarity with scalar filters that eliminate large portions of data\n", + "- Note: Scalar filters take precedence over vector similarity\n", + "\n", + "### Understanding Index Configuration\n", + "\n", + "The `index_description` parameter controls how Couchbase optimizes vector storage and search through centroids and quantization. For detailed configuration options, see the [Quantization & Centroid Settings](https://docs.couchbase.com/cloud/vector-index/hyperscale-vector-index.html#algo_settings).\n", + "\n", + "Let's create a BHIVE Hyperscale Vector Index to optimize our search performance:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store.create_index(index_type=IndexType.BHIVE, index_name=\"openrouterdeepseek_bhive_index\",index_description=\"IVF,SQ8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below shows running the same similarity search, but now using the BHIVE GSI index we created above. 
You'll notice improved performance as the index efficiently retrieves data.\n", + "\n", + "**Important**: When using Composite indexes, scalar filters take precedence over vector similarity, which can improve performance for filtered searches but may miss some semantically relevant results that don't match the scalar criteria.\n", + "\n", + "Note: In GSI vector search, the distance represents the vector distance between the query and document embeddings. Lower distance indicate higher similarity, while higher distance indicate lower similarity." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-18 11:17:19,626 - INFO - Semantic search completed in 0.88 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 0.88 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3694, Text: The Littler effect - how darts hit the bullseye\n", + "\n", + "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", + "\n", + "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", + "\n", + "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. 
From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", + "\n", + "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. \"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", + "\n", + "Littler beat Luke Humphries to win the Premier League title in May\n", + "\n", + "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", + "\n", + "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. 
\"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", + "• None Know a lot about Littler? Take our quiz\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3901, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", + "\n", + "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. 
Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", + "\n", + "Littler was hugged by his parents after victory over Meikle\n", + "\n", + "... (output truncated for brevity)\n" + ] + } + ], + "source": [ + "\n", + "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", + "\n", + "try:\n", + " # Perform the semantic search\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=10)\n", + " search_elapsed_time = time.time() - start_time\n", + "\n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + "\n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\" * 80)\n", + "\n", + " for doc, score in search_results:\n", + " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\" * 80)\n", + "\n", + "except CouchbaseException as e:\n", + " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", + "except Exception as e:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Alternative: Create Composite Index\n", + "\n", + "Note: To create a COMPOSITE index instead of BHIVE, the below code can be used. Choose based on your specific use case and query patterns. For this tutorial's news search scenario, either index type would work, but BHIVE might be more efficient for pure semantic search across news articles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store.create_index(index_type=IndexType.COMPOSITE, index_name=\"openrouterdeepseek_composite_index\", index_description=\"IVF,SQ8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RAG System Setup\n", + "\n", + "### Setup Couchbase Cache\n", + "\n", + "To further optimize our system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", + "\n", + "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience." 
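Once the cache is registered with `set_llm_cache` in the next cell, any LLM call with an identical prompt is served from Couchbase instead of the model. A tiny timing check, shown here as an illustrative sketch with an arbitrary prompt, makes the effect visible.

```python
import time

# Illustrative: with the Couchbase LLM cache active, the second identical call
# should come back from the cache almost immediately
for attempt in (1, 2):
    start = time.time()
    llm.invoke("Summarise in one sentence why caching helps a RAG system.")
    print(f"Attempt {attempt}: {time.time() - start:.2f} seconds")
```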
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 16:10:11,473 - INFO - Successfully created cache\n" + ] + } + ], + "source": [ + "try:\n", + " cache = CouchbaseCache(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=CACHE_COLLECTION,\n", + " )\n", + " logging.info(\"Successfully created cache\")\n", + " set_llm_cache(cache)\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create cache: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RAG System Demo\n", + "\n", + "### Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", + "\n", + "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query's embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", + "\n", + "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase's efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." 
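The chain in the next cell uses the retriever with its default settings. The number of documents pulled into the prompt context can be tuned through `search_kwargs`, as in this sketch (the query string is only an example).

```python
# Illustrative: a retriever limited to the 4 closest documents; it can be used
# in place of vector_store.as_retriever() when building the chain below
retriever = vector_store.as_retriever(search_kwargs={"k": 4})
for doc in retriever.invoke("Luke Littler world championship"):
    print(doc.page_content[:120], "...")
```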
+ ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-18 11:18:34,032 - INFO - Successfully created RAG chain\n" + ] + } + ], + "source": [ + "# Create RAG prompt template\n", + "rag_prompt = ChatPromptTemplate.from_messages([\n", + " (\"system\", \"You are a helpful assistant that answers questions based on the provided context.\"),\n", + " (\"human\", \"Context: {context}\\n\\nQuestion: {question}\")\n", + "])\n", + "\n", + "# Create RAG chain\n", + "rag_chain = (\n", + " {\"context\": vector_store.as_retriever(), \"question\": RunnablePassthrough()}\n", + " | rag_prompt\n", + " | llm\n", + " | StrOutputParser()\n", + ")\n", + "logging.info(\"Successfully created RAG chain\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RAG Response: Based on the provided context, Luke Littler's key achievements and records in his recent PDC World Championship match (second-round win against Ryan Meikle) were:\n", + "\n", + "* **Tournament Record Set Average:** He hit a tournament record 140.91 set average during the match.\n", + "* **Near Nine-Darter:** He was \"millimetres away from a nine-darter\" when he missed double 12.\n", + "* **Dominant Final Set:** He won the fourth and final set in just 32 darts (the minimum possible is 27), which included hitting four maximum 180s and clinching three straight legs in 11, 10, and 11 darts.\n", + "* **Overall High Average:** He maintained a high overall match average of 100.85.\n", + "RAG response generated in 0.49 seconds\n" + ] + } + ], + "source": [ + "try:\n", + " start_time = time.time()\n", + " rag_response = rag_chain.invoke(query)\n", + " rag_elapsed_time = time.time() - start_time\n", + "\n", + " print(f\"RAG Response: {rag_response}\")\n", + " print(f\"RAG response generated in {rag_elapsed_time:.2f} seconds\")\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Couchbase as a Caching Mechanism\n", + "\n", + "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", + "\n", + "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. 
Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Query 1: What happened in the match between Fullham and Liverpool?\n", + "Response: In the match between Fulham and Liverpool, Liverpool played the majority of the game with 10 men after Andy Robertson received a red card in the 17th minute. Despite being a player down, Liverpool came from behind twice to secure a 2-2 draw. Diogo Jota scored an 86th-minute equalizer to earn Liverpool a point. The performance was praised for its resilience, with Fulham's Antonee Robinson noting that Liverpool \"didn't feel like they had 10 men at all.\" Liverpool maintained over 60% possession and led in attacking metrics such as shots and chances. Both managers acknowledged the strong efforts of their teams in what was described as an enthralling encounter.\n", + "Time taken: 4.65 seconds\n", + "\n", + "Query 2: What were Luke Littler's key achievements and records in his recent PDC World Championship match?\n", + "Response: Based on the provided context, Luke Littler's key achievements and records in his recent PDC World Championship match (second-round win against Ryan Meikle) were:\n", + "\n", + "* **Tournament Record Set Average:** He hit a tournament record 140.91 set average during the match.\n", + "* **Near Nine-Darter:** He was \"millimetres away from a nine-darter\" when he missed double 12.\n", + "* **Dominant Final Set:** He won the fourth and final set in just 32 darts (the minimum possible is 27), which included hitting four maximum 180s and clinching three straight legs in 11, 10, and 11 darts.\n", + "* **Overall High Average:** He maintained a high overall match average of 100.85.\n", + "Time taken: 0.45 seconds\n", + "\n", + "Query 3: What happened in the match between Fullham and Liverpool?\n", + "Response: In the match between Fulham and Liverpool, Liverpool played the majority of the game with 10 men after Andy Robertson received a red card in the 17th minute. Despite being a player down, Liverpool came from behind twice to secure a 2-2 draw. Diogo Jota scored an 86th-minute equalizer to earn Liverpool a point. The performance was praised for its resilience, with Fulham's Antonee Robinson noting that Liverpool \"didn't feel like they had 10 men at all.\" Liverpool maintained over 60% possession and led in attacking metrics such as shots and chances. 
Both managers acknowledged the strong efforts of their teams in what was described as an enthralling encounter.\n", + "Time taken: 1.15 seconds\n" + ] + } + ], + "source": [ + "try:\n", + " queries = [\n", + " \"What happened in the match between Fullham and Liverpool?\",\n", + " \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\", # Repeated query\n", + " \"What happened in the match between Fullham and Liverpool?\", # Repeated query\n", + " ]\n", + "\n", + " for i, query in enumerate(queries, 1):\n", + " print(f\"\\nQuery {i}: {query}\")\n", + " start_time = time.time()\n", + "\n", + " response = rag_chain.invoke(query)\n", + " elapsed_time = time.time() - start_time\n", + " print(f\"Response: {response}\")\n", + " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", + "\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "By following these steps, you'll have a fully functional semantic search engine that leverages the strengths of Couchbase and Deepseek(via Openrouter). This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/openrouter-deepseek/query_based/frontmatter.md b/openrouter-deepseek/query_based/frontmatter.md new file mode 100644 index 00000000..d9bc1b6e --- /dev/null +++ b/openrouter-deepseek/query_based/frontmatter.md @@ -0,0 +1,26 @@ +--- +# frontmatter +path: "/tutorial-openrouter-deepseek-with-hyperscale-or-composite-vector-index" +title: Retrieval-Augmented Generation (RAG) with OpenRouter Deepseek and Couchbase Hyperscale and Composite Vector Index +short_title: RAG with OpenRouter Deepseek and Couchbase Hyperscale and Composite Vector Index +description: + - Learn how to build a semantic search engine using OpenRouter Deepseek and Couchbase Hyperscale and Composite Vector Index. + - This tutorial demonstrates how to integrate Couchbase's Hyperscale and Composite Vector Index capabilities with OpenRouter Deepseek as both embeddings and language model provider. + - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain, OpenRouter Deepseek, and Couchbase Hyperscale and Composite Vector Index. 
+ - Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-openrouter-deepseek-with-search-vector-index/) +content_type: tutorial +filter: sdk +technology: + - vector search +tags: + - Hyperscale Vector Index + - Composite Vector Index + - Artificial Intelligence + - LangChain + - Deepseek + - OpenRouter +sdk_language: + - python +length: 60 Mins +alt_paths: ["/tutorial-openrouter-deepseek-with-hyperscale-vector-index", "/tutorial-openrouter-deepseek-with-composite-vector-index"] +--- diff --git a/openrouter-deepseek/fts/.env.sample b/openrouter-deepseek/search_based/.env.sample similarity index 100% rename from openrouter-deepseek/fts/.env.sample rename to openrouter-deepseek/search_based/.env.sample diff --git a/openrouter-deepseek/fts/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb b/openrouter-deepseek/search_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb similarity index 93% rename from openrouter-deepseek/fts/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb rename to openrouter-deepseek/search_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb index 387bb2bd..65c9f5d2 100644 --- a/openrouter-deepseek/fts/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb +++ b/openrouter-deepseek/search_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb @@ -4,17 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Introduction \n", - "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Deepseek V3 as the language model provider (via OpenRouter or direct API)](https://deepseek.ai/) and OpenAI for embeddings. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using the FTS service from scratch. Alternatively if you want to perform semantic search using the GSI index, please take a look at [this.](https://developer.couchbase.com/tutorial-openrouter-deepseek-with-global-secondary-index/)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# How to run this tutorial\n", + "## How to Run This Tutorial\n", "\n", - "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/openrouter-deepseek/fts/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb).\n", + "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/openrouter-deepseek/search_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb).\n", "\n", "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." 
] @@ -23,15 +15,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Before you start\n", + "## Prerequisites\n", + "\n", + "### Get Credentials for OpenRouter and Deepseek\n", "\n", - "## Get Credentials for OpenRouter and Deepseek\n", "* Sign up for an account at [OpenRouter](https://openrouter.ai/) to get your API key\n", "* OpenRouter provides access to Deepseek models, so no separate Deepseek credentials are needed\n", "* Store your OpenRouter API key securely as it will be used to access the models\n", "* For [Deepseek](https://deepseek.ai/) models, you can use the default models provided by OpenRouter\n", "\n", - "## Create and Deploy Your Free Tier Operational cluster on Capella\n", + "### Couchbase Requirements\n", + "\n", + "Create and Deploy Your Free Tier Operational cluster on [Capella](https://cloud.couchbase.com/sign-up)\n", "\n", "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", "\n", @@ -49,7 +44,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setting the Stage: Installing Necessary Libraries\n", + "## Setup and Installation\n", + "\n", + "### Installing Necessary Libraries\n", "\n", "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks." ] @@ -75,7 +72,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Importing Necessary Libraries\n", + "### Import Required Modules\n", "\n", "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading." ] @@ -116,7 +113,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup Logging\n", + "### Configure Logging\n", + "\n", "Logging is configured to track the progress of the script and capture any errors or warnings." ] }, @@ -136,24 +134,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Environment Variables and Configuration\n", + "### Load Environment Configuration\n", "\n", "This section handles loading and validating environment variables and configuration settings:\n", - "#\n", - "1. API Keys:\n", + "\n", + "1. **API Keys:**\n", " - Supports either direct Deepseek API or OpenRouter API access\n", " - Prompts for API key input if not found in environment\n", " - Requires OpenAI API key for embeddings\n", - "#\n", - "2. Couchbase Settings:\n", + "\n", + "2. **Couchbase Settings:**\n", " - Connection details (host, username, password)\n", " - Bucket, scope and collection names\n", " - Vector search index configuration\n", " - Cache collection settings\n", - "#\n", - "The code validates that all required credentials are present before proceeding.\n", - "It allows flexible configuration through environment variables or interactive prompts,\n", - "with sensible defaults for local development.\n" + "\n", + "The code validates that all required credentials are present before proceeding. It allows flexible configuration through environment variables or interactive prompts, with sensible defaults for local development." ] }, { @@ -215,9 +211,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Connecting to the Couchbase Cluster\n", - "Connecting to a Couchbase cluster is the foundation of our project. 
Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount.\n", - "\n" + "## Couchbase Connection Setup\n", + "\n", + "### Connect to Cluster\n", + "\n", + "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount." ] }, { @@ -248,32 +246,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setting Up Collections in Couchbase\n", + "### Setup Collections\n", "\n", "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", "\n", - "1. Bucket Creation:\n", + "1. **Bucket Creation:**\n", " - Checks if specified bucket exists, creates it if not\n", " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", - " - Note: If you are using Capella, create a bucket manually called vector-search-testing(or any name you prefer) with the same properties.\n", + " - Note: If you are using Capella, create a bucket manually called vector-search-testing (or any name you prefer) with the same properties.\n", "\n", - "2. Scope Management: \n", + "2. **Scope Management:** \n", " - Verifies if requested scope exists within bucket\n", " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", "\n", - "3. Collection Setup:\n", + "3. **Collection Setup:**\n", " - Checks for collection existence within scope\n", " - Creates collection if it doesn't exist\n", " - Waits 2 seconds for collection to be ready\n", "\n", - "Additional Tasks:\n", + "**Additional Tasks:**\n", "- Creates primary index on collection for query performance\n", "- Clears any existing documents for clean state\n", "- Implements comprehensive error handling and logging\n", "\n", "The function is called twice to set up:\n", "1. Main collection for vector embeddings\n", - "2. Cache collection for storing results\n" + "2. Cache collection for storing results" ] }, { @@ -383,13 +381,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Loading Couchbase Vector Search Index\n", + "## Understanding Search Vector Index\n", "\n", - "Semantic search requires an efficient way to retrieve relevant documents based on a user's query. This is where the Couchbase **Vector Search Index** comes into play. In this step, we load the Vector Search Index definition from a JSON file, which specifies how the index should be structured. This includes the fields to be indexed, the dimensions of the vectors, and other parameters that determine how the search engine processes queries based on vector similarity.\n", + "### Loading Couchbase Search Vector Index\n", "\n", - "This OpenRouter Deepseek vector search index configuration requires specific default settings to function properly. 
This tutorial uses the bucket named `vector-search-testing` with the scope `shared` and collection `deepseek`. The configuration is set up for vectors with exactly `1536 dimensions`, using dot product similarity and optimized for recall. If you want to use a different bucket, scope, or collection, you will need to modify the index configuration accordingly.\n", + "Semantic search requires an efficient way to retrieve relevant documents based on a user's query. This is where the Couchbase **Search Vector Index** comes into play. In this step, we load the Search Vector Index definition from a JSON file, which specifies how the index should be structured. This includes the fields to be indexed, the dimensions of the vectors, and other parameters that determine how the search engine processes queries based on vector similarity.\n", "\n", - "For more information on creating a vector search index, please follow the [instructions](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html).\n" + "This OpenRouter Deepseek Search Vector Index configuration requires specific default settings to function properly. This tutorial uses the bucket named `vector-search-testing` with the scope `shared` and collection `deepseek`. The configuration is set up for vectors with exactly `1536 dimensions`, using dot product similarity and optimized for recall. If you want to use a different bucket, scope, or collection, you will need to modify the index configuration accordingly.\n", + "\n", + "For more information on creating a Search Vector Index, please follow the [instructions](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html)." ] }, { @@ -409,9 +409,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Creating or Updating Search Indexes\n", + "### Creating or Updating Search Vector Index\n", "\n", - "With the index definition loaded, the next step is to create or update the **Vector Search Index** in Couchbase. This step is crucial because it optimizes our database for vector similarity search operations, allowing us to perform searches based on the semantic content of documents rather than just keywords. By creating or updating a Vector Search Index, we enable our search engine to handle complex queries that involve finding semantically similar documents using vector embeddings, which is essential for a robust semantic search engine.Creating search indexes placeholder text." + "With the index definition loaded, the next step is to create or update the **Search Vector Index** in Couchbase. This step is crucial because it optimizes our database for vector similarity search operations, allowing us to perform searches based on the semantic content of documents rather than just keywords. By creating or updating a Search Vector Index, we enable our search engine to handle complex queries that involve finding semantically similar documents using vector embeddings, which is essential for a robust semantic search engine." ] }, { @@ -461,12 +461,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Creating the Embeddings client\n", - "This section creates an OpenAI embeddings client using the OpenAI API key.\n", - "The embeddings client is configured to use the \"text-embedding-3-small\" model,\n", - "which converts text into numerical vector representations.\n", - "These vector embeddings are essential for semantic search and similarity matching.\n", - "The client will be used by the vector store to generate embeddings for documents." 
+ "## OpenAI Configuration\n", + "\n", + "### Creating the Embeddings Client\n", + "\n", + "This section creates an OpenAI embeddings client using the OpenAI API key. The embeddings client is configured to use the \"text-embedding-3-small\" model, which converts text into numerical vector representations. These vector embeddings are essential for semantic search and similarity matching. The client will be used by the vector store to generate embeddings for documents." ] }, { @@ -497,8 +496,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setting Up the Couchbase Vector Store\n", - "A vector store is where we'll keep our embeddings. Unlike the FTS index, which is used for text-based search, the vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the vector store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." + "## Document Processing and Vector Store Setup\n", + "\n", + "### Create Couchbase Search Vector Store\n", + "\n", + "A vector store is where we'll keep our embeddings. Unlike traditional text-based search, the Search Vector Store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the Search Vector Store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." ] }, { @@ -533,7 +535,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Load the BBC News Dataset\n", + "### Load BBC News Dataset\n", + "\n", "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", "\n", "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." @@ -574,7 +577,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Cleaning up the Data\n", + "#### Data Cleaning\n", + "\n", "We will use the content of the news articles for our RAG system.\n", "\n", "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." 
@@ -607,24 +611,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Saving Data to the Vector Store\n", + "#### Save Data to Vector Store\n", + "\n", "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", "\n", "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", "\n", "This approach offers several benefits:\n", - "1. Memory Efficiency: Processing in smaller batches prevents memory overload\n", - "2. Progress Tracking: Easier to monitor and track the ingestion progress\n", - "3. Resource Management: Better control over CPU and network resource utilization\n", - "\n", - "We use a conservative batch size of 50 to ensure reliable operation.\n", - "The optimal batch size depends on many factors including:\n", - "- Document sizes being inserted\n", - "- Available system resources\n", - "- Network conditions\n", - "- Concurrent workload\n", - "\n", - "Consider measuring performance with your specific workload before adjusting.\n" + "1. **Memory Efficiency**: Processing in smaller batches prevents memory overload\n", + "2. **Progress Tracking**: Easier to monitor and track the ingestion progress\n", + "3. **Resource Management**: Better control over CPU and network resource utilization\n", + "\n", + "We use a conservative batch size of 50 to ensure reliable operation. The optimal batch size depends on many factors including document sizes, available system resources, network conditions, and concurrent workload." ] }, { @@ -660,10 +658,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setting Up a Couchbase Cache\n", + "### Setup Couchbase Cache\n", + "\n", "To further optimize our system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", "\n", - "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience.\n" + "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience." ] }, { @@ -697,7 +696,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setting Up the LLM Model\n", + "## Deepseek LLM Configuration\n", + "\n", + "### Setting Up the LLM Model\n", + "\n", "In this section, we set up the Large Language Model (LLM) for our RAG system. We're using the Deepseek model, which can be accessed through two different methods:\n", "\n", "1. 
**Deepseek API Key**: This is obtained directly from Deepseek's platform (https://deepseek.ai) by creating an account and subscribing to their API services. With this key, you can access Deepseek's models directly using the `ChatDeepSeek` class from the `langchain_deepseek` package.\n", @@ -706,7 +708,7 @@ "\n", "The key difference is that OpenRouter acts as an intermediary service that can route your requests to various LLM providers, while the Deepseek API gives you direct access to only Deepseek's models. OpenRouter can be useful if you want to switch between different LLM providers without changing your code significantly.\n", "\n", - "In our implementation, we check for both keys and prioritize using the Deepseek API directly if available, falling back to OpenRouter if not. The model is configured with temperature=0 to ensure deterministic, focused responses suitable for RAG applications.\n" + "In our implementation, we check for both keys and prioritize using the Deepseek API directly if available, falling back to OpenRouter if not. The model is configured with temperature=0 to ensure deterministic, focused responses suitable for RAG applications." ] }, { @@ -755,7 +757,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Perform Semantic Search\n", + "## Semantic Search Demo\n", + "\n", + "### Perform Semantic Search\n", + "\n", "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined.\n", "\n", "In the provided code, the search process begins by recording the start time, followed by executing the similarity_search_with_score method of the CouchbaseSearchVectorStore. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and a similarity score that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison." @@ -911,10 +916,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", - "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query’s embedding with the stored document embeddings. 
These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", + "## RAG System Demo\n", "\n", - "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase’s efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." + "### Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", + "\n", + "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query's embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", + "\n", + "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase's efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." ] }, { @@ -996,10 +1004,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Using Couchbase as a caching mechanism\n", + "### Using Couchbase as a Caching Mechanism\n", + "\n", "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", "\n", - "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently.\n" + "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. 
Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently." ] }, { @@ -1099,7 +1108,8 @@ "metadata": {}, "source": [ "## Conclusion\n", - "By following these steps, you'll have a fully functional semantic search engine that leverages the strengths of Couchbase and Deepseek(via Openrouter). This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." + "\n", + "By following these steps, you'll have a fully functional semantic search engine that leverages the strengths of Couchbase's Search Vector Index and Deepseek (via OpenRouter). This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." ] } ], diff --git a/openrouter-deepseek/fts/deepseek_index.json b/openrouter-deepseek/search_based/deepseek_index.json similarity index 100% rename from openrouter-deepseek/fts/deepseek_index.json rename to openrouter-deepseek/search_based/deepseek_index.json diff --git a/openrouter-deepseek/search_based/frontmatter.md b/openrouter-deepseek/search_based/frontmatter.md new file mode 100644 index 00000000..1ce3d507 --- /dev/null +++ b/openrouter-deepseek/search_based/frontmatter.md @@ -0,0 +1,23 @@ +--- +# frontmatter +path: "/tutorial-openrouter-deepseek-with-search-vector-index" +title: Retrieval-Augmented Generation (RAG) with OpenRouter Deepseek and Couchbase Search Vector Index +short_title: RAG with OpenRouter Deepseek and Couchbase Search Vector Index +description: + - Learn how to build a semantic search engine using OpenRouter Deepseek and Couchbase Search Vector Index. + - This tutorial demonstrates how to integrate Couchbase's Search Vector Index capabilities with OpenRouter Deepseek as both embeddings and language model provider. + - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain, OpenRouter Deepseek, and Couchbase Search Vector Index. 
+content_type: tutorial +filter: sdk +technology: + - vector search +tags: + - Search Vector Index + - Artificial Intelligence + - LangChain + - Deepseek + - OpenRouter +sdk_language: + - python +length: 60 Mins +--- From 68aea7c4d8ac5777f3d05f4e3f7f66438118f37e Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Thu, 11 Dec 2025 13:38:13 +0530 Subject: [PATCH 12/13] reverted the openrouter changes --- openrouter-deepseek/fts/.env.sample | 14 + ...th_Couchbase_and_Openrouter_Deepseek.ipynb | 1127 +++++++++++++++++ openrouter-deepseek/fts/deepseek_index.json | 73 ++ openrouter-deepseek/fts/frontmatter.md | 23 + openrouter-deepseek/gsi/.env.sample | 13 + ...th_Couchbase_and_Openrouter_Deepseek.ipynb | 1089 ++++++++++++++++ openrouter-deepseek/gsi/frontmatter.md | 23 + 7 files changed, 2362 insertions(+) create mode 100644 openrouter-deepseek/fts/.env.sample create mode 100644 openrouter-deepseek/fts/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb create mode 100644 openrouter-deepseek/fts/deepseek_index.json create mode 100644 openrouter-deepseek/fts/frontmatter.md create mode 100644 openrouter-deepseek/gsi/.env.sample create mode 100644 openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb create mode 100644 openrouter-deepseek/gsi/frontmatter.md diff --git a/openrouter-deepseek/fts/.env.sample b/openrouter-deepseek/fts/.env.sample new file mode 100644 index 00000000..861ba211 --- /dev/null +++ b/openrouter-deepseek/fts/.env.sample @@ -0,0 +1,14 @@ +DEEPSEEK_API_KEY="" +OPENAI_API_KEY="" +OPENROUTER_API_KEY="" + +# Couchbase Settings +CB_HOST=couchbase://localhost +CB_USERNAME=Administrator +CB_PASSWORD=password +CB_BUCKET_NAME=vector-search-testing + +INDEX_NAME=vector_search_deepseek +SCOPE_NAME=shared +COLLECTION_NAME=deepseek +CACHE_COLLECTION=cache diff --git a/openrouter-deepseek/fts/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb b/openrouter-deepseek/fts/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb new file mode 100644 index 00000000..387bb2bd --- /dev/null +++ b/openrouter-deepseek/fts/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb @@ -0,0 +1,1127 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction \n", + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Deepseek V3 as the language model provider (via OpenRouter or direct API)](https://deepseek.ai/) and OpenAI for embeddings. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using the FTS service from scratch. Alternatively if you want to perform semantic search using the GSI index, please take a look at [this.](https://developer.couchbase.com/tutorial-openrouter-deepseek-with-global-secondary-index/)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to run this tutorial\n", + "\n", + "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. 
You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/openrouter-deepseek/fts/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb).\n", + "\n", + "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Before you start\n", + "\n", + "## Get Credentials for OpenRouter and Deepseek\n", + "* Sign up for an account at [OpenRouter](https://openrouter.ai/) to get your API key\n", + "* OpenRouter provides access to Deepseek models, so no separate Deepseek credentials are needed\n", + "* Store your OpenRouter API key securely as it will be used to access the models\n", + "* For [Deepseek](https://deepseek.ai/) models, you can use the default models provided by OpenRouter\n", + "\n", + "## Create and Deploy Your Free Tier Operational cluster on Capella\n", + "\n", + "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", + "\n", + "To learn more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", + "\n", + "### Couchbase Capella Configuration\n", + "\n", + "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n", + "\n", + "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the required bucket (Read and Write) used in the application.\n", + "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting the Stage: Installing Necessary Libraries\n", + "\n", + "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install --quiet datasets==3.5.0 langchain-couchbase==0.3.0 langchain-deepseek==0.1.3 langchain-openai==0.3.13 python-dotenv==1.1.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing Necessary Libraries\n", + "\n", + "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import json\n", + "import logging\n", + "import os\n", + "import time\n", + "from datetime import timedelta\n", + "\n", + "from couchbase.auth import PasswordAuthenticator\n", + "from couchbase.cluster import Cluster\n", + "from couchbase.exceptions import (CouchbaseException,\n", + " InternalServerFailureException,\n", + " QueryIndexAlreadyExistsException,ServiceUnavailableException)\n", + "from couchbase.management.buckets import CreateBucketSettings\n", + "from couchbase.management.search import SearchIndex\n", + "from couchbase.options import ClusterOptions\n", + "from datasets import load_dataset\n", + "from dotenv import load_dotenv\n", + "from langchain_core.globals import set_llm_cache\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts.chat import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from langchain_couchbase.cache import CouchbaseCache\n", + "from langchain_couchbase.vectorstores import CouchbaseSearchVectorStore\n", + "from langchain_openai import OpenAIEmbeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup Logging\n", + "Logging is configured to track the progress of the script and capture any errors or warnings." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)\n", + "\n", + "# Suppress httpx logging\n", + "logging.getLogger('httpx').setLevel(logging.CRITICAL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Variables and Configuration\n", + "\n", + "This section handles loading and validating environment variables and configuration settings:\n", + "#\n", + "1. API Keys:\n", + " - Supports either direct Deepseek API or OpenRouter API access\n", + " - Prompts for API key input if not found in environment\n", + " - Requires OpenAI API key for embeddings\n", + "#\n", + "2. 
Couchbase Settings:\n", + " - Connection details (host, username, password)\n", + " - Bucket, scope and collection names\n", + " - Vector search index configuration\n", + " - Cache collection settings\n", + "#\n", + "The code validates that all required credentials are present before proceeding.\n", + "It allows flexible configuration through environment variables or interactive prompts,\n", + "with sensible defaults for local development.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables from .env file if it exists\n", + "load_dotenv()\n", + "\n", + "# API Keys\n", + "# Allow either Deepseek API directly or via OpenRouter\n", + "DEEPSEEK_API_KEY = os.getenv('DEEPSEEK_API_KEY')\n", + "OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')\n", + "\n", + "if not DEEPSEEK_API_KEY and not OPENROUTER_API_KEY:\n", + " api_choice = input('Choose API (1 for Deepseek direct, 2 for OpenRouter): ')\n", + " if api_choice == '1':\n", + " DEEPSEEK_API_KEY = getpass.getpass('Enter your Deepseek API Key: ')\n", + " else:\n", + " OPENROUTER_API_KEY = getpass.getpass('Enter your OpenRouter API Key: ')\n", + "\n", + "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or getpass.getpass('Enter your OpenAI API Key: ')\n", + "\n", + "# Couchbase Settings\n", + "CB_HOST = os.getenv('CB_HOST') or input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", + "CB_USERNAME = os.getenv('CB_USERNAME') or input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", + "CB_PASSWORD = os.getenv('CB_PASSWORD') or getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", + "CB_BUCKET_NAME = os.getenv('CB_BUCKET_NAME') or input('Enter your Couchbase bucket name (default: vector-search-testing): ') or 'vector-search-testing'\n", + "INDEX_NAME = os.getenv('INDEX_NAME') or input('Enter your index name (default: vector_search_deepseek): ') or 'vector_search_deepseek'\n", + "SCOPE_NAME = os.getenv('SCOPE_NAME') or input('Enter your scope name (default: shared): ') or 'shared'\n", + "COLLECTION_NAME = os.getenv('COLLECTION_NAME') or input('Enter your collection name (default: deepseek): ') or 'deepseek'\n", + "CACHE_COLLECTION = os.getenv('CACHE_COLLECTION') or input('Enter your cache collection name (default: cache): ') or 'cache'\n", + "\n", + "# Check if required credentials are set\n", + "required_creds = {\n", + " 'OPENAI_API_KEY': OPENAI_API_KEY,\n", + " 'CB_HOST': CB_HOST,\n", + " 'CB_USERNAME': CB_USERNAME,\n", + " 'CB_PASSWORD': CB_PASSWORD,\n", + " 'CB_BUCKET_NAME': CB_BUCKET_NAME\n", + "}\n", + "\n", + "# Add the API key that was chosen\n", + "if DEEPSEEK_API_KEY:\n", + " required_creds['DEEPSEEK_API_KEY'] = DEEPSEEK_API_KEY\n", + "elif OPENROUTER_API_KEY:\n", + " required_creds['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n", + "else:\n", + " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")\n", + "\n", + "for cred_name, cred_value in required_creds.items():\n", + " if not cred_value:\n", + " raise ValueError(f\"{cred_name} is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Connecting to the Couchbase Cluster\n", + "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. 
By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:39:18,465 - INFO - Successfully connected to Couchbase\n" + ] + } + ], + "source": [ + "try:\n", + " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", + " options = ClusterOptions(auth)\n", + " cluster = Cluster(CB_HOST, options)\n", + " cluster.wait_until_ready(timedelta(seconds=5))\n", + " logging.info(\"Successfully connected to Couchbase\")\n", + "except Exception as e:\n", + " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up Collections in Couchbase\n", + "\n", + "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", + "\n", + "1. Bucket Creation:\n", + " - Checks if specified bucket exists, creates it if not\n", + " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", + " - Note: If you are using Capella, create a bucket manually called vector-search-testing(or any name you prefer) with the same properties.\n", + "\n", + "2. Scope Management: \n", + " - Verifies if requested scope exists within bucket\n", + " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", + "\n", + "3. Collection Setup:\n", + " - Checks for collection existence within scope\n", + " - Creates collection if it doesn't exist\n", + " - Waits 2 seconds for collection to be ready\n", + "\n", + "Additional Tasks:\n", + "- Creates primary index on collection for query performance\n", + "- Clears any existing documents for clean state\n", + "- Implements comprehensive error handling and logging\n", + "\n", + "The function is called twice to set up:\n", + "1. Main collection for vector embeddings\n", + "2. Cache collection for storing results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:39:19,580 - INFO - Bucket 'vector-search-testing' exists.\n", + "2025-05-25 14:39:21,409 - INFO - Collection 'deepseek' already exists. Skipping creation.\n", + "2025-05-25 14:39:24,342 - INFO - Primary index present or created successfully.\n", + "2025-05-25 14:39:24,604 - INFO - All documents cleared from the collection.\n", + "2025-05-25 14:39:24,606 - INFO - Bucket 'vector-search-testing' exists.\n", + "2025-05-25 14:39:26,535 - INFO - Collection 'cache' already exists. 
Skipping creation.\n", + "2025-05-25 14:39:29,589 - INFO - Primary index present or created successfully.\n", + "2025-05-25 14:39:29,813 - INFO - All documents cleared from the collection.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", + " try:\n", + " # Check if bucket exists, create if it doesn't\n", + " try:\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", + " except Exception as e:\n", + " logging.info(f\"Bucket '{bucket_name}' does not exist. Creating it...\")\n", + " bucket_settings = CreateBucketSettings(\n", + " name=bucket_name,\n", + " bucket_type='couchbase',\n", + " ram_quota_mb=1024,\n", + " flush_enabled=True,\n", + " num_replicas=0\n", + " )\n", + " cluster.buckets().create_bucket(bucket_settings)\n", + " time.sleep(2) # Wait for bucket creation to complete and become available\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", + "\n", + " bucket_manager = bucket.collections()\n", + "\n", + " # Check if scope exists, create if it doesn't\n", + " scopes = bucket_manager.get_all_scopes()\n", + " scope_exists = any(scope.name == scope_name for scope in scopes)\n", + " \n", + " if not scope_exists and scope_name != \"_default\":\n", + " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_scope(scope_name)\n", + " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", + "\n", + " # Check if collection exists, create if it doesn't\n", + " collections = bucket_manager.get_all_scopes()\n", + " collection_exists = any(\n", + " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", + " for scope in collections\n", + " )\n", + "\n", + " if not collection_exists:\n", + " logging.info(f\"Collection '{collection_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_collection(scope_name, collection_name)\n", + " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", + " else:\n", + " logging.info(f\"Collection '{collection_name}' already exists. Skipping creation.\")\n", + "\n", + " # Wait for collection to be ready\n", + " collection = bucket.scope(scope_name).collection(collection_name)\n", + " time.sleep(2) # Give the collection time to be ready for queries\n", + "\n", + " # Ensure primary index exists\n", + " try:\n", + " cluster.query(f\"CREATE PRIMARY INDEX IF NOT EXISTS ON `{bucket_name}`.`{scope_name}`.`{collection_name}`\").execute()\n", + " logging.info(\"Primary index present or created successfully.\")\n", + " except Exception as e:\n", + " logging.warning(f\"Error creating primary index: {str(e)}\")\n", + "\n", + " # Clear all documents in the collection\n", + " try:\n", + " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", + " cluster.query(query).execute()\n", + " logging.info(\"All documents cleared from the collection.\")\n", + " except Exception as e:\n", + " logging.warning(f\"Error while clearing documents: {str(e)}. 
The collection might be empty.\")\n", + "\n", + " return collection\n", + " except Exception as e:\n", + " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", + " \n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading Couchbase Vector Search Index\n", + "\n", + "Semantic search requires an efficient way to retrieve relevant documents based on a user's query. This is where the Couchbase **Vector Search Index** comes into play. In this step, we load the Vector Search Index definition from a JSON file, which specifies how the index should be structured. This includes the fields to be indexed, the dimensions of the vectors, and other parameters that determine how the search engine processes queries based on vector similarity.\n", + "\n", + "This OpenRouter Deepseek vector search index configuration requires specific default settings to function properly. This tutorial uses the bucket named `vector-search-testing` with the scope `shared` and collection `deepseek`. The configuration is set up for vectors with exactly `1536 dimensions`, using dot product similarity and optimized for recall. If you want to use a different bucket, scope, or collection, you will need to modify the index configuration accordingly.\n", + "\n", + "For more information on creating a vector search index, please follow the [instructions](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " with open('deepseek_index.json', 'r') as file:\n", + " index_definition = json.load(file)\n", + "except Exception as e:\n", + " raise ValueError(f\"Error loading index definition: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Creating or Updating Search Indexes\n", + "\n", + "With the index definition loaded, the next step is to create or update the **Vector Search Index** in Couchbase. This step is crucial because it optimizes our database for vector similarity search operations, allowing us to perform searches based on the semantic content of documents rather than just keywords. By creating or updating a Vector Search Index, we enable our search engine to handle complex queries that involve finding semantically similar documents using vector embeddings, which is essential for a robust semantic search engine.Creating search indexes placeholder text." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:39:31,015 - INFO - Index 'vector_search_deepseek' found\n", + "2025-05-25 14:39:31,770 - INFO - Index 'vector_search_deepseek' already exists. 
Skipping creation/update.\n" + ] + } + ], + "source": [ + "try:\n", + " scope_index_manager = cluster.bucket(CB_BUCKET_NAME).scope(SCOPE_NAME).search_indexes()\n", + "\n", + " # Check if index already exists\n", + " existing_indexes = scope_index_manager.get_all_indexes()\n", + " index_name = index_definition[\"name\"]\n", + "\n", + " if index_name in [index.name for index in existing_indexes]:\n", + " logging.info(f\"Index '{index_name}' found\")\n", + " else:\n", + " logging.info(f\"Creating new index '{index_name}'...\")\n", + "\n", + " # Create SearchIndex object from JSON definition\n", + " search_index = SearchIndex.from_json(index_definition)\n", + "\n", + " # Upsert the index (create if not exists, update if exists)\n", + " scope_index_manager.upsert_index(search_index)\n", + " logging.info(f\"Index '{index_name}' successfully created/updated.\")\n", + "\n", + "except QueryIndexAlreadyExistsException:\n", + " logging.info(f\"Index '{index_name}' already exists. Skipping creation/update.\")\n", + "except ServiceUnavailableException:\n", + " raise RuntimeError(\"Search service is not available. Please ensure the Search service is enabled in your Couchbase cluster.\")\n", + "except InternalServerFailureException as e:\n", + " logging.error(f\"Internal server error: {str(e)}\")\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating the Embeddings client\n", + "This section creates an OpenAI embeddings client using the OpenAI API key.\n", + "The embeddings client is configured to use the \"text-embedding-3-small\" model,\n", + "which converts text into numerical vector representations.\n", + "These vector embeddings are essential for semantic search and similarity matching.\n", + "The client will be used by the vector store to generate embeddings for documents." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:39:32,003 - INFO - Successfully created OpenAI embeddings client\n" + ] + } + ], + "source": [ + "try:\n", + " embeddings = OpenAIEmbeddings(\n", + " api_key=OPENAI_API_KEY,\n", + " model=\"text-embedding-3-small\"\n", + " )\n", + " logging.info(\"Successfully created OpenAI embeddings client\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error creating OpenAI embeddings client: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up the Couchbase Vector Store\n", + "A vector store is where we'll keep our embeddings. Unlike the FTS index, which is used for text-based search, the vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the vector store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:39:35,246 - INFO - Successfully created vector store\n" + ] + } + ], + "source": [ + "try:\n", + " vector_store = CouchbaseSearchVectorStore(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=COLLECTION_NAME,\n", + " embedding=embeddings,\n", + " index_name=INDEX_NAME,\n", + " )\n", + " logging.info(\"Successfully created vector store\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create vector store: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the BBC News Dataset\n", + "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", + "\n", + "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:39:41,364 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded the BBC News dataset with 2687 rows\n" + ] + } + ], + "source": [ + "try:\n", + " news_dataset = load_dataset(\n", + " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", + " )\n", + " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", + " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up the Data\n", + "We will use the content of the news articles for our RAG system.\n", + "\n", + "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We have 1749 unique articles in our database.\n" + ] + } + ], + "source": [ + "news_articles = news_dataset[\"content\"]\n", + "unique_articles = set()\n", + "for article in news_articles:\n", + " if article:\n", + " unique_articles.add(article)\n", + "unique_news_articles = list(unique_articles)\n", + "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving Data to the Vector Store\n", + "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", + "\n", + "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", + "\n", + "This approach offers several benefits:\n", + "1. Memory Efficiency: Processing in smaller batches prevents memory overload\n", + "2. Progress Tracking: Easier to monitor and track the ingestion progress\n", + "3. Resource Management: Better control over CPU and network resource utilization\n", + "\n", + "We use a conservative batch size of 50 to ensure reliable operation.\n", + "The optimal batch size depends on many factors including:\n", + "- Document sizes being inserted\n", + "- Available system resources\n", + "- Network conditions\n", + "- Concurrent workload\n", + "\n", + "Consider measuring performance with your specific workload before adjusting.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:41:37,848 - INFO - Document ingestion completed successfully.\n" + ] + } + ], + "source": [ + "batch_size = 50\n", + "\n", + "# Automatic Batch Processing\n", + "articles = [article for article in unique_news_articles if article and len(article) <= 50000]\n", + "\n", + "try:\n", + " vector_store.add_texts(\n", + " texts=articles,\n", + " batch_size=batch_size\n", + " )\n", + " logging.info(\"Document ingestion completed successfully.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to save documents to vector store: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up a Couchbase Cache\n", + "To further optimize our system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", + "\n", + "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. 
By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:41:40,203 - INFO - Successfully created cache\n" + ] + } + ], + "source": [ + "try:\n", + " cache = CouchbaseCache(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=CACHE_COLLECTION,\n", + " )\n", + " logging.info(\"Successfully created cache\")\n", + " set_llm_cache(cache)\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create cache: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up the LLM Model\n", + "In this section, we set up the Large Language Model (LLM) for our RAG system. We're using the Deepseek model, which can be accessed through two different methods:\n", + "\n", + "1. **Deepseek API Key**: This is obtained directly from Deepseek's platform (https://deepseek.ai) by creating an account and subscribing to their API services. With this key, you can access Deepseek's models directly using the `ChatDeepSeek` class from the `langchain_deepseek` package.\n", + "\n", + "2. **OpenRouter API Key**: OpenRouter (https://openrouter.ai) is a service that provides unified access to multiple LLM providers, including Deepseek. You can obtain an API key by creating an account on OpenRouter's website. This approach uses the `ChatOpenAI` class from `langchain_openai` but with a custom base URL pointing to OpenRouter's API endpoint.\n", + "\n", + "The key difference is that OpenRouter acts as an intermediary service that can route your requests to various LLM providers, while the Deepseek API gives you direct access to only Deepseek's models. OpenRouter can be useful if you want to switch between different LLM providers without changing your code significantly.\n", + "\n", + "In our implementation, we check for both keys and prioritize using the Deepseek API directly if available, falling back to OpenRouter if not. 
The model is configured with temperature=0 to ensure deterministic, focused responses suitable for RAG applications.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:41:40,237 - INFO - Successfully created Deepseek LLM client through OpenRouter\n" + ] + } + ], + "source": [ + "from langchain_deepseek import ChatDeepSeek\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "if DEEPSEEK_API_KEY:\n", + " try:\n", + " llm = ChatDeepSeek(\n", + " api_key=DEEPSEEK_API_KEY,\n", + " model_name=\"deepseek-chat\",\n", + " temperature=0\n", + " )\n", + " logging.info(\"Successfully created Deepseek LLM client\")\n", + " except Exception as e:\n", + " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", + "elif OPENROUTER_API_KEY:\n", + " try:\n", + " llm = ChatOpenAI(\n", + " api_key=OPENROUTER_API_KEY,\n", + " base_url=\"https://openrouter.ai/api/v1\",\n", + " model=\"deepseek/deepseek-chat-v3.1\", \n", + " temperature=0,\n", + " )\n", + " logging.info(\"Successfully created Deepseek LLM client through OpenRouter\")\n", + " except Exception as e:\n", + " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", + "else:\n", + " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perform Semantic Search\n", + "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined.\n", + "\n", + "In the provided code, the search process begins by recording the start time, followed by executing the similarity_search_with_score method of the CouchbaseSearchVectorStore. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and a similarity score that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:41:41,802 - INFO - Semantic search completed in 1.56 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 1.56 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6303, Text: The Littler effect - how darts hit the bullseye\n", + "\n", + "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", + "\n", + "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", + "\n", + "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", + "\n", + "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. 
\"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", + "\n", + "Littler beat Luke Humphries to win the Premier League title in May\n", + "\n", + "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", + "\n", + "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. 
They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", + "• None Know a lot about Littler? Take our quiz\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6099, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", + "\n", + "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", + "\n", + "Littler was hugged by his parents after victory over Meikle\n", + "\n", + "Littler returned to Alexandra Palace to a boisterous reception from more than 3,000 spectators and delivered an astonishing display in the fourth set. He was on for a nine-darter after his opening two throws in both of the first two legs and completed the set in 32 darts - the minimum possible is 27. The teenager will next play after Christmas against European Championship winner Ritchie Edhouse, the 29th seed, or Ian White, and is seeded to meet Humphries in the semi-finals. Having entered last year's event ranked 164th, Littler is up to fourth in the world and will go to number two if he reaches the final again this time. He has won 10 titles in his debut professional year, including the Premier League and Grand Slam of Darts. 
After reaching the World Championship final as a debutant aged just 16, Littler's life has been transformed and interest in darts has rocketed. Google say he was the most searched-for athlete online in the UK during 2024. This Christmas, more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies and has prompted plans to expand the World Championship. Littler was named BBC Young Sports Personality of the Year on Tuesday and was runner-up to athlete Keely Hodgkinson for the main award.\n", + "\n", + "Nick Kenny will play world champion Luke Humphries in round three after Christmas\n", + "\n", + "Barneveld was shocked 3-1 by world number 76 Kenny, who was in tears after a famous victory. Kenny, 32, will face Humphries in round three after defeating the Dutchman, who won the BDO world title four times and the PDC crown in 2007. Van Barneveld, ranked 32nd, became the sixth seed to exit in the second round. His compatriot Noppert, the 13th seed, was stunned 3-1 by Joyce, who will face Ryan Searle or Matt Campbell next, with the winner of that tie potentially meeting Littler in the last 16. Elsewhere, 15th seed Chris Dobey booked his place in the third round with a 3-1 win over Alexander Merkx. Englishman Dobey concluded an afternoon session which started with a trio of 3-0 scorelines. Northern Ireland's Brendan Dolan beat Lok Yin Lee to set up a meeting with three-time champion Michael van Gerwen after Christmas. In the final two first-round matches of the 2025 competition, Wales' Rhys Griffin beat Karel Sedlacek of the Czech Republic before Asia number one Alexis Toylo cruised past Richard Veenstra.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5980, Text: Luke Littler is one of six contenders for the 2024 BBC Sports Personality of the Year award.\n", + "\n", + "Here BBC Sport takes a look at the darts player's year in five photos.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5590, Text: Littler is Young Sports Personality of the Year\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser.\n", + "\n", + "Darts player Luke Littler has been named BBC Young Sports Personality of the Year 2024. The 17-year-old has enjoyed a breakthrough year after finishing runner-up at the 2024 PDC World Darts Championship in January. The Englishman, who has won 10 senior titles on the Professional Darts Corporation tour this year, is the first darts player to claim the award. \"It shows how well I have done this year, not only for myself, but I have changed the sport of darts,\" Littler told BBC One. \"I know the amount of academies that have been brought up in different locations, tickets selling out at Ally Pally in hours and the Premier League selling out - it just shows how much I have changed it.\"\n", + "\n", + "He was presented with the trophy by Harry Aikines-Aryeetey - a former sprinter who won the award in 2005 - and ex-rugby union player Jodie Ounsley, both of whom are stars of the BBC television show Gladiators. Skateboarder Sky Brown, 16, and Para-swimmer William Ellard, 18, were also shortlisted for the award. Littler became a household name at the start of 2024 by reaching the World Championship final aged just 16 years and 347 days. 
That achievement was just the start of a trophy-laden year, with Littler winning the Premier League Darts, Grand Slam and World Series of Darts Finals among his haul of titles. Littler has gone from 164th to fourth in the world rankings and earned more than £1m in prize money in 2024. The judging panel for Young Sports Personality of the Year included Paralympic gold medallist Sammi Kinghorn, Olympic silver medal-winning BMX freestyler Keiran Reilly, television presenter Qasa Alom and Radio 1 DJ Jeremiah Asiamah, as well as representatives from the Youth Sport Trust, Blue Peter and BBC Sport.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5414, Text: Wright is the 17th seed at the World Championship\n", + "\n", + "Two-time champion Peter Wright won his opening game at the PDC World Championship, while Ryan Meikle edged out Fallon Sherrock to set up a match against teenage prodigy Luke Littler. Scotland's Wright, the 2020 and 2022 winner, has been out of form this year, but overcame Wesley Plaisier 3-1 in the second round at Alexandra Palace in London. \"It was this crowd that got me through, they wanted me to win. I thank you all,\" said Wright. Meikle came from a set down to claim a 3-2 victory in his first-round match against Sherrock, who was the first woman to win matches at the tournament five years ago. The 28-year-old will now play on Saturday against Littler, who was named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson on Tuesday night. Littler, 17, will be competing on the Ally Pally stage for the first time since his rise to stardom when finishing runner-up in January's world final to Luke Humphries. Earlier on Tuesday, World Grand Prix champion Mike de Decker – the 24th seed - suffered a surprise defeat to Luke Woodhouse in the second round. He is the second seed to exit following 16th seed James Wade's defeat on Monday to Jermaine Wattimena, who meets Wright in round three. Kevin Doets recovered from a set down to win 3-1 against Noa-Lynn van Leuven, who was making history as the first transgender woman to compete in the tournament.\n", + "\n", + "Sherrock drew level at 2-2 but lost the final set to Meikle\n", + "\n", + "The 54-year-old Wright only averaged 89.63 to his opponent's 93.77, but did enough to progress. Sporting a purple mohawk and festive outfit, crowd favourite 'Snakebite' showed glimpses of his best to win the first set and survived eight set darts to go 2-0 ahead. He lost the next but Dutchman Plaisier missed two more set darts in the fourth and Wright seized his opportunity. \"Wesley had his chances but he missed them and I took them,\" he said. \"He's got his tour card and he's going to be a dangerous player next year for all the players playing against him.\" Sherrock, 30, fought back from 2-1 down to force a decider against her English compatriot Meikle. She then narrowly missed the bull to take out 170 in the fourth leg before left-hander Meikle held his nerve to hit double 18 for a 96 finish to seal a hard-fought success. \"I felt under pressure from the start and to come through feels unbelievable,\" said Meikle. \"It's an unbelievable prize to play Luke here on this stage. It's the biggest stage of them all. I'm so happy.\" World number 81 Jeffrey de Graaf, who was born in the Netherlands but now represents Sweden, looked in trouble against Rashad Sweeting before prevailing 3-1. 
Sweeting, who was making history as the first player from the Bahamas to compete in the tournament, took the first set, but De Graaf fought back to clinch a second-round meeting with two-time champion Gary Anderson Germany's Ricardo Pietreczko, ranked 34, beat China's Xiaochen Zong 3-1 and will face Gian van Veen next.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5402, Text: Second seed Smith knocked out of Worlds by Doets\n", + "\n", + "Michael Smith was 2-1 ahead but fell to a shock exit\n", + "\n", + "Former champion Michael Smith has been sensationally knocked out of the PDC World Championship by Kevin Doets. Englishman Smith, seeded second, went down 3-2 after a pulsating second-round duel at Alexandra Palace in London. Dutchman Doets prevailed 6-4 in the deciding set, despite checkouts of 123, 84, 94 and 76 from 2023 champion Smith. \"This was the most stressful game of my life and I've won it, yes,\" said world number 51 Doets. \"I felt if I can keep my focus, I won't lose this. It was so very tight, to get over the line was amazing.\" Doets, 26, took the first set and fought back after going 2-1 down to avenge his narrow defeat to Smith at the same stage last year. Having lost in the second round of the tournament for the first time since 2020, the 34-year-old Smith will now drop to at least 15 in the rankings.\n", + "\n", + "Doets won in the first round against Noa-Lynn van Leuven, who was the first transgender woman to play in the tournament\n", + "\n", + "England's Scott Williams, who made a shock run to the semi-finals in the 2024 tournament before losing to eventual champion Luke Humphries, overcame Niko Springer 3-1 in a thriller. German debutant Springer, second on this year's development tour, won all three legs in the opening set before the match exploded into life. Williams hit back, showing his old swagger as he went ahead after a sensational third set which featured seven 180s. The 34-year-old edged the deciding leg in the fourth and will meet 2018 champion Rob Cross in round two on Monday. Nick Kenny delighted the Ally Pally crowd with a fabulous 170 finish to seal a 3-0 victory in round one over American Stowe Buntz. The Welshman, 31, will face five-time world champion Raymond Barneveld on Saturday evening on a bill which also features teenage star Luke Littler against Ryan Meikle. Canadian Matt Campbell set up a second-round match against Ryan Searle with a 3-2 defeat of Austrian Mensur Suljovic.\n", + "\n", + "England's Callan Rydz averaged 107.06 to book his place in the second round, before Gabriel Clemens was knocked out by Wales' Robert Owen on Thursday afternoon. Rydz beat Croatian Romeo Grbavac 3-0, recording the tournament's highest average first-round match average in its current 96-player format. It was the competition's 26th highest match average overall and comfortably the best so far at the 2025 event. The previous record was held by teenager Luke Littler, who scored 106.12 at this stage last year. Rydz, from Bedlington in Northumberland, meets Germany's Martin Schindler in the second round on Sunday evening. The afternoon session concluded with Germany's 27th seed Clemens being beaten by Owen, who is ranked 50 places below him. Owen recorded a 3-1 victory, his second in a matter of days, to reach the third round, which begins on 27 December. Hong Kong's Lok Yin Lee came from a set down to beat Chris Landman 3-1 after winning nine straight legs. 
Lee will face Northern Ireland's Brendan Dolan in round two on Saturday afternoon. Meanwhile, 2024 Grand Slam of Darts runner-up Martin Lukeman came from a set down to beat Indian qualifier Nitin Kumar 3-1. Lukeman meets 21st seed Andrew Gilding on Monday afternoon for a place in the last 32.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5328, Text: Cross loses as record number of seeds out of Worlds\n", + "\n", + "Rob Cross has suffered three second-round exits in his eight World Championships\n", + "\n", + "Former champion Rob Cross became the latest high-profile casualty as a record-breaking 14th seed exited the PDC World Darts Championship in the second round. The number five seed was beaten 3-1 by close friend Scott Williams, who will face Germany's Ricardo Pietreczko in round three. Cross, who won the event on his debut in 2018, took the opening set but failed to reach anywhere near his best as he suffered his third second-round exit. He was joined by number six seed David Chisnall, who was beaten 3-2 in a sudden-death leg by Ricky Evans, who came into the tournament 46th in the PDC's Order of Merit. The 2021 semi-finalist won the opening set, but then found himself 2-1 down to an inspired Evans, who was cheered on relentlessly by the Alexandra Palace crowd. He forced the game into a deciding set and faced match dart but Evans missed bullseye by the width of the wire. Chisnall then missed his own match dart on double tops, before he made a miscalculation when attempting to checkout 139 at 5-4 down. No real harm was done with a sudden-death leg forced but he was unable to hold off Evans, who reaches the third round for the third time in the last five years. \"It's not even what it is, again I've played a world-class darts player. I've played quite well and won,\" Evans told Sky Sports. \"Look at this [the crowd], wow. I don't understand it, why are they cheering me on? \"I don't get this reception in my household. Thank you very much. You've made a very fat guy very happy.\" Evans will face unseeded Welshman Robert Owen when the third round starts after the three-day Christmas break.\n", + "\n", + "World youth champion Gian van Veen had become the 12th seed to be knocked out when he lost 3-1 to Pietreczko. The 28th seed lost the opening set, having missed nine darts at double, but levelled. However, the Dutchman was unable to match Pietreczko, who closed out a comfortable win with a checkout percentage of 55.6%. Pietreczko said: \"I am over the moon to win. It is very important for me to be in the third round after Christmas. I love the big stage.\" The 26th seed trailed 1-0 and 2-1, and both players went on to miss match darts, before Gurney won the final set 3-1 on legs.\n", + "\n", + "Jonny Clayton is into the third round of the PDC World Darts Championship for a sixth consecutive year\n", + "\n", + "In the afternoon session, Welsh number seven seed Jonny Clayton also needed sudden death to pull off a sensational final-set comeback against Mickey Mansell in. He was a leg away from defeat twice to his Northern Irish opponent, but came from behind to win the final set 6-5 in a sudden-death leg to win 3-2. Clayton, who will play Gurney in round three, lost the opening set of the match, but fought back to lead 2-1, before being pegged back again by 51-year-old Mansell, who then missed match darts on double tops in the deciding set. \"I was very emotional. 
I've got to be honest, that meant a lot,\" said Clayton, who is in the favourable half of the draw following shock second-round exits for former world champions Michael Smith and Gary Anderson. \"I had chances before and Mickey definitely had chances before. It wasn't great to play in, not the best - I wouldn't wish that on my worst enemy. \"There is a lot of weight off my shoulders after that. I know there is another gear or two in the bank, but I'll be honest that meant a lot to me, it is a tester and will try and make me believe again.\" Clayton was 2-0 down in the fifth set after consecutive 136 and 154 checkouts from Mansell, but won three legs on the trot in 15, 12 and 10 darts to wrestle a 3-2 lead. He missed three darts for the match, before his unseeded opponent held and broke Clayton's throw to lead 4-3. Mansell missed a match dart at double 20, before Clayton won on double five after two missed checkouts. Elsewhere, Northern Ireland's Josh Rock booked his place in the third round against England's Chris Dobey with a 3-0 win over Wales' Rhys Griffin. Martin Lukeman, runner-up to Luke Littler at the Grand Slam of Darts last month, is out after a 3-1 loss to number 21 seed Andrew Gilding. The final day before the Christmas break started with Poland's number 31 seed Krzysztof Ratajski recording a 3-1 win over Alexis Toylo of the Philippines.\n", + "\n", + "All times are GMT and subject to change. Two fourth-round matches will also be played\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5116, Text: Michael van Gerwen has made just one major ranking event final in 2024\n", + "\n", + "Michael van Gerwen enjoyed a comfortable 3-0 victory over English debutant James Hurrell in his opening match of the PDC World Darts Championship. The three-time world champion has had a tough year by his standards, having fallen behind Luke Littler and Luke Humphries, so a relatively stress-free opening match at Alexandra Palace was just what was needed. Hurrell, 40, offered some resistance early on when taking the opening leg of the match, but he would win just two more as Van Gerwen proved far too strong. The third-seeded Dutchman averaged 94.85, took out two three-figure checkouts and hit 50% of his doubles - with six of his nine misses coming in one scrappy leg. Van Gerwen, 35, will now face either Brendan Dolan or Lok Yin Lee in the third round.\n", + "\n", + "\"I think I played OK,\" Van Gerwen told Sky Sports after his match. \"Of course, I was a bit nervous. Like everyone knows it's been a tough year for me. \"Overall, it was a good performance. I was confident. I won the game, that's the main thing.\" Also on Friday night, Germany's Florian Hempel showed why he loves playing on the Alexandra Palace stage with a thrilling 3-1 victory in a high-quality contest against Jeffrey de Zwaan. Both men hit seven 180s in a match played at a fast and furious pace, but 34-year-old Hempel's superior doubles gave him a fourth straight first-round victory in the competition. Hempel moves on to a tie with 26th seed Daryl Gurney but it was a damaging loss for De Zwaan, 28, who came through a late qualifier in November and needed a good run here to keep his PDC tour card for next season. 
Mickey Mansell earned a second-round date with world number seven Jonny Clayton after a scrappy 3-1 win over Japan's Tomoya Goto, while Dylan Slevin came through an all-Irish tie against William O'Connor to progress to a meeting with Dimitri van den Bergh.\n", + "\n", + "Stephen Bunting is in the third round of the PDC World Darts Championship for a third consecutive year\n", + "\n", + "In the afternoon session, Stephen Bunting came from behind to beat Kai Gotthardt 3-1 and book his place in the third round. Englishman Bunting, ranked eighth in the world, dropped the first set and almost went 2-0 down in the match before staging an impressive recovery. Tournament debutant Gotthardt missed three darts at double eight to win the second set, allowing Bunting to take out double 10 to level the match before powering away to victory by winning the third and fourth sets without losing a leg. Victory for \"The Bullet\" sets up a last 32 meeting with the winner of Dirk van Duijvenbode's meeting with Madars Razma after Christmas. Should Bunting progress further, he is seeded to face world number one and defending world champion Luke Humphries in the quarter-finals on New Year's Day. Elsewhere in Friday afternoon's session, the Dutch duo of Alexander Merkx and Wessel Nijman advanced to the second round with wins over Stephen Burton and Cameron Carolissen respectively. England's Ian White was handed a walkover victory against Sandro Eric Sosing of the Philippines. Sosing withdrew from the competition on medical grounds and was taken to hospital following chest pains.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5113, Text: Gary Anderson was the fifth seed to be beaten on Sunday\n", + "\n", + "Two-time champion Gary Anderson has been dumped out of the PDC World Championship on his 54th birthday by Jeffrey de Graaf. The Scot, winner in 2015 and 2016, lost 3-0 to the Swede in a second-round shock at Alexandra Palace in London. \"Gary didn't really show up as he usually does. I'm very happy with the win,\" said De Graaf, 34, who had a 75% checkout success and began with an 11-dart finish. \"It's a dream come true for me. He's been my idol since I was 14 years old.\" Anderson, ranked 14th, became the 11th seed to be knocked out from the 24 who have played so far, and the fifth to fall on Sunday.\n", + "\n", + "He came into the competition with the year's highest overall three-dart average of 99.66 but hit just three of his 20 checkout attempts to lose his opening match of the tournament for the first time. De Graaf will now meet Filipino qualifier Paolo Nebrida after he stunned England's Ross Smith, the 19th seed, in straight sets. Ritchie Edhouse, Dirk van Duijvenbode and Martin Schindler were the other seeds beaten on day eight. England's Callan Rydz, who hit a record first-round average of 107.06 on Thursday, followed up with a 3-0 win over 23rd seed Schindler on Sunday. The German missed double 12 for a nine-darter in the first set – the third player to do so in 24 hours after Luke Littler and Damon Heta – and ended up losing the leg. Rydz next meets Belgian Dimitri van den Bergh, who hit six 180s and averaged 96 in a 3-0 win over Irishman Dylan Slevin.\n", + "\n", + "England's Joe Cullen abruptly left his post-match news conference and accused the media of not showing him respect after his 3-0 win over Dutchman Wessel Nijman. 
Nijman, who has previously served a ban for breaching betting and anti-corruption rules, had been billed as favourite beforehand to beat 23rd seed Cullen. \"Honestly, the media attention that Wessel's got, again this is not a reflection on him,\" Cullen said. \"He seems like a fantastic kid, he's been caught up in a few things beforehand, but he's served his time and he's held his hands up, like a lot haven't. \"I think the way I've been treated probably with the media and things like that - I know you guys have no control over the bookies - I've been shown no respect, so I won't be showing any respect to any of you guys tonight. \"I'm going to go home. Cheers.\" Ian 'Diamond' White beat European champion and 29th seed Edhouse 3-1 and will face teenage star Littler in the next round. White, born in the same Cheshire town as the 17-year-old, acknowledged he would need to up his game in round three. Asked if he knew who was waiting for him, White joked: \"Yeah, Runcorn's number two. I'm from Runcorn and I'm number one.\" Ryan Searle started Sunday afternoon's action off with a 10-dart leg and went on to beat Matt Campbell 3-0, while Latvian Madars Razma defeated 25th seed Van Duijvenbode 3-1. Seventh seed Jonny Clayton and 2018 champion Rob Cross are among the players in action on Monday as the second round concludes. The third round will start on Friday after a three-day break for Christmas.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5105, Text: Christian Kist was sealing his first televised nine-darter\n", + "\n", + "Christian Kist hit a nine-darter but lost his PDC World Championship first-round match to Madars Razma. The Dutchman became the first player to seal a perfect leg in the tournament since Michael Smith did so on the way to beating Michael van Gerwen in the 2023 final. Kist, the 2012 BDO world champion at Lakeside, collects £60,000 for the feat, with the same amount being awarded by sponsors to a charity and to one spectator inside Alexandra Palace in London. The 38-year-old's brilliant finish sealed the opening set, but his Latvian opponent bounced back to win 3-1. Darts is one of the few sports that can measure perfection; snooker has the 147 maximum break, golf has the hole-in-one, darts has the nine-dart finish. Kist scored two maximum 180s to leave a 141 checkout which he completed with a double 12, to the delight of more than 3,000 spectators. The English 12th seed, who has been troubled by wrist and back injuries, could next play Andrew Gilding in the third round - which begins on 27 December - should Gilding beat the winner of Martin Lukeman's match against qualifier Nitin Kumar. Aspinall faces a tough task to reach the last four again, with 2018 champion Rob Cross and 2024 runner-up Luke Littler both in his side of the draw.\n", + "\n", + "Kist - who was knocked out of last year's tournament by teenager Littler - will still earn a bigger cheque than he would have got for a routine run to the quarter-finals. His nine-darter was the 15th in the history of the championship and first since the greatest leg in darts history when Smith struck, moments after Van Gerwen just missed his attempt. Darts fan Kris, a railway worker from Sutton in south London, was the random spectator picked out to receive £60,000, with Prostate Cancer UK getting the same sum from tournament sponsors Paddy Power. \"I'm speechless to be honest. I didn't expect it to happen to me,\" Kris said. 
\"This was a birthday present so it makes it even better. My grandad got me tickets. It was just a normal day - I came here after work.\" Kist said: \"Hitting the double 12 felt amazing. It was a lovely moment for everyone and I hope Kris enjoys the money. Maybe I will go on vacation next month.\" Earlier, Jim Williams was favourite against Paolo Nebrida but lost 3-2 in an epic lasting more than an hour. The Filipino took a surprise 2-1 lead and Williams only went ahead for the first time in the opening leg of the deciding set. The Welshman looked on course for victory but missed five match darts. UK Open semi-finalist Ricky Evans set up a second-round match against Dave Chisnall, checking out on 109 to edge past Gordon Mathers 3-2.\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", + "\n", + "try:\n", + " # Perform the semantic search\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=10)\n", + " search_elapsed_time = time.time() - start_time\n", + "\n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + "\n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\" * 80)\n", + "\n", + " for doc, score in search_results:\n", + " print(f\"Score: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\" * 80)\n", + "\n", + "except CouchbaseException as e:\n", + " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", + "except Exception as e:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", + "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query’s embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", + "\n", + "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase’s efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-25 14:41:41,810 - INFO - Successfully created RAG chain\n" + ] + } + ], + "source": [ + "# Create RAG prompt template\n", + "rag_prompt = ChatPromptTemplate.from_messages([\n", + " (\"system\", \"You are a helpful assistant that answers questions based on the provided context.\"),\n", + " (\"human\", \"Context: {context}\\n\\nQuestion: {question}\")\n", + "])\n", + "\n", + "# Create RAG chain\n", + "rag_chain = (\n", + " {\"context\": vector_store.as_retriever(), \"question\": RunnablePassthrough()}\n", + " | rag_prompt\n", + " | llm\n", + " | StrOutputParser()\n", + ")\n", + "logging.info(\"Successfully created RAG chain\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RAG Response: In his recent 2025 PDC World Championship second-round match against Ryan Meikle, **Luke Littler** achieved several notable milestones and records:\n", + "\n", + "1. **Tournament Record Set Average**: \n", + " - Littler hit a **140.91 set average** in the fourth set, the highest ever recorded in the tournament for a single set. This included three consecutive legs finished in 11, 10, and 11 darts.\n", + "\n", + "2. **Near Nine-Darter**: \n", + " - He narrowly missed a nine-dart finish (the pinnacle of darts perfection) by millimeters when he failed to land double 12 in the fourth set.\n", + "\n", + "3. **Overall Performance**: \n", + " - Despite a slow start and admitted nerves, he secured a **3-1 victory** with a dominant fourth set, hitting **four maximum 180s** and maintaining an overall match average of **100.85**.\n", + "\n", + "4. **Emotional Impact**: \n", + " - The 17-year-old became emotional post-match, cutting short his on-stage interview due to the intensity of the moment, later calling it the \"toughest game\" he’d ever played.\n", + "\n", + "These achievements highlight his resilience and skill, further cementing his status as a rising star in darts.\n", + "RAG response generated in 21.84 seconds\n" + ] + } + ], + "source": [ + "try:\n", + " start_time = time.time()\n", + " rag_response = rag_chain.invoke(query)\n", + " rag_elapsed_time = time.time() - start_time\n", + "\n", + " print(f\"RAG Response: {rag_response}\")\n", + " print(f\"RAG response generated in {rag_elapsed_time:.2f} seconds\")\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Couchbase as a caching mechanism\n", + "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. 
When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", + "\n", + "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Query 1: What happened in the match between Fullham and Liverpool?\n", + "Response: In the match between Fulham and Liverpool, the game ended in a 2-2 draw. Key highlights include:\n", + "\n", + "1. **Red Card Incident**: Liverpool played most of the match with 10 men after Andy Robertson received a red card in the 17th minute for denying a goalscoring opportunity. He had earlier been injured in a tackle by Fulham's Issa Diop.\n", + "\n", + "2. **Comeback Resilience**: Despite the numerical disadvantage, Liverpool twice came from behind. Diogo Jota scored an 86th-minute equalizer to secure a point. Fulham's Antonee Robinson praised Liverpool, noting it \"didn’t feel like they had 10 men\" due to their aggressive, high-pressing approach.\n", + "\n", + "3. **Performance Metrics**: Liverpool dominated possession (over 60%) and led in key attacking stats (shots, big chances, touches in the opposition box), showcasing their determination even with a player deficit.\n", + "\n", + "4. **Manager & Player Reactions**: \n", + " - Manager Arne Slot commended his team’s \"outstanding\" character and resilience, particularly highlighting Robertson’s effort despite the red card.\n", + " - Captain Virgil van Dijk emphasized the team’s ability to \"stay calm\" and fight back under pressure.\n", + "\n", + "5. **League Impact**: The draw extended Liverpool’s lead at the top of the Premier League to five points, as rivals Arsenal also dropped points. Pundits, including Chris Sutton, lauded Liverpool’s \"phenomenal\" response to adversity. \n", + "\n", + "Fulham’s strong performance, described as \"brave,\" was also acknowledged, making the match a thrilling encounter between both sides.\n", + "Time taken: 14.14 seconds\n", + "\n", + "Query 2: What were Luke Littler's key achievements and records in his recent PDC World Championship match?\n", + "Response: In his recent 2025 PDC World Championship second-round match against Ryan Meikle, **Luke Littler** achieved several notable milestones and records:\n", + "\n", + "1. **Tournament Record Set Average**: \n", + " - Littler hit a **140.91 set average** in the fourth set, the highest ever recorded in the tournament for a single set. This included three consecutive legs finished in 11, 10, and 11 darts.\n", + "\n", + "2. **Near Nine-Darter**: \n", + " - He narrowly missed a nine-dart finish (the pinnacle of darts perfection) by millimeters when he failed to land double 12 in the fourth set.\n", + "\n", + "3. 
**Overall Performance**: \n", + " - Despite a slow start and admitted nerves, he secured a **3-1 victory** with a dominant fourth set, hitting **four maximum 180s** and maintaining an overall match average of **100.85**.\n", + "\n", + "4. **Emotional Impact**: \n", + " - The 17-year-old became emotional post-match, cutting short his on-stage interview due to the intensity of the moment, later calling it the \"toughest game\" he’d ever played.\n", + "\n", + "These achievements highlight his resilience and skill, further cementing his status as a rising star in darts.\n", + "Time taken: 1.82 seconds\n", + "\n", + "Query 3: What happened in the match between Fullham and Liverpool?\n", + "Response: In the match between Fulham and Liverpool, the game ended in a 2-2 draw. Key highlights include:\n", + "\n", + "1. **Red Card Incident**: Liverpool played most of the match with 10 men after Andy Robertson received a red card in the 17th minute for denying a goalscoring opportunity. He had earlier been injured in a tackle by Fulham's Issa Diop.\n", + "\n", + "2. **Comeback Resilience**: Despite the numerical disadvantage, Liverpool twice came from behind. Diogo Jota scored an 86th-minute equalizer to secure a point. Fulham's Antonee Robinson praised Liverpool, noting it \"didn’t feel like they had 10 men\" due to their aggressive, high-pressing approach.\n", + "\n", + "3. **Performance Metrics**: Liverpool dominated possession (over 60%) and led in key attacking stats (shots, big chances, touches in the opposition box), showcasing their determination even with a player deficit.\n", + "\n", + "4. **Manager & Player Reactions**: \n", + " - Manager Arne Slot commended his team’s \"outstanding\" character and resilience, particularly highlighting Robertson’s effort despite the red card.\n", + " - Captain Virgil van Dijk emphasized the team’s ability to \"stay calm\" and fight back under pressure.\n", + "\n", + "5. **League Impact**: The draw extended Liverpool’s lead at the top of the Premier League to five points, as rivals Arsenal also dropped points. Pundits, including Chris Sutton, lauded Liverpool’s \"phenomenal\" response to adversity. \n", + "\n", + "Fulham’s strong performance, described as \"brave,\" was also acknowledged, making the match a thrilling encounter between both sides.\n", + "Time taken: 1.52 seconds\n" + ] + } + ], + "source": [ + "try:\n", + " queries = [\n", + " \"What happened in the match between Fullham and Liverpool?\",\n", + " \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\", # Repeated query\n", + " \"What happened in the match between Fullham and Liverpool?\", # Repeated query\n", + " ]\n", + "\n", + " for i, query in enumerate(queries, 1):\n", + " print(f\"\\nQuery {i}: {query}\")\n", + " start_time = time.time()\n", + "\n", + " response = rag_chain.invoke(query)\n", + " elapsed_time = time.time() - start_time\n", + " print(f\"Response: {response}\")\n", + " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", + "\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. 
Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "By following these steps, you'll have a fully functional semantic search engine that leverages the strengths of Couchbase and Deepseek(via Openrouter). This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/openrouter-deepseek/fts/deepseek_index.json b/openrouter-deepseek/fts/deepseek_index.json new file mode 100644 index 00000000..a16840c8 --- /dev/null +++ b/openrouter-deepseek/fts/deepseek_index.json @@ -0,0 +1,73 @@ +{ + "type": "fulltext-index", + "name": "vector_search_deepseek", + "uuid": "", + "sourceType": "gocbcore", + "sourceName": "vector-search-testing", + "planParams": { + "maxPartitionsPerPIndex": 64, + "indexPartitions": 16 + }, + "params": { + "doc_config": { + "docid_prefix_delim": "", + "docid_regexp": "", + "mode": "scope.collection.type_field", + "type_field": "type" + }, + "mapping": { + "analysis": {}, + "default_analyzer": "standard", + "default_datetime_parser": "dateTimeOptional", + "default_field": "_all", + "default_mapping": { + "dynamic": true, + "enabled": false + }, + "default_type": "_default", + "docvalues_dynamic": false, + "index_dynamic": true, + "store_dynamic": false, + "type_field": "_type", + "types": { + "shared.deepseek": { + "dynamic": true, + "enabled": true, + "properties": { + "embedding": { + "dynamic": false, + "enabled": true, + "fields": [ + { + "dims": 1536, + "index": true, + "name": "embedding", + "similarity": "dot_product", + "type": "vector", + "vector_index_optimized_for": "recall" + } + ] + }, + "text": { + "dynamic": false, + "enabled": true, + "fields": [ + { + "index": true, + "name": "text", + "store": true, + "type": "text" + } + ] + } + } + } + } + }, + "store": { + "indexType": "scorch", + "segmentVersion": 16 + } + }, + "sourceParams": {} +} diff --git a/openrouter-deepseek/fts/frontmatter.md b/openrouter-deepseek/fts/frontmatter.md new file mode 100644 index 00000000..3c33a3b9 --- /dev/null +++ b/openrouter-deepseek/fts/frontmatter.md @@ -0,0 +1,23 @@ +--- +# frontmatter +path: "/tutorial-openrouter-deepseek-with-fts" +title: Retrieval-Augmented Generation with Couchbase and OpenRouter Deepseek using FTS service +short_title: RAG with Couchbase and OpenRouter Deepseek using FTS service +description: + - Learn how to build a semantic search engine using Couchbase and OpenRouter with Deepseek using FTS service. 
+ - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with OpenRouter Deepseek as both embeddings and language model provider. + - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase. +content_type: tutorial +filter: sdk +technology: + - vector search +tags: + - FTS + - Artificial Intelligence + - LangChain + - Deepseek + - OpenRouter +sdk_language: + - python +length: 60 Mins +--- diff --git a/openrouter-deepseek/gsi/.env.sample b/openrouter-deepseek/gsi/.env.sample new file mode 100644 index 00000000..0cab2fc6 --- /dev/null +++ b/openrouter-deepseek/gsi/.env.sample @@ -0,0 +1,13 @@ +DEEPSEEK_API_KEY="" +OPENAI_API_KEY="" +OPENROUTER_API_KEY="" + +# Couchbase Settings +CB_HOST=couchbase://localhost +CB_USERNAME=Administrator +CB_PASSWORD=password +CB_BUCKET_NAME=query-vector-search-testing + +SCOPE_NAME=shared +COLLECTION_NAME=deepseek +CACHE_COLLECTION=cache diff --git a/openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb b/openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb new file mode 100644 index 00000000..cbde7f8c --- /dev/null +++ b/openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb @@ -0,0 +1,1089 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction \n", + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database, [Deepseek V3 as the language model provider (via OpenRouter or direct API)](https://deepseek.ai/), and OpenAI for embeddings. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using GSI (Global Secondary Index) from scratch. Alternatively, if you want to perform semantic search using the FTS index, please take a look at [this tutorial](https://developer.couchbase.com/tutorial-openrouter-deepseek-with-fts/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to run this tutorial\n", + "\n", + "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/openrouter-deepseek/gsi/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb).\n", + "\n", + "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Before you start\n",
+    "\n",
+    "## Get Credentials for OpenRouter and Deepseek\n",
+    "* Sign up for an account at [OpenRouter](https://openrouter.ai/) to get your API key\n",
+    "* OpenRouter provides access to Deepseek models, so no separate Deepseek credentials are needed\n",
+    "* Store your OpenRouter API key securely as it will be used to access the models\n",
+    "* For [Deepseek](https://deepseek.ai/) models, you can use the default models provided by OpenRouter\n",
+    "\n",
+    "## Create and Deploy Your Free Tier Operational cluster on Capella\n",
+    "\n",
+    "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n",
+    "\n",
+    "To learn more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n",
+    "\n",
+    "Note: To run this tutorial, you will need Capella with Couchbase Server version 8.0 or above, as GSI vector search is supported only from version 8.0.\n",
+    "\n",
+    "### Couchbase Capella Configuration\n",
+    "\n",
+    "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n",
+    "\n",
+    "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the required bucket (Read and Write) used in the application.\n",
+    "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setting the Stage: Installing Necessary Libraries\n",
+    "\n",
+    "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install --quiet datasets==3.5.0 langchain-couchbase==0.5.0 langchain-deepseek==0.1.3 langchain-openai==0.3.13 python-dotenv==1.1.1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Importing Necessary Libraries\n",
+    "\n",
+    "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading."
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import json\n", + "import logging\n", + "import os\n", + "import time\n", + "from datetime import timedelta\n", + "\n", + "from couchbase.auth import PasswordAuthenticator\n", + "from couchbase.cluster import Cluster\n", + "from couchbase.exceptions import (CouchbaseException,\n", + " InternalServerFailureException,\n", + " QueryIndexAlreadyExistsException,ServiceUnavailableException)\n", + "from couchbase.management.buckets import CreateBucketSettings\n", + "from couchbase.management.search import SearchIndex\n", + "from couchbase.options import ClusterOptions\n", + "from datasets import load_dataset\n", + "from dotenv import load_dotenv\n", + "from langchain_core.globals import set_llm_cache\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts.chat import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from langchain_couchbase.cache import CouchbaseCache\n", + "from langchain_couchbase.vectorstores import CouchbaseQueryVectorStore\n", + "from langchain_couchbase.vectorstores import DistanceStrategy\n", + "from langchain_couchbase.vectorstores import IndexType\n", + "from langchain_openai import OpenAIEmbeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup Logging\n", + "Logging is configured to track the progress of the script and capture any errors or warnings." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)\n", + "\n", + "# Suppress httpx logging\n", + "logging.getLogger('httpx').setLevel(logging.CRITICAL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Variables and Configuration\n", + "\n", + "This section handles loading and validating environment variables and configuration settings:\n", + "#\n", + "1. API Keys:\n", + " - Supports either direct Deepseek API or OpenRouter API access\n", + " - Prompts for API key input if not found in environment\n", + " - Requires OpenAI API key for embeddings\n", + "#\n", + "2. 
Couchbase Settings:\n", + " - Connection details (host, username, password)\n", + " - Bucket, scope and collection names\n", + " - Vector search index configuration\n", + " - Cache collection settings\n", + "#\n", + "The code validates that all required credentials are present before proceeding.\n", + "It allows flexible configuration through environment variables or interactive prompts,\n", + "with sensible defaults for local development.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables from .env file if it exists\n", + "load_dotenv(override= True)\n", + "\n", + "# API Keys\n", + "# Allow either Deepseek API directly or via OpenRouter\n", + "DEEPSEEK_API_KEY = os.getenv('DEEPSEEK_API_KEY')\n", + "OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')\n", + "\n", + "if not DEEPSEEK_API_KEY and not OPENROUTER_API_KEY:\n", + " api_choice = input('Choose API (1 for Deepseek direct, 2 for OpenRouter): ')\n", + " if api_choice == '1':\n", + " DEEPSEEK_API_KEY = getpass.getpass('Enter your Deepseek API Key: ')\n", + " else:\n", + " OPENROUTER_API_KEY = getpass.getpass('Enter your OpenRouter API Key: ')\n", + "\n", + "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or getpass.getpass('Enter your OpenAI API Key: ')\n", + "\n", + "# Couchbase Settings\n", + "CB_HOST = os.getenv('CB_HOST') or input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", + "CB_USERNAME = os.getenv('CB_USERNAME') or input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", + "CB_PASSWORD = os.getenv('CB_PASSWORD') or getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", + "CB_BUCKET_NAME = os.getenv('CB_BUCKET_NAME') or input('Enter your Couchbase bucket name (default: query-vector-search-testing): ') or 'query-vector-search-testing'\n", + "SCOPE_NAME = os.getenv('SCOPE_NAME') or input('Enter your scope name (default: shared): ') or 'shared'\n", + "COLLECTION_NAME = os.getenv('COLLECTION_NAME') or input('Enter your collection name (default: deepseek): ') or 'deepseek'\n", + "CACHE_COLLECTION = os.getenv('CACHE_COLLECTION') or input('Enter your cache collection name (default: cache): ') or 'cache'\n", + "\n", + "# Check if required credentials are set\n", + "required_creds = {\n", + " 'OPENAI_API_KEY': OPENAI_API_KEY,\n", + " 'CB_HOST': CB_HOST,\n", + " 'CB_USERNAME': CB_USERNAME,\n", + " 'CB_PASSWORD': CB_PASSWORD,\n", + " 'CB_BUCKET_NAME': CB_BUCKET_NAME\n", + "}\n", + "\n", + "# Add the API key that was chosen\n", + "if DEEPSEEK_API_KEY:\n", + " required_creds['DEEPSEEK_API_KEY'] = DEEPSEEK_API_KEY\n", + "elif OPENROUTER_API_KEY:\n", + " required_creds['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n", + "else:\n", + " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")\n", + "\n", + "for cred_name, cred_value in required_creds.items():\n", + " if not cred_value:\n", + " raise ValueError(f\"{cred_name} is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Connecting to the Couchbase Cluster\n", + "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. 
By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:40:27,133 - INFO - Successfully connected to Couchbase\n" + ] + } + ], + "source": [ + "try:\n", + " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", + " options = ClusterOptions(auth)\n", + " cluster = Cluster(CB_HOST, options)\n", + " cluster.wait_until_ready(timedelta(seconds=5))\n", + " logging.info(\"Successfully connected to Couchbase\")\n", + "except Exception as e:\n", + " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up Collections in Couchbase\n", + "\n", + "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", + "\n", + "1. Bucket Creation:\n", + " - Checks if specified bucket exists, creates it if not\n", + " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", + " - Note: If you are using Capella, create a bucket manually called vector-search-testing(or any name you prefer) with the same properties.\n", + "\n", + "2. Scope Management: \n", + " - Verifies if requested scope exists within bucket\n", + " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", + "\n", + "3. Collection Setup:\n", + " - Checks for collection existence within scope\n", + " - Creates collection if it doesn't exist\n", + " - Waits 2 seconds for collection to be ready\n", + "\n", + "Additional Tasks:\n", + "- Clears any existing documents for clean state\n", + "- Implements comprehensive error handling and logging\n", + "\n", + "The function is called twice to set up:\n", + "1. Main collection for vector embeddings\n", + "2. Cache collection for storing results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:41:01,398 - INFO - Bucket 'query-vector-search-testing' exists.\n", + "2025-09-17 15:41:01,410 - INFO - Collection 'deepseek' does not exist. Creating it...\n", + "2025-09-17 15:41:01,453 - INFO - Collection 'deepseek' created successfully.\n", + "2025-09-17 15:41:03,712 - INFO - All documents cleared from the collection.\n", + "2025-09-17 15:41:03,713 - INFO - Bucket 'query-vector-search-testing' exists.\n", + "2025-09-17 15:41:03,728 - INFO - Collection 'cache' already exists. Skipping creation.\n", + "2025-09-17 15:41:05,821 - INFO - All documents cleared from the collection.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", + " try:\n", + " # Check if bucket exists, create if it doesn't\n", + " try:\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", + " except Exception as e:\n", + " logging.info(f\"Bucket '{bucket_name}' does not exist. 
Creating it...\")\n", + " bucket_settings = CreateBucketSettings(\n", + " name=bucket_name,\n", + " bucket_type='couchbase',\n", + " ram_quota_mb=1024,\n", + " flush_enabled=True,\n", + " num_replicas=0\n", + " )\n", + " cluster.buckets().create_bucket(bucket_settings)\n", + " time.sleep(2) # Wait for bucket creation to complete and become available\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", + "\n", + " bucket_manager = bucket.collections()\n", + "\n", + " # Check if scope exists, create if it doesn't\n", + " scopes = bucket_manager.get_all_scopes()\n", + " scope_exists = any(scope.name == scope_name for scope in scopes)\n", + " \n", + " if not scope_exists and scope_name != \"_default\":\n", + " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_scope(scope_name)\n", + " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", + "\n", + " # Check if collection exists, create if it doesn't\n", + " collections = bucket_manager.get_all_scopes()\n", + " collection_exists = any(\n", + " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", + " for scope in collections\n", + " )\n", + "\n", + " if not collection_exists:\n", + " logging.info(f\"Collection '{collection_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_collection(scope_name, collection_name)\n", + " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", + " else:\n", + " logging.info(f\"Collection '{collection_name}' already exists. Skipping creation.\")\n", + "\n", + " # Wait for collection to be ready\n", + " collection = bucket.scope(scope_name).collection(collection_name)\n", + " time.sleep(2) # Give the collection time to be ready for queries\n", + "\n", + " # Clear all documents in the collection\n", + " try:\n", + " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", + " cluster.query(query).execute()\n", + " logging.info(\"All documents cleared from the collection.\")\n", + " except Exception as e:\n", + " logging.warning(f\"Error while clearing documents: {str(e)}. The collection might be empty.\")\n", + "\n", + " return collection\n", + " except Exception as e:\n", + " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", + " \n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating the Embeddings client\n", + "This section creates an OpenAI embeddings client using the OpenAI API key.\n", + "The embeddings client is configured to use the \"text-embedding-3-small\" model,\n", + "which converts text into numerical vector representations.\n", + "These vector embeddings are essential for semantic search and similarity matching.\n", + "The client will be used by the vector store to generate embeddings for documents." 
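+    ,
+    "\n",
+    "\n",
+    "As an optional sanity check (not part of the original tutorial flow), you can embed a short sample string and confirm that the vector length matches the 1536 dimensions produced by the \"text-embedding-3-small\" model; the sample text below is arbitrary:\n",
+    "\n",
+    "```python\n",
+    "# Optional sanity check: embed a sample string and inspect the vector size.\n",
+    "sample_vector = embeddings.embed_query(\"Couchbase vector search with Deepseek\")\n",
+    "print(f\"Embedding length: {len(sample_vector)}\")  # expected: 1536 for text-embedding-3-small\n",
+    "```"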
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:41:27,149 - INFO - Successfully created OpenAI embeddings client\n" + ] + } + ], + "source": [ + "try:\n", + " embeddings = OpenAIEmbeddings(\n", + " api_key=OPENAI_API_KEY,\n", + " model=\"text-embedding-3-small\"\n", + " )\n", + " logging.info(\"Successfully created OpenAI embeddings client\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error creating OpenAI embeddings client: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up the Couchbase Vector Store\n", + "A vector store is where we'll keep our embeddings. Unlike the FTS index, which is used for text-based search, the vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the vector store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:41:55,394 - INFO - Successfully created vector store\n" + ] + } + ], + "source": [ + "try:\n", + " vector_store = CouchbaseQueryVectorStore(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=COLLECTION_NAME,\n", + " embedding = embeddings,\n", + " distance_metric=DistanceStrategy.COSINE\n", + " )\n", + " logging.info(\"Successfully created vector store\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create vector store: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the BBC News Dataset\n", + "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", + "\n", + "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 15:42:04,530 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded the BBC News dataset with 2687 rows\n" + ] + } + ], + "source": [ + "try:\n", + " news_dataset = load_dataset(\n", + " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", + " )\n", + " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", + " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up the Data\n", + "We will use the content of the news articles for our RAG system.\n", + "\n", + "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We have 1749 unique articles in our database.\n" + ] + } + ], + "source": [ + "news_articles = news_dataset[\"content\"]\n", + "unique_articles = set()\n", + "for article in news_articles:\n", + " if article:\n", + " unique_articles.add(article)\n", + "unique_news_articles = list(unique_articles)\n", + "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving Data to the Vector Store\n", + "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", + "\n", + "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", + "\n", + "This approach offers several benefits:\n", + "1. Memory Efficiency: Processing in smaller batches prevents memory overload\n", + "2. Progress Tracking: Easier to monitor and track the ingestion progress\n", + "3. 
Resource Management: Better control over CPU and network resource utilization\n", + "\n", + "We use a conservative batch size of 50 to ensure reliable operation.\n", + "The optimal batch size depends on many factors including:\n", + "- Document sizes being inserted\n", + "- Available system resources\n", + "- Network conditions\n", + "- Concurrent workload\n", + "\n", + "Consider measuring performance with your specific workload before adjusting.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 16:08:51,054 - INFO - Document ingestion completed successfully.\n" + ] + } + ], + "source": [ + "batch_size = 50\n", + "\n", + "# Automatic Batch Processing\n", + "articles = [article for article in unique_news_articles if article and len(article) <= 50000]\n", + "\n", + "try:\n", + " vector_store.add_texts(\n", + " texts=articles,\n", + " batch_size=batch_size\n", + " )\n", + " logging.info(\"Document ingestion completed successfully.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to save documents to vector store: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up the LLM Model\n", + "In this section, we set up the Large Language Model (LLM) for our RAG system. We're using the Deepseek model, which can be accessed through two different methods:\n", + "\n", + "1. **Deepseek API Key**: This is obtained directly from Deepseek's platform (https://deepseek.ai) by creating an account and subscribing to their API services. With this key, you can access Deepseek's models directly using the `ChatDeepSeek` class from the `langchain_deepseek` package.\n", + "\n", + "2. **OpenRouter API Key**: OpenRouter (https://openrouter.ai) is a service that provides unified access to multiple LLM providers, including Deepseek. You can obtain an API key by creating an account on OpenRouter's website. This approach uses the `ChatOpenAI` class from `langchain_openai` but with a custom base URL pointing to OpenRouter's API endpoint.\n", + "\n", + "The key difference is that OpenRouter acts as an intermediary service that can route your requests to various LLM providers, while the Deepseek API gives you direct access to only Deepseek's models. OpenRouter can be useful if you want to switch between different LLM providers without changing your code significantly.\n", + "\n", + "In our implementation, we check for both keys and prioritize using the Deepseek API directly if available, falling back to OpenRouter if not. 
The model is configured with temperature=0 to ensure deterministic, focused responses suitable for RAG applications.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-18 11:18:25,192 - INFO - Successfully created Deepseek LLM client through OpenRouter\n" + ] + } + ], + "source": [ + "from langchain_deepseek import ChatDeepSeek\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "if DEEPSEEK_API_KEY:\n", + " try:\n", + " llm = ChatDeepSeek(\n", + " api_key=DEEPSEEK_API_KEY,\n", + " model_name=\"deepseek-chat\",\n", + " temperature=0\n", + " )\n", + " logging.info(\"Successfully created Deepseek LLM client\")\n", + " except Exception as e:\n", + " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", + "elif OPENROUTER_API_KEY:\n", + " try:\n", + " llm = ChatOpenAI(\n", + " api_key=OPENROUTER_API_KEY,\n", + " base_url=\"https://openrouter.ai/api/v1\",\n", + " model=\"deepseek/deepseek-chat-v3.1\", \n", + " temperature=0,\n", + " )\n", + " logging.info(\"Successfully created Deepseek LLM client through OpenRouter\")\n", + " except Exception as e:\n", + " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", + "else:\n", + " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Perform Semantic Search\n", + "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined. Common metrics include cosine similarity, Euclidean distance, or dot product, but other metrics can be implemented based on specific use cases. Different embedding models like BERT, Word2Vec, or GloVe can also be used depending on the application's needs, with the vectors generated by these models stored and searched within Couchbase itself.\n", + "\n", + "In the provided code, the search process begins by recording the start time, followed by executing the `similarity_search_with_score` method of the `CouchbaseQueryVectorStore`. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and the distance that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 16:11:07,177 - INFO - Semantic search completed in 2.46 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 2.46 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3693, Text: The Littler effect - how darts hit the bullseye\n", + "\n", + "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", + "\n", + "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", + "\n", + "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", + "\n", + "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. 
\"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", + "\n", + "Littler beat Luke Humphries to win the Premier League title in May\n", + "\n", + "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", + "\n", + "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. 
They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", + "• None Know a lot about Littler? Take our quiz\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3900, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", + "\n", + "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", + "\n", + "Littler was hugged by his parents after victory over Meikle\n", + "\n", + "... 
(output truncated for brevity)\n" + ] + } + ], + "source": [ + "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", + "\n", + "try:\n", + " # Perform the semantic search\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=10)\n", + " search_elapsed_time = time.time() - start_time\n", + "\n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + "\n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\" * 80)\n", + "\n", + " for doc, score in search_results:\n", + " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\" * 80)\n", + "\n", + "except CouchbaseException as e:\n", + " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", + "except Exception as e:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimizing Vector Search with Global Secondary Index (GSI)\n", + "\n", + "While the above semantic search using similarity_search_with_score works effectively, we can significantly improve query performance by leveraging Global Secondary Index (GSI) in Couchbase.\n", + "\n", + "Couchbase offers three types of vector indexes, but for GSI-based vector search we focus on two main types:\n", + "\n", + "Hyperscale Vector Indexes (BHIVE)\n", + "- Best for pure vector searches - content discovery, recommendations, semantic search\n", + "- High performance with low memory footprint - designed to scale to billions of vectors\n", + "- Optimized for concurrent operations - supports simultaneous searches and inserts\n", + "- Use when: You primarily perform vector-only queries without complex scalar filtering\n", + "- Ideal for: Large-scale semantic search, recommendation systems, content discovery\n", + "\n", + "Composite Vector Indexes \n", + "- Best for filtered vector searches - combines vector search with scalar value filtering\n", + "- Efficient pre-filtering - scalar attributes reduce the vector comparison scope\n", + "- Use when: Your queries combine vector similarity with scalar filters that eliminate large portions of data\n", + "- Ideal for: Compliance-based filtering, user-specific searches, time-bounded queries\n", + "\n", + "Choosing the Right Index Type\n", + "- Start with Hyperscale Vector Index for pure vector searches and large datasets\n", + "- Use Composite Vector Index when scalar filters significantly reduce your search space\n", + "- Consider your dataset size: Hyperscale scales to billions, Composite works well for tens of millions to billions\n", + "\n", + "For more information on GSI vector indexes, see [Couchbase GSI Vector Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html).\n", + "\n", + "\n", + "## Understanding Index Configuration (Couchbase 8.0 Feature)\n", + "\n", + "The index_description parameter controls how Couchbase optimizes vector storage and search performance through centroids and quantization:\n", + "\n", + "Format: `'IVF[],{PQ|SQ}'`\n", + "\n", + "Centroids (IVF - Inverted File):\n", + "- Controls how the dataset is subdivided for faster searches\n", + "- More centroids = faster search, slower training \n", + "- Fewer centroids = slower search, faster training\n", + "- If omitted (like IVF,SQ8), Couchbase auto-selects based on dataset size\n", + 
"\n", + "Quantization Options:\n", + "- SQ (Scalar Quantization): SQ4, SQ6, SQ8 (4, 6, or 8 bits per dimension)\n", + "- PQ (Product Quantization): PQx (e.g., PQ32x8)\n", + "- Higher values = better accuracy, larger index size\n", + "\n", + "Common Examples:\n", + "- IVF,SQ8 - Auto centroids, 8-bit scalar quantization (good default)\n", + "- IVF1000,SQ6 - 1000 centroids, 6-bit scalar quantization \n", + "- IVF,PQ32x8 - Auto centroids, 32 subquantizers with 8 bits\n", + "\n", + "For detailed configuration options, see the [Quantization & Centroid Settings](https://docs.couchbase.com/cloud/vector-index/hyperscale-vector-index.html#algo_settings).\n", + "\n", + "In the code below, we demonstrate creating a BHIVE index. This method takes an index type (BHIVE or COMPOSITE) and description parameter for optimization settings. Alternatively, GSI indexes can be created manually from the Couchbase UI." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store.create_index(index_type=IndexType.BHIVE, index_name=\"openrouterdeepseek_bhive_index\",index_description=\"IVF,SQ8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below shows running the same similarity search, but now using the BHIVE GSI index we created above. You'll notice improved performance as the index efficiently retrieves data.\n", + "\n", + "**Important**: When using Composite indexes, scalar filters take precedence over vector similarity, which can improve performance for filtered searches but may miss some semantically relevant results that don't match the scalar criteria.\n", + "\n", + "Note: In GSI vector search, the distance represents the vector distance between the query and document embeddings. Lower distance indicate higher similarity, while higher distance indicate lower similarity." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-18 11:17:19,626 - INFO - Semantic search completed in 0.88 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 0.88 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3694, Text: The Littler effect - how darts hit the bullseye\n", + "\n", + "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", + "\n", + "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. 
His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", + "\n", + "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", + "\n", + "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. \"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", + "\n", + "Littler beat Luke Humphries to win the Premier League title in May\n", + "\n", + "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. 
That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", + "\n", + "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", + "• None Know a lot about Littler? Take our quiz\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3901, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", + "\n", + "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. 
I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", + "\n", + "Littler was hugged by his parents after victory over Meikle\n", + "\n", + "... (output truncated for brevity)\n" + ] + } + ], + "source": [ + "\n", + "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", + "\n", + "try:\n", + " # Perform the semantic search\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=10)\n", + " search_elapsed_time = time.time() - start_time\n", + "\n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + "\n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\" * 80)\n", + "\n", + " for doc, score in search_results:\n", + " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\" * 80)\n", + "\n", + "except CouchbaseException as e:\n", + " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", + "except Exception as e:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: To create a COMPOSITE index, the below code can be used.\n", + "Choose based on your specific use case and query patterns. For this tutorial's news search scenario, either index type would work, but BHIVE might be more efficient for pure semantic search across news articles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store.create_index(index_type=IndexType.COMPOSITE, index_name=\"openrouterdeepseek_composite_index\", index_description=\"IVF,SQ8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up a Couchbase Cache\n", + "To further optimize our system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. 
In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", + "\n", + "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-17 16:10:11,473 - INFO - Successfully created cache\n" + ] + } + ], + "source": [ + "try:\n", + " cache = CouchbaseCache(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=CACHE_COLLECTION,\n", + " )\n", + " logging.info(\"Successfully created cache\")\n", + " set_llm_cache(cache)\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create cache: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", + "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query’s embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", + "\n", + "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase’s efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." 
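+    ,
+    "\n",
+    "\n",
+    "The chain below uses `vector_store.as_retriever()` with its default settings. As an optional variation (not part of the original tutorial), the standard LangChain retriever interface lets you control how many documents are passed to the language model, for example:\n",
+    "\n",
+    "```python\n",
+    "# Optional: configure the retriever to return a fixed number of documents (here 4).\n",
+    "retriever = vector_store.as_retriever(search_kwargs={\"k\": 4})\n",
+    "docs = retriever.invoke(\"What were Luke Littler's key achievements?\")\n",
+    "print(f\"Retrieved {len(docs)} documents\")\n",
+    "```"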
+ ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-18 11:18:34,032 - INFO - Successfully created RAG chain\n" + ] + } + ], + "source": [ + "# Create RAG prompt template\n", + "rag_prompt = ChatPromptTemplate.from_messages([\n", + " (\"system\", \"You are a helpful assistant that answers questions based on the provided context.\"),\n", + " (\"human\", \"Context: {context}\\n\\nQuestion: {question}\")\n", + "])\n", + "\n", + "# Create RAG chain\n", + "rag_chain = (\n", + " {\"context\": vector_store.as_retriever(), \"question\": RunnablePassthrough()}\n", + " | rag_prompt\n", + " | llm\n", + " | StrOutputParser()\n", + ")\n", + "logging.info(\"Successfully created RAG chain\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RAG Response: Based on the provided context, Luke Littler's key achievements and records in his recent PDC World Championship match (second-round win against Ryan Meikle) were:\n", + "\n", + "* **Tournament Record Set Average:** He hit a tournament record 140.91 set average during the match.\n", + "* **Near Nine-Darter:** He was \"millimetres away from a nine-darter\" when he missed double 12.\n", + "* **Dominant Final Set:** He won the fourth and final set in just 32 darts (the minimum possible is 27), which included hitting four maximum 180s and clinching three straight legs in 11, 10, and 11 darts.\n", + "* **Overall High Average:** He maintained a high overall match average of 100.85.\n", + "RAG response generated in 0.49 seconds\n" + ] + } + ], + "source": [ + "try:\n", + " start_time = time.time()\n", + " rag_response = rag_chain.invoke(query)\n", + " rag_elapsed_time = time.time() - start_time\n", + "\n", + " print(f\"RAG Response: {rag_response}\")\n", + " print(f\"RAG response generated in {rag_elapsed_time:.2f} seconds\")\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Couchbase as a caching mechanism\n", + "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", + "\n", + "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. 
Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Query 1: What happened in the match between Fullham and Liverpool?\n", + "Response: In the match between Fulham and Liverpool, Liverpool played the majority of the game with 10 men after Andy Robertson received a red card in the 17th minute. Despite being a player down, Liverpool came from behind twice to secure a 2-2 draw. Diogo Jota scored an 86th-minute equalizer to earn Liverpool a point. The performance was praised for its resilience, with Fulham's Antonee Robinson noting that Liverpool \"didn't feel like they had 10 men at all.\" Liverpool maintained over 60% possession and led in attacking metrics such as shots and chances. Both managers acknowledged the strong efforts of their teams in what was described as an enthralling encounter.\n", + "Time taken: 4.65 seconds\n", + "\n", + "Query 2: What were Luke Littler's key achievements and records in his recent PDC World Championship match?\n", + "Response: Based on the provided context, Luke Littler's key achievements and records in his recent PDC World Championship match (second-round win against Ryan Meikle) were:\n", + "\n", + "* **Tournament Record Set Average:** He hit a tournament record 140.91 set average during the match.\n", + "* **Near Nine-Darter:** He was \"millimetres away from a nine-darter\" when he missed double 12.\n", + "* **Dominant Final Set:** He won the fourth and final set in just 32 darts (the minimum possible is 27), which included hitting four maximum 180s and clinching three straight legs in 11, 10, and 11 darts.\n", + "* **Overall High Average:** He maintained a high overall match average of 100.85.\n", + "Time taken: 0.45 seconds\n", + "\n", + "Query 3: What happened in the match between Fullham and Liverpool?\n", + "Response: In the match between Fulham and Liverpool, Liverpool played the majority of the game with 10 men after Andy Robertson received a red card in the 17th minute. Despite being a player down, Liverpool came from behind twice to secure a 2-2 draw. Diogo Jota scored an 86th-minute equalizer to earn Liverpool a point. The performance was praised for its resilience, with Fulham's Antonee Robinson noting that Liverpool \"didn't feel like they had 10 men at all.\" Liverpool maintained over 60% possession and led in attacking metrics such as shots and chances. 
Both managers acknowledged the strong efforts of their teams in what was described as an enthralling encounter.\n", + "Time taken: 1.15 seconds\n" + ] + } + ], + "source": [ + "try:\n", + " queries = [\n", + " \"What happened in the match between Fullham and Liverpool?\",\n", + " \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\", # Repeated query\n", + " \"What happened in the match between Fullham and Liverpool?\", # Repeated query\n", + " ]\n", + "\n", + " for i, query in enumerate(queries, 1):\n", + " print(f\"\\nQuery {i}: {query}\")\n", + " start_time = time.time()\n", + "\n", + " response = rag_chain.invoke(query)\n", + " elapsed_time = time.time() - start_time\n", + " print(f\"Response: {response}\")\n", + " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", + "\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "By following these steps, you'll have a fully functional semantic search engine that leverages the strengths of Couchbase and Deepseek(via Openrouter). This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/openrouter-deepseek/gsi/frontmatter.md b/openrouter-deepseek/gsi/frontmatter.md new file mode 100644 index 00000000..2cd04153 --- /dev/null +++ b/openrouter-deepseek/gsi/frontmatter.md @@ -0,0 +1,23 @@ +--- +# frontmatter +path: "/tutorial-openrouter-deepseek-with-global-secondary-index" +title: Retrieval-Augmented Generation with Couchbase and OpenRouter Deepseek using GSI index +short_title: RAG with Couchbase and OpenRouter Deepseek using GSI index +description: + - Learn how to build a semantic search engine using Couchbase and OpenRouter with Deepseek using GSI index. + - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with OpenRouter Deepseek as both embeddings and language model provider. + - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase. 
+content_type: tutorial +filter: sdk +technology: + - vector search +tags: + - GSI + - Artificial Intelligence + - LangChain + - Deepseek + - OpenRouter +sdk_language: + - python +length: 60 Mins +--- From cdef3ce43d144adc646d08430fb4a400c66c18c1 Mon Sep 17 00:00:00 2001 From: giriraj-singh-couchbase Date: Thu, 11 Dec 2025 13:40:27 +0530 Subject: [PATCH 13/13] reverted openrouter changes --- openrouter-deepseek/query_based/.env.sample | 13 - ...th_Couchbase_and_Openrouter_Deepseek.ipynb | 1081 ---------------- .../query_based/frontmatter.md | 26 - openrouter-deepseek/search_based/.env.sample | 14 - ...th_Couchbase_and_Openrouter_Deepseek.ipynb | 1137 ----------------- .../search_based/deepseek_index.json | 73 -- .../search_based/frontmatter.md | 23 - 7 files changed, 2367 deletions(-) delete mode 100644 openrouter-deepseek/query_based/.env.sample delete mode 100644 openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb delete mode 100644 openrouter-deepseek/query_based/frontmatter.md delete mode 100644 openrouter-deepseek/search_based/.env.sample delete mode 100644 openrouter-deepseek/search_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb delete mode 100644 openrouter-deepseek/search_based/deepseek_index.json delete mode 100644 openrouter-deepseek/search_based/frontmatter.md diff --git a/openrouter-deepseek/query_based/.env.sample b/openrouter-deepseek/query_based/.env.sample deleted file mode 100644 index 0cab2fc6..00000000 --- a/openrouter-deepseek/query_based/.env.sample +++ /dev/null @@ -1,13 +0,0 @@ -DEEPSEEK_API_KEY="" -OPENAI_API_KEY="" -OPENROUTER_API_KEY="" - -# Couchbase Settings -CB_HOST=couchbase://localhost -CB_USERNAME=Administrator -CB_PASSWORD=password -CB_BUCKET_NAME=query-vector-search-testing - -SCOPE_NAME=shared -COLLECTION_NAME=deepseek -CACHE_COLLECTION=cache diff --git a/openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb b/openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb deleted file mode 100644 index d3c64b16..00000000 --- a/openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb +++ /dev/null @@ -1,1081 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction\n", - "\n", - "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Deepseek V3 as the language model provider (via OpenRouter or direct API)](https://deepseek.ai/) and OpenAI for embeddings. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using Couchbase's **Hyperscale and Composite Vector Index** capabilities from scratch. Hyperscale Vector Indexes are designed for pure vector searches, while Composite Vector Indexes enable filtered searches combining vector similarity with scalar attributes. Learn more about these index types in the [Couchbase Vector Index Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How to Run This Tutorial\n", - "\n", - "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/openrouter-deepseek/query_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb).\n", - "\n", - "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "\n", - "### Get Credentials for OpenRouter and Deepseek\n", - "\n", - "* Sign up for an account at [OpenRouter](https://openrouter.ai/) to get your API key\n", - "* OpenRouter provides access to Deepseek models, so no separate Deepseek credentials are needed\n", - "* Store your OpenRouter API key securely as it will be used to access the models\n", - "* For [Deepseek](https://deepseek.ai/) models, you can use the default models provided by OpenRouter\n", - "\n", - "### Couchbase Requirements\n", - "\n", - "Create and Deploy Your Free Tier Operational cluster on [Capella](https://cloud.couchbase.com/sign-up)\n", - "\n", - "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", - "\n", - "To learn more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", - "\n", - "**Note**: To run this tutorial, you will need Capella with Couchbase Server version 8.0 or above as Hyperscale and Composite vector search is supported only from version 8.0\n", - "\n", - "### Couchbase Capella Configuration\n", - "\n", - "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n", - "\n", - "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the required bucket (Read and Write) used in the application.\n", - "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup and Installation\n", - "\n", - "### Installing Necessary Libraries\n", - "\n", - "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install --quiet datasets==3.5.0 langchain-couchbase==0.5.0 langchain-deepseek==0.1.3 langchain-openai==0.3.13 python-dotenv==1.1.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import Required Modules\n", - "\n", - "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading." 
- ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "import getpass\n", - "import json\n", - "import logging\n", - "import os\n", - "import time\n", - "from datetime import timedelta\n", - "\n", - "from couchbase.auth import PasswordAuthenticator\n", - "from couchbase.cluster import Cluster\n", - "from couchbase.exceptions import (CouchbaseException,\n", - " InternalServerFailureException,\n", - " QueryIndexAlreadyExistsException,ServiceUnavailableException)\n", - "from couchbase.management.buckets import CreateBucketSettings\n", - "from couchbase.management.search import SearchIndex\n", - "from couchbase.options import ClusterOptions\n", - "from datasets import load_dataset\n", - "from dotenv import load_dotenv\n", - "from langchain_core.globals import set_llm_cache\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_core.prompts.chat import ChatPromptTemplate\n", - "from langchain_core.runnables import RunnablePassthrough\n", - "from langchain_couchbase.cache import CouchbaseCache\n", - "from langchain_couchbase.vectorstores import CouchbaseQueryVectorStore\n", - "from langchain_couchbase.vectorstores import DistanceStrategy\n", - "from langchain_couchbase.vectorstores import IndexType\n", - "from langchain_openai import OpenAIEmbeddings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Configure Logging\n", - "\n", - "Logging is configured to track the progress of the script and capture any errors or warnings." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)\n", - "\n", - "# Suppress httpx logging\n", - "logging.getLogger('httpx').setLevel(logging.CRITICAL)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load Environment Configuration\n", - "\n", - "This section handles loading and validating environment variables and configuration settings:\n", - "\n", - "1. **API Keys:**\n", - " - Supports either direct Deepseek API or OpenRouter API access\n", - " - Prompts for API key input if not found in environment\n", - " - Requires OpenAI API key for embeddings\n", - "\n", - "2. **Couchbase Settings:**\n", - " - Connection details (host, username, password)\n", - " - Bucket, scope and collection names\n", - " - Vector search index configuration\n", - " - Cache collection settings\n", - "\n", - "The code validates that all required credentials are present before proceeding. It allows flexible configuration through environment variables or interactive prompts, with sensible defaults for local development." 
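The same pattern (environment variable, then interactive prompt, then default) repeats for every setting in the cell below. A small helper like the hypothetical `get_setting` sketch here captures that pattern in one place; it is not part of the tutorial's code, just an optional refactoring of the idea.

```python
import getpass
import os

def get_setting(name: str, default: str | None = None, secret: bool = False) -> str:
    """Hypothetical helper: env var first, then an interactive prompt, then a default."""
    value = os.getenv(name)
    if not value:
        prompt = f"Enter {name}" + (f" (default: {default}): " if default else ": ")
        value = getpass.getpass(prompt) if secret else input(prompt)
    value = value or default
    if not value:
        raise ValueError(f"{name} is not set")
    return value

# Example usage mirroring the cell below (names are the same settings used there):
# CB_HOST = get_setting("CB_HOST", default="couchbase://localhost")
# CB_PASSWORD = get_setting("CB_PASSWORD", default="password", secret=True)
```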
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# Load environment variables from .env file if it exists\n", - "load_dotenv(override= True)\n", - "\n", - "# API Keys\n", - "# Allow either Deepseek API directly or via OpenRouter\n", - "DEEPSEEK_API_KEY = os.getenv('DEEPSEEK_API_KEY')\n", - "OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')\n", - "\n", - "if not DEEPSEEK_API_KEY and not OPENROUTER_API_KEY:\n", - " api_choice = input('Choose API (1 for Deepseek direct, 2 for OpenRouter): ')\n", - " if api_choice == '1':\n", - " DEEPSEEK_API_KEY = getpass.getpass('Enter your Deepseek API Key: ')\n", - " else:\n", - " OPENROUTER_API_KEY = getpass.getpass('Enter your OpenRouter API Key: ')\n", - "\n", - "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or getpass.getpass('Enter your OpenAI API Key: ')\n", - "\n", - "# Couchbase Settings\n", - "CB_HOST = os.getenv('CB_HOST') or input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", - "CB_USERNAME = os.getenv('CB_USERNAME') or input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", - "CB_PASSWORD = os.getenv('CB_PASSWORD') or getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", - "CB_BUCKET_NAME = os.getenv('CB_BUCKET_NAME') or input('Enter your Couchbase bucket name (default: query-vector-search-testing): ') or 'query-vector-search-testing'\n", - "SCOPE_NAME = os.getenv('SCOPE_NAME') or input('Enter your scope name (default: shared): ') or 'shared'\n", - "COLLECTION_NAME = os.getenv('COLLECTION_NAME') or input('Enter your collection name (default: deepseek): ') or 'deepseek'\n", - "CACHE_COLLECTION = os.getenv('CACHE_COLLECTION') or input('Enter your cache collection name (default: cache): ') or 'cache'\n", - "\n", - "# Check if required credentials are set\n", - "required_creds = {\n", - " 'OPENAI_API_KEY': OPENAI_API_KEY,\n", - " 'CB_HOST': CB_HOST,\n", - " 'CB_USERNAME': CB_USERNAME,\n", - " 'CB_PASSWORD': CB_PASSWORD,\n", - " 'CB_BUCKET_NAME': CB_BUCKET_NAME\n", - "}\n", - "\n", - "# Add the API key that was chosen\n", - "if DEEPSEEK_API_KEY:\n", - " required_creds['DEEPSEEK_API_KEY'] = DEEPSEEK_API_KEY\n", - "elif OPENROUTER_API_KEY:\n", - " required_creds['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n", - "else:\n", - " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")\n", - "\n", - "for cred_name, cred_value in required_creds.items():\n", - " if not cred_value:\n", - " raise ValueError(f\"{cred_name} is not set\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Couchbase Connection Setup\n", - "\n", - "### Connect to Cluster\n", - "\n", - "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount." 
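After the connection in the next cell succeeds, it is worth confirming the cluster is actually serving requests before creating buckets and collections. The sketch below is a minimal check, assuming the `cluster` object from that cell and that the Query service is reachable.

```python
# Minimal connectivity check: run a trivial SQL++ statement through the new cluster object.
row = next(iter(cluster.query("SELECT 'connected' AS status")))
print(row)  # expected output: {'status': 'connected'}
```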
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:40:27,133 - INFO - Successfully connected to Couchbase\n" - ] - } - ], - "source": [ - "try:\n", - " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", - " options = ClusterOptions(auth)\n", - " cluster = Cluster(CB_HOST, options)\n", - " cluster.wait_until_ready(timedelta(seconds=5))\n", - " logging.info(\"Successfully connected to Couchbase\")\n", - "except Exception as e:\n", - " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup Collections\n", - "\n", - "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", - "\n", - "1. **Bucket Creation:**\n", - " - Checks if specified bucket exists, creates it if not\n", - " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", - " - Note: If you are using Capella, create a bucket manually called vector-search-testing (or any name you prefer) with the same properties.\n", - "\n", - "2. **Scope Management:** \n", - " - Verifies if requested scope exists within bucket\n", - " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", - "\n", - "3. **Collection Setup:**\n", - " - Checks for collection existence within scope\n", - " - Creates collection if it doesn't exist\n", - " - Waits 2 seconds for collection to be ready\n", - "\n", - "**Additional Tasks:**\n", - "- Clears any existing documents for clean state\n", - "- Implements comprehensive error handling and logging\n", - "\n", - "The function is called twice to set up:\n", - "1. Main collection for vector embeddings\n", - "2. Cache collection for storing results" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:41:01,398 - INFO - Bucket 'query-vector-search-testing' exists.\n", - "2025-09-17 15:41:01,410 - INFO - Collection 'deepseek' does not exist. Creating it...\n", - "2025-09-17 15:41:01,453 - INFO - Collection 'deepseek' created successfully.\n", - "2025-09-17 15:41:03,712 - INFO - All documents cleared from the collection.\n", - "2025-09-17 15:41:03,713 - INFO - Bucket 'query-vector-search-testing' exists.\n", - "2025-09-17 15:41:03,728 - INFO - Collection 'cache' already exists. Skipping creation.\n", - "2025-09-17 15:41:05,821 - INFO - All documents cleared from the collection.\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", - " try:\n", - " # Check if bucket exists, create if it doesn't\n", - " try:\n", - " bucket = cluster.bucket(bucket_name)\n", - " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", - " except Exception as e:\n", - " logging.info(f\"Bucket '{bucket_name}' does not exist. 
Creating it...\")\n", - " bucket_settings = CreateBucketSettings(\n", - " name=bucket_name,\n", - " bucket_type='couchbase',\n", - " ram_quota_mb=1024,\n", - " flush_enabled=True,\n", - " num_replicas=0\n", - " )\n", - " cluster.buckets().create_bucket(bucket_settings)\n", - " time.sleep(2) # Wait for bucket creation to complete and become available\n", - " bucket = cluster.bucket(bucket_name)\n", - " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", - "\n", - " bucket_manager = bucket.collections()\n", - "\n", - " # Check if scope exists, create if it doesn't\n", - " scopes = bucket_manager.get_all_scopes()\n", - " scope_exists = any(scope.name == scope_name for scope in scopes)\n", - " \n", - " if not scope_exists and scope_name != \"_default\":\n", - " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", - " bucket_manager.create_scope(scope_name)\n", - " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", - "\n", - " # Check if collection exists, create if it doesn't\n", - " collections = bucket_manager.get_all_scopes()\n", - " collection_exists = any(\n", - " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", - " for scope in collections\n", - " )\n", - "\n", - " if not collection_exists:\n", - " logging.info(f\"Collection '{collection_name}' does not exist. Creating it...\")\n", - " bucket_manager.create_collection(scope_name, collection_name)\n", - " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", - " else:\n", - " logging.info(f\"Collection '{collection_name}' already exists. Skipping creation.\")\n", - "\n", - " # Wait for collection to be ready\n", - " collection = bucket.scope(scope_name).collection(collection_name)\n", - " time.sleep(2) # Give the collection time to be ready for queries\n", - "\n", - " # Clear all documents in the collection\n", - " try:\n", - " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", - " cluster.query(query).execute()\n", - " logging.info(\"All documents cleared from the collection.\")\n", - " except Exception as e:\n", - " logging.warning(f\"Error while clearing documents: {str(e)}. The collection might be empty.\")\n", - "\n", - " return collection\n", - " except Exception as e:\n", - " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", - " \n", - "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", - "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## OpenAI Configuration\n", - "\n", - "### Creating the Embeddings Client\n", - "\n", - "This section creates an OpenAI embeddings client using the OpenAI API key. The embeddings client is configured to use the \"text-embedding-3-small\" model, which converts text into numerical vector representations. These vector embeddings are essential for semantic search and similarity matching. The client will be used by the vector store to generate embeddings for documents." 
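Once the embeddings client in the next cell exists, a quick sanity check is to embed a test string and inspect the vector directly. This sketch assumes the `embeddings` object created there; with default settings, `text-embedding-3-small` produces 1536-dimensional vectors, which is the dimensionality the vector store will index.

```python
# Embed a single query string and inspect the resulting vector.
sample_vector = embeddings.embed_query("Couchbase vector search with Deepseek")
print(f"Vector dimension: {len(sample_vector)}")   # 1536 for text-embedding-3-small with default settings
print(f"First few values: {sample_vector[:5]}")
```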
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:41:27,149 - INFO - Successfully created OpenAI embeddings client\n" - ] - } - ], - "source": [ - "try:\n", - " embeddings = OpenAIEmbeddings(\n", - " api_key=OPENAI_API_KEY,\n", - " model=\"text-embedding-3-small\"\n", - " )\n", - " logging.info(\"Successfully created OpenAI embeddings client\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Error creating OpenAI embeddings client: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Document Processing and Vector Store Setup\n", - "\n", - "### Create Couchbase Hyperscale Vector Store\n", - "\n", - "A vector store is where we'll keep our embeddings. Unlike traditional text-based search, the Hyperscale Vector Store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the Hyperscale Vector Store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:41:55,394 - INFO - Successfully created vector store\n" - ] - } - ], - "source": [ - "try:\n", - " vector_store = CouchbaseQueryVectorStore(\n", - " cluster=cluster,\n", - " bucket_name=CB_BUCKET_NAME,\n", - " scope_name=SCOPE_NAME,\n", - " collection_name=COLLECTION_NAME,\n", - " embedding = embeddings,\n", - " distance_metric=DistanceStrategy.COSINE\n", - " )\n", - " logging.info(\"Successfully created vector store\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to create vector store: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load BBC News Dataset\n", - "\n", - "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", - "\n", - "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 15:42:04,530 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded the BBC News dataset with 2687 rows\n" - ] - } - ], - "source": [ - "try:\n", - " news_dataset = load_dataset(\n", - " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", - " )\n", - " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", - " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Data Cleaning\n", - "\n", - "We will use the content of the news articles for our RAG system.\n", - "\n", - "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "We have 1749 unique articles in our database.\n" - ] - } - ], - "source": [ - "news_articles = news_dataset[\"content\"]\n", - "unique_articles = set()\n", - "for article in news_articles:\n", - " if article:\n", - " unique_articles.add(article)\n", - "unique_news_articles = list(unique_articles)\n", - "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Save Data to Vector Store\n", - "\n", - "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", - "\n", - "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", - "\n", - "This approach offers several benefits:\n", - "1. **Memory Efficiency**: Processing in smaller batches prevents memory overload\n", - "2. **Progress Tracking**: Easier to monitor and track the ingestion progress\n", - "3. **Resource Management**: Better control over CPU and network resource utilization\n", - "\n", - "We use a conservative batch size of 50 to ensure reliable operation. The optimal batch size depends on many factors including document sizes, available system resources, network conditions, and concurrent workload." 
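The ingestion cell below lets `add_texts` handle batching automatically. If you want explicit progress reporting per batch (one of the benefits listed above), a manual loop is a straightforward alternative. The sketch below assumes the `vector_store` and `unique_news_articles` objects from earlier cells; the 50-document batch size and 50,000-character cutoff mirror the values used in this tutorial. Run either this loop or the cell that follows, not both, to avoid inserting the articles twice.

```python
# Sketch: explicit batching with simple progress reporting.
batch_size = 50
filtered = [a for a in unique_news_articles if a and len(a) <= 50000]

for start in range(0, len(filtered), batch_size):
    batch = filtered[start:start + batch_size]
    vector_store.add_texts(texts=batch)
    print(f"Ingested {min(start + batch_size, len(filtered))}/{len(filtered)} articles")
```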
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 16:08:51,054 - INFO - Document ingestion completed successfully.\n" - ] - } - ], - "source": [ - "batch_size = 50\n", - "\n", - "# Automatic Batch Processing\n", - "articles = [article for article in unique_news_articles if article and len(article) <= 50000]\n", - "\n", - "try:\n", - " vector_store.add_texts(\n", - " texts=articles,\n", - " batch_size=batch_size\n", - " )\n", - " logging.info(\"Document ingestion completed successfully.\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to save documents to vector store: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deepseek LLM Configuration\n", - "\n", - "### Setting Up the LLM Model\n", - "\n", - "In this section, we set up the Large Language Model (LLM) for our RAG system. We're using the Deepseek model, which can be accessed through two different methods:\n", - "\n", - "1. **Deepseek API Key**: This is obtained directly from Deepseek's platform (https://deepseek.ai) by creating an account and subscribing to their API services. With this key, you can access Deepseek's models directly using the `ChatDeepSeek` class from the `langchain_deepseek` package.\n", - "\n", - "2. **OpenRouter API Key**: OpenRouter (https://openrouter.ai) is a service that provides unified access to multiple LLM providers, including Deepseek. You can obtain an API key by creating an account on OpenRouter's website. This approach uses the `ChatOpenAI` class from `langchain_openai` but with a custom base URL pointing to OpenRouter's API endpoint.\n", - "\n", - "The key difference is that OpenRouter acts as an intermediary service that can route your requests to various LLM providers, while the Deepseek API gives you direct access to only Deepseek's models. OpenRouter can be useful if you want to switch between different LLM providers without changing your code significantly.\n", - "\n", - "In our implementation, we check for both keys and prioritize using the Deepseek API directly if available, falling back to OpenRouter if not. The model is configured with temperature=0 to ensure deterministic, focused responses suitable for RAG applications." 
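Before plugging the model into a chain, a one-line smoke test confirms that the API key and model name are valid. The sketch below assumes the `llm` client created in the cell that follows, so run it after that cell has executed.

```python
# Quick smoke test for the chat model configured in the following cell.
reply = llm.invoke("Reply with the single word: ready")
print(reply.content)
```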
- ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-18 11:18:25,192 - INFO - Successfully created Deepseek LLM client through OpenRouter\n" - ] - } - ], - "source": [ - "from langchain_deepseek import ChatDeepSeek\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "if DEEPSEEK_API_KEY:\n", - " try:\n", - " llm = ChatDeepSeek(\n", - " api_key=DEEPSEEK_API_KEY,\n", - " model_name=\"deepseek-chat\",\n", - " temperature=0\n", - " )\n", - " logging.info(\"Successfully created Deepseek LLM client\")\n", - " except Exception as e:\n", - " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", - "elif OPENROUTER_API_KEY:\n", - " try:\n", - " llm = ChatOpenAI(\n", - " api_key=OPENROUTER_API_KEY,\n", - " base_url=\"https://openrouter.ai/api/v1\",\n", - " model=\"deepseek/deepseek-chat-v3.1\", \n", - " temperature=0,\n", - " )\n", - " logging.info(\"Successfully created Deepseek LLM client through OpenRouter\")\n", - " except Exception as e:\n", - " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", - "else:\n", - " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Semantic Search Demo\n", - "\n", - "### Perform Semantic Search\n", - "\n", - "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined. Common metrics include cosine similarity, Euclidean distance, or dot product, but other metrics can be implemented based on specific use cases. Different embedding models like BERT, Word2Vec, or GloVe can also be used depending on the application's needs, with the vectors generated by these models stored and searched within Couchbase itself.\n", - "\n", - "In the provided code, the search process begins by recording the start time, followed by executing the `similarity_search_with_score` method of the `CouchbaseQueryVectorStore`. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and the distance that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison." 
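Because the store was created with `DistanceStrategy.COSINE`, the score returned by `similarity_search_with_score` is a distance, where smaller means more similar. If you prefer to display a similarity instead, one common convention is to report `1 - distance`. The sketch below assumes the `vector_store` from earlier cells and uses an illustrative query string.

```python
# Sketch: present cosine distances as similarities when displaying results.
sample_query = "What were Luke Littler's key achievements at the PDC World Championship?"
results = vector_store.similarity_search_with_score(sample_query, k=3)
for doc, distance in results:
    similarity = 1 - distance  # higher is more similar under this convention
    print(f"distance={distance:.4f}  similarity~{similarity:.4f}  {doc.page_content[:80]}...")
```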
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 16:11:07,177 - INFO - Semantic search completed in 2.46 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Semantic Search Results (completed in 2.46 seconds):\n", - "--------------------------------------------------------------------------------\n", - "Distance: 0.3693, Text: The Littler effect - how darts hit the bullseye\n", - "\n", - "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", - "\n", - "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", - "\n", - "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", - "\n", - "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. 
\"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", - "\n", - "Littler beat Luke Humphries to win the Premier League title in May\n", - "\n", - "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", - "\n", - "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. 
They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", - "• None Know a lot about Littler? Take our quiz\n", - "--------------------------------------------------------------------------------\n", - "Distance: 0.3900, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", - "\n", - "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", - "\n", - "Littler was hugged by his parents after victory over Meikle\n", - "\n", - "... 
(output truncated for brevity)\n" - ] - } - ], - "source": [ - "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", - "\n", - "try:\n", - " # Perform the semantic search\n", - " start_time = time.time()\n", - " search_results = vector_store.similarity_search_with_score(query, k=10)\n", - " search_elapsed_time = time.time() - start_time\n", - "\n", - " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", - "\n", - " # Display search results\n", - " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", - " print(\"-\" * 80)\n", - "\n", - " for doc, score in search_results:\n", - " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", - " print(\"-\" * 80)\n", - "\n", - "except CouchbaseException as e:\n", - " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", - "except Exception as e:\n", - " raise RuntimeError(f\"Unexpected error: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Optimizing Vector Search with Hyperscale and Composite Vector Indexes\n", - "\n", - "While the above semantic search using similarity_search_with_score works effectively, we can significantly improve query performance by leveraging Hyperscale and Composite Vector Indexes in Couchbase.\n", - "\n", - "Couchbase offers different types of vector indexes. For Hyperscale and Composite vector search we focus on two main types as detailed in the [Couchbase Vector Index Documentation](https://docs.couchbase.com/cloud/vector-index/use-vector-indexes.html):\n", - "\n", - "### Vector Index Types\n", - "\n", - "**1. Hyperscale Vector Indexes (BHIVE)**\n", - "- Optimized for pure vector searches like content discovery, recommendations, and semantic search\n", - "- High performance with low memory footprint, optimized for concurrent operations\n", - "- Designed to scale to billions of vectors\n", - "- Use when you primarily perform vector-only queries without complex scalar filtering\n", - "\n", - "**2. Composite Vector Indexes**\n", - "- Best for filtered vector searches that combine vector search with scalar value filtering\n", - "- Efficient pre-filtering where scalar attributes reduce the vector comparison scope\n", - "- Use when your queries combine vector similarity with scalar filters that eliminate large portions of data\n", - "- Note: Scalar filters take precedence over vector similarity\n", - "\n", - "### Understanding Index Configuration\n", - "\n", - "The `index_description` parameter controls how Couchbase optimizes vector storage and search through centroids and quantization. For detailed configuration options, see the [Quantization & Centroid Settings](https://docs.couchbase.com/cloud/vector-index/hyperscale-vector-index.html#algo_settings).\n", - "\n", - "Let's create a BHIVE Hyperscale Vector Index to optimize our search performance:" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "vector_store.create_index(index_type=IndexType.BHIVE, index_name=\"openrouterdeepseek_bhive_index\",index_description=\"IVF,SQ8\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The example below shows running the same similarity search, but now using the BHIVE GSI index we created above. 
You'll notice improved performance as the index efficiently retrieves data.\n", - "\n", - "**Important**: When using Composite indexes, scalar filters take precedence over vector similarity, which can improve performance for filtered searches but may miss some semantically relevant results that don't match the scalar criteria.\n", - "\n", - "Note: In GSI vector search, the distance represents the vector distance between the query and document embeddings. Lower distance indicate higher similarity, while higher distance indicate lower similarity." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-18 11:17:19,626 - INFO - Semantic search completed in 0.88 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Semantic Search Results (completed in 0.88 seconds):\n", - "--------------------------------------------------------------------------------\n", - "Distance: 0.3694, Text: The Littler effect - how darts hit the bullseye\n", - "\n", - "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", - "\n", - "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", - "\n", - "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. 
From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", - "\n", - "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. \"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", - "\n", - "Littler beat Luke Humphries to win the Premier League title in May\n", - "\n", - "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", - "\n", - "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. 
\"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", - "• None Know a lot about Littler? Take our quiz\n", - "--------------------------------------------------------------------------------\n", - "Distance: 0.3901, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", - "\n", - "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. 
Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", - "\n", - "Littler was hugged by his parents after victory over Meikle\n", - "\n", - "... (output truncated for brevity)\n" - ] - } - ], - "source": [ - "\n", - "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", - "\n", - "try:\n", - " # Perform the semantic search\n", - " start_time = time.time()\n", - " search_results = vector_store.similarity_search_with_score(query, k=10)\n", - " search_elapsed_time = time.time() - start_time\n", - "\n", - " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", - "\n", - " # Display search results\n", - " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", - " print(\"-\" * 80)\n", - "\n", - " for doc, score in search_results:\n", - " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", - " print(\"-\" * 80)\n", - "\n", - "except CouchbaseException as e:\n", - " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", - "except Exception as e:\n", - " raise RuntimeError(f\"Unexpected error: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Alternative: Create Composite Index\n", - "\n", - "Note: To create a COMPOSITE index instead of BHIVE, the below code can be used. Choose based on your specific use case and query patterns. For this tutorial's news search scenario, either index type would work, but BHIVE might be more efficient for pure semantic search across news articles." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vector_store.create_index(index_type=IndexType.COMPOSITE, index_name=\"openrouterdeepseek_composite_index\", index_description=\"IVF,SQ8\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RAG System Setup\n", - "\n", - "### Setup Couchbase Cache\n", - "\n", - "To further optimize our system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", - "\n", - "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-17 16:10:11,473 - INFO - Successfully created cache\n" - ] - } - ], - "source": [ - "try:\n", - " cache = CouchbaseCache(\n", - " cluster=cluster,\n", - " bucket_name=CB_BUCKET_NAME,\n", - " scope_name=SCOPE_NAME,\n", - " collection_name=CACHE_COLLECTION,\n", - " )\n", - " logging.info(\"Successfully created cache\")\n", - " set_llm_cache(cache)\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to create cache: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RAG System Demo\n", - "\n", - "### Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", - "\n", - "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query's embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", - "\n", - "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase's efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." 
- ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-09-18 11:18:34,032 - INFO - Successfully created RAG chain\n" - ] - } - ], - "source": [ - "# Create RAG prompt template\n", - "rag_prompt = ChatPromptTemplate.from_messages([\n", - " (\"system\", \"You are a helpful assistant that answers questions based on the provided context.\"),\n", - " (\"human\", \"Context: {context}\\n\\nQuestion: {question}\")\n", - "])\n", - "\n", - "# Create RAG chain\n", - "rag_chain = (\n", - " {\"context\": vector_store.as_retriever(), \"question\": RunnablePassthrough()}\n", - " | rag_prompt\n", - " | llm\n", - " | StrOutputParser()\n", - ")\n", - "logging.info(\"Successfully created RAG chain\")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "RAG Response: Based on the provided context, Luke Littler's key achievements and records in his recent PDC World Championship match (second-round win against Ryan Meikle) were:\n", - "\n", - "* **Tournament Record Set Average:** He hit a tournament record 140.91 set average during the match.\n", - "* **Near Nine-Darter:** He was \"millimetres away from a nine-darter\" when he missed double 12.\n", - "* **Dominant Final Set:** He won the fourth and final set in just 32 darts (the minimum possible is 27), which included hitting four maximum 180s and clinching three straight legs in 11, 10, and 11 darts.\n", - "* **Overall High Average:** He maintained a high overall match average of 100.85.\n", - "RAG response generated in 0.49 seconds\n" - ] - } - ], - "source": [ - "try:\n", - " start_time = time.time()\n", - " rag_response = rag_chain.invoke(query)\n", - " rag_elapsed_time = time.time() - start_time\n", - "\n", - " print(f\"RAG Response: {rag_response}\")\n", - " print(f\"RAG response generated in {rag_elapsed_time:.2f} seconds\")\n", - "except InternalServerFailureException as e:\n", - " if \"query request rejected\" in str(e):\n", - " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", - " else:\n", - " print(f\"Internal server error occurred: {str(e)}\")\n", - "except Exception as e:\n", - " print(f\"Unexpected error occurred: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using Couchbase as a Caching Mechanism\n", - "\n", - "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", - "\n", - "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. 
Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently." - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Query 1: What happened in the match between Fullham and Liverpool?\n", - "Response: In the match between Fulham and Liverpool, Liverpool played the majority of the game with 10 men after Andy Robertson received a red card in the 17th minute. Despite being a player down, Liverpool came from behind twice to secure a 2-2 draw. Diogo Jota scored an 86th-minute equalizer to earn Liverpool a point. The performance was praised for its resilience, with Fulham's Antonee Robinson noting that Liverpool \"didn't feel like they had 10 men at all.\" Liverpool maintained over 60% possession and led in attacking metrics such as shots and chances. Both managers acknowledged the strong efforts of their teams in what was described as an enthralling encounter.\n", - "Time taken: 4.65 seconds\n", - "\n", - "Query 2: What were Luke Littler's key achievements and records in his recent PDC World Championship match?\n", - "Response: Based on the provided context, Luke Littler's key achievements and records in his recent PDC World Championship match (second-round win against Ryan Meikle) were:\n", - "\n", - "* **Tournament Record Set Average:** He hit a tournament record 140.91 set average during the match.\n", - "* **Near Nine-Darter:** He was \"millimetres away from a nine-darter\" when he missed double 12.\n", - "* **Dominant Final Set:** He won the fourth and final set in just 32 darts (the minimum possible is 27), which included hitting four maximum 180s and clinching three straight legs in 11, 10, and 11 darts.\n", - "* **Overall High Average:** He maintained a high overall match average of 100.85.\n", - "Time taken: 0.45 seconds\n", - "\n", - "Query 3: What happened in the match between Fullham and Liverpool?\n", - "Response: In the match between Fulham and Liverpool, Liverpool played the majority of the game with 10 men after Andy Robertson received a red card in the 17th minute. Despite being a player down, Liverpool came from behind twice to secure a 2-2 draw. Diogo Jota scored an 86th-minute equalizer to earn Liverpool a point. The performance was praised for its resilience, with Fulham's Antonee Robinson noting that Liverpool \"didn't feel like they had 10 men at all.\" Liverpool maintained over 60% possession and led in attacking metrics such as shots and chances. 
Both managers acknowledged the strong efforts of their teams in what was described as an enthralling encounter.\n", - "Time taken: 1.15 seconds\n" - ] - } - ], - "source": [ - "try:\n", - " queries = [\n", - " \"What happened in the match between Fullham and Liverpool?\",\n", - " \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\", # Repeated query\n", - " \"What happened in the match between Fullham and Liverpool?\", # Repeated query\n", - " ]\n", - "\n", - " for i, query in enumerate(queries, 1):\n", - " print(f\"\\nQuery {i}: {query}\")\n", - " start_time = time.time()\n", - "\n", - " response = rag_chain.invoke(query)\n", - " elapsed_time = time.time() - start_time\n", - " print(f\"Response: {response}\")\n", - " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", - "\n", - "except InternalServerFailureException as e:\n", - " if \"query request rejected\" in str(e):\n", - " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", - " else:\n", - " print(f\"Internal server error occurred: {str(e)}\")\n", - "except Exception as e:\n", - " print(f\"Unexpected error occurred: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "By following these steps, you'll have a fully functional semantic search engine that leverages the strengths of Couchbase and Deepseek(via Openrouter). This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/openrouter-deepseek/query_based/frontmatter.md b/openrouter-deepseek/query_based/frontmatter.md deleted file mode 100644 index d9bc1b6e..00000000 --- a/openrouter-deepseek/query_based/frontmatter.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -# frontmatter -path: "/tutorial-openrouter-deepseek-with-hyperscale-or-composite-vector-index" -title: Retrieval-Augmented Generation (RAG) with OpenRouter Deepseek and Couchbase Hyperscale and Composite Vector Index -short_title: RAG with OpenRouter Deepseek and Couchbase Hyperscale and Composite Vector Index -description: - - Learn how to build a semantic search engine using OpenRouter Deepseek and Couchbase Hyperscale and Composite Vector Index. - - This tutorial demonstrates how to integrate Couchbase's Hyperscale and Composite Vector Index capabilities with OpenRouter Deepseek as both embeddings and language model provider. - - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain, OpenRouter Deepseek, and Couchbase Hyperscale and Composite Vector Index. 
- - Alternatively if you want to perform semantic search using the Search Vector Index, please take a look at [this.](https://developer.couchbase.com/tutorial-openrouter-deepseek-with-search-vector-index/) -content_type: tutorial -filter: sdk -technology: - - vector search -tags: - - Hyperscale Vector Index - - Composite Vector Index - - Artificial Intelligence - - LangChain - - Deepseek - - OpenRouter -sdk_language: - - python -length: 60 Mins -alt_paths: ["/tutorial-openrouter-deepseek-with-hyperscale-vector-index", "/tutorial-openrouter-deepseek-with-composite-vector-index"] ---- diff --git a/openrouter-deepseek/search_based/.env.sample b/openrouter-deepseek/search_based/.env.sample deleted file mode 100644 index 861ba211..00000000 --- a/openrouter-deepseek/search_based/.env.sample +++ /dev/null @@ -1,14 +0,0 @@ -DEEPSEEK_API_KEY="" -OPENAI_API_KEY="" -OPENROUTER_API_KEY="" - -# Couchbase Settings -CB_HOST=couchbase://localhost -CB_USERNAME=Administrator -CB_PASSWORD=password -CB_BUCKET_NAME=vector-search-testing - -INDEX_NAME=vector_search_deepseek -SCOPE_NAME=shared -COLLECTION_NAME=deepseek -CACHE_COLLECTION=cache diff --git a/openrouter-deepseek/search_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb b/openrouter-deepseek/search_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb deleted file mode 100644 index 65c9f5d2..00000000 --- a/openrouter-deepseek/search_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb +++ /dev/null @@ -1,1137 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How to Run This Tutorial\n", - "\n", - "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/openrouter-deepseek/search_based/RAG_with_Couchbase_and_Openrouter_Deepseek.ipynb).\n", - "\n", - "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "\n", - "### Get Credentials for OpenRouter and Deepseek\n", - "\n", - "* Sign up for an account at [OpenRouter](https://openrouter.ai/) to get your API key\n", - "* OpenRouter provides access to Deepseek models, so no separate Deepseek credentials are needed\n", - "* Store your OpenRouter API key securely as it will be used to access the models\n", - "* For [Deepseek](https://deepseek.ai/) models, you can use the default models provided by OpenRouter\n", - "\n", - "### Couchbase Requirements\n", - "\n", - "Create and Deploy Your Free Tier Operational cluster on [Capella](https://cloud.couchbase.com/sign-up)\n", - "\n", - "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. 
This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", - "\n", - "To learn more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", - "\n", - "### Couchbase Capella Configuration\n", - "\n", - "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n", - "\n", - "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the required bucket (Read and Write) used in the application.\n", - "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup and Installation\n", - "\n", - "### Installing Necessary Libraries\n", - "\n", - "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install --quiet datasets==3.5.0 langchain-couchbase==0.3.0 langchain-deepseek==0.1.3 langchain-openai==0.3.13 python-dotenv==1.1.0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import Required Modules\n", - "\n", - "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import getpass\n", - "import json\n", - "import logging\n", - "import os\n", - "import time\n", - "from datetime import timedelta\n", - "\n", - "from couchbase.auth import PasswordAuthenticator\n", - "from couchbase.cluster import Cluster\n", - "from couchbase.exceptions import (CouchbaseException,\n", - " InternalServerFailureException,\n", - " QueryIndexAlreadyExistsException,ServiceUnavailableException)\n", - "from couchbase.management.buckets import CreateBucketSettings\n", - "from couchbase.management.search import SearchIndex\n", - "from couchbase.options import ClusterOptions\n", - "from datasets import load_dataset\n", - "from dotenv import load_dotenv\n", - "from langchain_core.globals import set_llm_cache\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_core.prompts.chat import ChatPromptTemplate\n", - "from langchain_core.runnables import RunnablePassthrough\n", - "from langchain_couchbase.cache import CouchbaseCache\n", - "from langchain_couchbase.vectorstores import CouchbaseSearchVectorStore\n", - "from langchain_openai import OpenAIEmbeddings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Configure Logging\n", - "\n", - "Logging is configured to track the progress of the script and capture any errors or warnings." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)\n", - "\n", - "# Suppress httpx logging\n", - "logging.getLogger('httpx').setLevel(logging.CRITICAL)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load Environment Configuration\n", - "\n", - "This section handles loading and validating environment variables and configuration settings:\n", - "\n", - "1. **API Keys:**\n", - " - Supports either direct Deepseek API or OpenRouter API access\n", - " - Prompts for API key input if not found in environment\n", - " - Requires OpenAI API key for embeddings\n", - "\n", - "2. **Couchbase Settings:**\n", - " - Connection details (host, username, password)\n", - " - Bucket, scope and collection names\n", - " - Vector search index configuration\n", - " - Cache collection settings\n", - "\n", - "The code validates that all required credentials are present before proceeding. It allows flexible configuration through environment variables or interactive prompts, with sensible defaults for local development." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Load environment variables from .env file if it exists\n", - "load_dotenv()\n", - "\n", - "# API Keys\n", - "# Allow either Deepseek API directly or via OpenRouter\n", - "DEEPSEEK_API_KEY = os.getenv('DEEPSEEK_API_KEY')\n", - "OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')\n", - "\n", - "if not DEEPSEEK_API_KEY and not OPENROUTER_API_KEY:\n", - " api_choice = input('Choose API (1 for Deepseek direct, 2 for OpenRouter): ')\n", - " if api_choice == '1':\n", - " DEEPSEEK_API_KEY = getpass.getpass('Enter your Deepseek API Key: ')\n", - " else:\n", - " OPENROUTER_API_KEY = getpass.getpass('Enter your OpenRouter API Key: ')\n", - "\n", - "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or getpass.getpass('Enter your OpenAI API Key: ')\n", - "\n", - "# Couchbase Settings\n", - "CB_HOST = os.getenv('CB_HOST') or input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", - "CB_USERNAME = os.getenv('CB_USERNAME') or input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", - "CB_PASSWORD = os.getenv('CB_PASSWORD') or getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", - "CB_BUCKET_NAME = os.getenv('CB_BUCKET_NAME') or input('Enter your Couchbase bucket name (default: vector-search-testing): ') or 'vector-search-testing'\n", - "INDEX_NAME = os.getenv('INDEX_NAME') or input('Enter your index name (default: vector_search_deepseek): ') or 'vector_search_deepseek'\n", - "SCOPE_NAME = os.getenv('SCOPE_NAME') or input('Enter your scope name (default: shared): ') or 'shared'\n", - "COLLECTION_NAME = os.getenv('COLLECTION_NAME') or input('Enter your collection name (default: deepseek): ') or 'deepseek'\n", - "CACHE_COLLECTION = os.getenv('CACHE_COLLECTION') or input('Enter your cache collection name (default: cache): ') or 'cache'\n", - "\n", - "# Check if required credentials are set\n", - "required_creds = {\n", - " 'OPENAI_API_KEY': OPENAI_API_KEY,\n", - " 'CB_HOST': CB_HOST,\n", - " 'CB_USERNAME': CB_USERNAME,\n", - " 'CB_PASSWORD': CB_PASSWORD,\n", - " 'CB_BUCKET_NAME': CB_BUCKET_NAME\n", - "}\n", - "\n", - "# Add the API key that was chosen\n", - "if DEEPSEEK_API_KEY:\n", - " 
required_creds['DEEPSEEK_API_KEY'] = DEEPSEEK_API_KEY\n", - "elif OPENROUTER_API_KEY:\n", - " required_creds['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n", - "else:\n", - " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")\n", - "\n", - "for cred_name, cred_value in required_creds.items():\n", - " if not cred_value:\n", - " raise ValueError(f\"{cred_name} is not set\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Couchbase Connection Setup\n", - "\n", - "### Connect to Cluster\n", - "\n", - "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:39:18,465 - INFO - Successfully connected to Couchbase\n" - ] - } - ], - "source": [ - "try:\n", - " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", - " options = ClusterOptions(auth)\n", - " cluster = Cluster(CB_HOST, options)\n", - " cluster.wait_until_ready(timedelta(seconds=5))\n", - " logging.info(\"Successfully connected to Couchbase\")\n", - "except Exception as e:\n", - " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup Collections\n", - "\n", - "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", - "\n", - "1. **Bucket Creation:**\n", - " - Checks if specified bucket exists, creates it if not\n", - " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", - " - Note: If you are using Capella, create a bucket manually called vector-search-testing (or any name you prefer) with the same properties.\n", - "\n", - "2. **Scope Management:** \n", - " - Verifies if requested scope exists within bucket\n", - " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", - "\n", - "3. **Collection Setup:**\n", - " - Checks for collection existence within scope\n", - " - Creates collection if it doesn't exist\n", - " - Waits 2 seconds for collection to be ready\n", - "\n", - "**Additional Tasks:**\n", - "- Creates primary index on collection for query performance\n", - "- Clears any existing documents for clean state\n", - "- Implements comprehensive error handling and logging\n", - "\n", - "The function is called twice to set up:\n", - "1. Main collection for vector embeddings\n", - "2. Cache collection for storing results" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:39:19,580 - INFO - Bucket 'vector-search-testing' exists.\n", - "2025-05-25 14:39:21,409 - INFO - Collection 'deepseek' already exists. 
Skipping creation.\n", - "2025-05-25 14:39:24,342 - INFO - Primary index present or created successfully.\n", - "2025-05-25 14:39:24,604 - INFO - All documents cleared from the collection.\n", - "2025-05-25 14:39:24,606 - INFO - Bucket 'vector-search-testing' exists.\n", - "2025-05-25 14:39:26,535 - INFO - Collection 'cache' already exists. Skipping creation.\n", - "2025-05-25 14:39:29,589 - INFO - Primary index present or created successfully.\n", - "2025-05-25 14:39:29,813 - INFO - All documents cleared from the collection.\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", - " try:\n", - " # Check if bucket exists, create if it doesn't\n", - " try:\n", - " bucket = cluster.bucket(bucket_name)\n", - " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", - " except Exception as e:\n", - " logging.info(f\"Bucket '{bucket_name}' does not exist. Creating it...\")\n", - " bucket_settings = CreateBucketSettings(\n", - " name=bucket_name,\n", - " bucket_type='couchbase',\n", - " ram_quota_mb=1024,\n", - " flush_enabled=True,\n", - " num_replicas=0\n", - " )\n", - " cluster.buckets().create_bucket(bucket_settings)\n", - " time.sleep(2) # Wait for bucket creation to complete and become available\n", - " bucket = cluster.bucket(bucket_name)\n", - " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", - "\n", - " bucket_manager = bucket.collections()\n", - "\n", - " # Check if scope exists, create if it doesn't\n", - " scopes = bucket_manager.get_all_scopes()\n", - " scope_exists = any(scope.name == scope_name for scope in scopes)\n", - " \n", - " if not scope_exists and scope_name != \"_default\":\n", - " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", - " bucket_manager.create_scope(scope_name)\n", - " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", - "\n", - " # Check if collection exists, create if it doesn't\n", - " collections = bucket_manager.get_all_scopes()\n", - " collection_exists = any(\n", - " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", - " for scope in collections\n", - " )\n", - "\n", - " if not collection_exists:\n", - " logging.info(f\"Collection '{collection_name}' does not exist. Creating it...\")\n", - " bucket_manager.create_collection(scope_name, collection_name)\n", - " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", - " else:\n", - " logging.info(f\"Collection '{collection_name}' already exists. 
Skipping creation.\")\n", - "\n", - " # Wait for collection to be ready\n", - " collection = bucket.scope(scope_name).collection(collection_name)\n", - " time.sleep(2) # Give the collection time to be ready for queries\n", - "\n", - " # Ensure primary index exists\n", - " try:\n", - " cluster.query(f\"CREATE PRIMARY INDEX IF NOT EXISTS ON `{bucket_name}`.`{scope_name}`.`{collection_name}`\").execute()\n", - " logging.info(\"Primary index present or created successfully.\")\n", - " except Exception as e:\n", - " logging.warning(f\"Error creating primary index: {str(e)}\")\n", - "\n", - " # Clear all documents in the collection\n", - " try:\n", - " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", - " cluster.query(query).execute()\n", - " logging.info(\"All documents cleared from the collection.\")\n", - " except Exception as e:\n", - " logging.warning(f\"Error while clearing documents: {str(e)}. The collection might be empty.\")\n", - "\n", - " return collection\n", - " except Exception as e:\n", - " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", - " \n", - "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", - "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Understanding Search Vector Index\n", - "\n", - "### Loading Couchbase Search Vector Index\n", - "\n", - "Semantic search requires an efficient way to retrieve relevant documents based on a user's query. This is where the Couchbase **Search Vector Index** comes into play. In this step, we load the Search Vector Index definition from a JSON file, which specifies how the index should be structured. This includes the fields to be indexed, the dimensions of the vectors, and other parameters that determine how the search engine processes queries based on vector similarity.\n", - "\n", - "This OpenRouter Deepseek Search Vector Index configuration requires specific default settings to function properly. This tutorial uses the bucket named `vector-search-testing` with the scope `shared` and collection `deepseek`. The configuration is set up for vectors with exactly `1536 dimensions`, using dot product similarity and optimized for recall. If you want to use a different bucket, scope, or collection, you will need to modify the index configuration accordingly.\n", - "\n", - "For more information on creating a Search Vector Index, please follow the [instructions](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html)." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " with open('deepseek_index.json', 'r') as file:\n", - " index_definition = json.load(file)\n", - "except Exception as e:\n", - " raise ValueError(f\"Error loading index definition: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Creating or Updating Search Vector Index\n", - "\n", - "With the index definition loaded, the next step is to create or update the **Search Vector Index** in Couchbase. This step is crucial because it optimizes our database for vector similarity search operations, allowing us to perform searches based on the semantic content of documents rather than just keywords. 
By creating or updating a Search Vector Index, we enable our search engine to handle complex queries that involve finding semantically similar documents using vector embeddings, which is essential for a robust semantic search engine." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:39:31,015 - INFO - Index 'vector_search_deepseek' found\n", - "2025-05-25 14:39:31,770 - INFO - Index 'vector_search_deepseek' already exists. Skipping creation/update.\n" - ] - } - ], - "source": [ - "try:\n", - " scope_index_manager = cluster.bucket(CB_BUCKET_NAME).scope(SCOPE_NAME).search_indexes()\n", - "\n", - " # Check if index already exists\n", - " existing_indexes = scope_index_manager.get_all_indexes()\n", - " index_name = index_definition[\"name\"]\n", - "\n", - " if index_name in [index.name for index in existing_indexes]:\n", - " logging.info(f\"Index '{index_name}' found\")\n", - " else:\n", - " logging.info(f\"Creating new index '{index_name}'...\")\n", - "\n", - " # Create SearchIndex object from JSON definition\n", - " search_index = SearchIndex.from_json(index_definition)\n", - "\n", - " # Upsert the index (create if not exists, update if exists)\n", - " scope_index_manager.upsert_index(search_index)\n", - " logging.info(f\"Index '{index_name}' successfully created/updated.\")\n", - "\n", - "except QueryIndexAlreadyExistsException:\n", - " logging.info(f\"Index '{index_name}' already exists. Skipping creation/update.\")\n", - "except ServiceUnavailableException:\n", - " raise RuntimeError(\"Search service is not available. Please ensure the Search service is enabled in your Couchbase cluster.\")\n", - "except InternalServerFailureException as e:\n", - " logging.error(f\"Internal server error: {str(e)}\")\n", - " raise" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## OpenAI Configuration\n", - "\n", - "### Creating the Embeddings Client\n", - "\n", - "This section creates an OpenAI embeddings client using the OpenAI API key. The embeddings client is configured to use the \"text-embedding-3-small\" model, which converts text into numerical vector representations. These vector embeddings are essential for semantic search and similarity matching. The client will be used by the vector store to generate embeddings for documents." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:39:32,003 - INFO - Successfully created OpenAI embeddings client\n" - ] - } - ], - "source": [ - "try:\n", - " embeddings = OpenAIEmbeddings(\n", - " api_key=OPENAI_API_KEY,\n", - " model=\"text-embedding-3-small\"\n", - " )\n", - " logging.info(\"Successfully created OpenAI embeddings client\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Error creating OpenAI embeddings client: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Document Processing and Vector Store Setup\n", - "\n", - "### Create Couchbase Search Vector Store\n", - "\n", - "A vector store is where we'll keep our embeddings. Unlike traditional text-based search, the Search Vector Store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the search engine converts the query into an embedding and compares it against the embeddings stored in the vector store. 
This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the Search Vector Store in Couchbase, we create a powerful tool that enables our search engine to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:39:35,246 - INFO - Successfully created vector store\n" - ] - } - ], - "source": [ - "try:\n", - " vector_store = CouchbaseSearchVectorStore(\n", - " cluster=cluster,\n", - " bucket_name=CB_BUCKET_NAME,\n", - " scope_name=SCOPE_NAME,\n", - " collection_name=COLLECTION_NAME,\n", - " embedding=embeddings,\n", - " index_name=INDEX_NAME,\n", - " )\n", - " logging.info(\"Successfully created vector store\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to create vector store: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load BBC News Dataset\n", - "\n", - "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", - "\n", - "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:39:41,364 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded the BBC News dataset with 2687 rows\n" - ] - } - ], - "source": [ - "try:\n", - " news_dataset = load_dataset(\n", - " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", - " )\n", - " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", - " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Data Cleaning\n", - "\n", - "We will use the content of the news articles for our RAG system.\n", - "\n", - "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "We have 1749 unique articles in our database.\n" - ] - } - ], - "source": [ - "news_articles = news_dataset[\"content\"]\n", - "unique_articles = set()\n", - "for article in news_articles:\n", - " if article:\n", - " unique_articles.add(article)\n", - "unique_news_articles = list(unique_articles)\n", - "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Save Data to Vector Store\n", - "\n", - "To efficiently handle the large number of articles, we process them in batches of articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", - "\n", - "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", - "\n", - "This approach offers several benefits:\n", - "1. **Memory Efficiency**: Processing in smaller batches prevents memory overload\n", - "2. **Progress Tracking**: Easier to monitor and track the ingestion progress\n", - "3. **Resource Management**: Better control over CPU and network resource utilization\n", - "\n", - "We use a conservative batch size of 50 to ensure reliable operation. The optimal batch size depends on many factors including document sizes, available system resources, network conditions, and concurrent workload." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:41:37,848 - INFO - Document ingestion completed successfully.\n" - ] - } - ], - "source": [ - "batch_size = 50\n", - "\n", - "# Automatic Batch Processing\n", - "articles = [article for article in unique_news_articles if article and len(article) <= 50000]\n", - "\n", - "try:\n", - " vector_store.add_texts(\n", - " texts=articles,\n", - " batch_size=batch_size\n", - " )\n", - " logging.info(\"Document ingestion completed successfully.\")\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to save documents to vector store: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup Couchbase Cache\n", - "\n", - "To further optimize our system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", - "\n", - "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:41:40,203 - INFO - Successfully created cache\n" - ] - } - ], - "source": [ - "try:\n", - " cache = CouchbaseCache(\n", - " cluster=cluster,\n", - " bucket_name=CB_BUCKET_NAME,\n", - " scope_name=SCOPE_NAME,\n", - " collection_name=CACHE_COLLECTION,\n", - " )\n", - " logging.info(\"Successfully created cache\")\n", - " set_llm_cache(cache)\n", - "except Exception as e:\n", - " raise ValueError(f\"Failed to create cache: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deepseek LLM Configuration\n", - "\n", - "### Setting Up the LLM Model\n", - "\n", - "In this section, we set up the Large Language Model (LLM) for our RAG system. We're using the Deepseek model, which can be accessed through two different methods:\n", - "\n", - "1. **Deepseek API Key**: This is obtained directly from Deepseek's platform (https://deepseek.ai) by creating an account and subscribing to their API services. With this key, you can access Deepseek's models directly using the `ChatDeepSeek` class from the `langchain_deepseek` package.\n", - "\n", - "2. **OpenRouter API Key**: OpenRouter (https://openrouter.ai) is a service that provides unified access to multiple LLM providers, including Deepseek. You can obtain an API key by creating an account on OpenRouter's website. This approach uses the `ChatOpenAI` class from `langchain_openai` but with a custom base URL pointing to OpenRouter's API endpoint.\n", - "\n", - "The key difference is that OpenRouter acts as an intermediary service that can route your requests to various LLM providers, while the Deepseek API gives you direct access to only Deepseek's models. OpenRouter can be useful if you want to switch between different LLM providers without changing your code significantly.\n", - "\n", - "In our implementation, we check for both keys and prioritize using the Deepseek API directly if available, falling back to OpenRouter if not. The model is configured with temperature=0 to ensure deterministic, focused responses suitable for RAG applications." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:41:40,237 - INFO - Successfully created Deepseek LLM client through OpenRouter\n" - ] - } - ], - "source": [ - "from langchain_deepseek import ChatDeepSeek\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "if DEEPSEEK_API_KEY:\n", - " try:\n", - " llm = ChatDeepSeek(\n", - " api_key=DEEPSEEK_API_KEY,\n", - " model_name=\"deepseek-chat\",\n", - " temperature=0\n", - " )\n", - " logging.info(\"Successfully created Deepseek LLM client\")\n", - " except Exception as e:\n", - " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", - "elif OPENROUTER_API_KEY:\n", - " try:\n", - " llm = ChatOpenAI(\n", - " api_key=OPENROUTER_API_KEY,\n", - " base_url=\"https://openrouter.ai/api/v1\",\n", - " model=\"deepseek/deepseek-chat-v3.1\", \n", - " temperature=0,\n", - " )\n", - " logging.info(\"Successfully created Deepseek LLM client through OpenRouter\")\n", - " except Exception as e:\n", - " raise ValueError(f\"Error creating Deepseek LLM client: {str(e)}\")\n", - "else:\n", - " raise ValueError(\"Either Deepseek API Key or OpenRouter API Key must be provided\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Semantic Search Demo\n", - "\n", - "### Perform Semantic Search\n", - "\n", - "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined.\n", - "\n", - "In the provided code, the search process begins by recording the start time, followed by executing the similarity_search_with_score method of the CouchbaseSearchVectorStore. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and a similarity score that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:41:41,802 - INFO - Semantic search completed in 1.56 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Semantic Search Results (completed in 1.56 seconds):\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.6303, Text: The Littler effect - how darts hit the bullseye\n", - "\n", - "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", - "\n", - "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", - "\n", - "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", - "\n", - "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. 
\"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", - "\n", - "Littler beat Luke Humphries to win the Premier League title in May\n", - "\n", - "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", - "\n", - "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. 
They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", - "• None Know a lot about Littler? Take our quiz\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.6099, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", - "\n", - "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", - "\n", - "Littler was hugged by his parents after victory over Meikle\n", - "\n", - "Littler returned to Alexandra Palace to a boisterous reception from more than 3,000 spectators and delivered an astonishing display in the fourth set. He was on for a nine-darter after his opening two throws in both of the first two legs and completed the set in 32 darts - the minimum possible is 27. The teenager will next play after Christmas against European Championship winner Ritchie Edhouse, the 29th seed, or Ian White, and is seeded to meet Humphries in the semi-finals. Having entered last year's event ranked 164th, Littler is up to fourth in the world and will go to number two if he reaches the final again this time. He has won 10 titles in his debut professional year, including the Premier League and Grand Slam of Darts. 
After reaching the World Championship final as a debutant aged just 16, Littler's life has been transformed and interest in darts has rocketed. Google say he was the most searched-for athlete online in the UK during 2024. This Christmas, more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies and has prompted plans to expand the World Championship. Littler was named BBC Young Sports Personality of the Year on Tuesday and was runner-up to athlete Keely Hodgkinson for the main award.\n", - "\n", - "Nick Kenny will play world champion Luke Humphries in round three after Christmas\n", - "\n", - "Barneveld was shocked 3-1 by world number 76 Kenny, who was in tears after a famous victory. Kenny, 32, will face Humphries in round three after defeating the Dutchman, who won the BDO world title four times and the PDC crown in 2007. Van Barneveld, ranked 32nd, became the sixth seed to exit in the second round. His compatriot Noppert, the 13th seed, was stunned 3-1 by Joyce, who will face Ryan Searle or Matt Campbell next, with the winner of that tie potentially meeting Littler in the last 16. Elsewhere, 15th seed Chris Dobey booked his place in the third round with a 3-1 win over Alexander Merkx. Englishman Dobey concluded an afternoon session which started with a trio of 3-0 scorelines. Northern Ireland's Brendan Dolan beat Lok Yin Lee to set up a meeting with three-time champion Michael van Gerwen after Christmas. In the final two first-round matches of the 2025 competition, Wales' Rhys Griffin beat Karel Sedlacek of the Czech Republic before Asia number one Alexis Toylo cruised past Richard Veenstra.\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.5980, Text: Luke Littler is one of six contenders for the 2024 BBC Sports Personality of the Year award.\n", - "\n", - "Here BBC Sport takes a look at the darts player's year in five photos.\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.5590, Text: Littler is Young Sports Personality of the Year\n", - "\n", - "This video can not be played To play this video you need to enable JavaScript in your browser.\n", - "\n", - "Darts player Luke Littler has been named BBC Young Sports Personality of the Year 2024. The 17-year-old has enjoyed a breakthrough year after finishing runner-up at the 2024 PDC World Darts Championship in January. The Englishman, who has won 10 senior titles on the Professional Darts Corporation tour this year, is the first darts player to claim the award. \"It shows how well I have done this year, not only for myself, but I have changed the sport of darts,\" Littler told BBC One. \"I know the amount of academies that have been brought up in different locations, tickets selling out at Ally Pally in hours and the Premier League selling out - it just shows how much I have changed it.\"\n", - "\n", - "He was presented with the trophy by Harry Aikines-Aryeetey - a former sprinter who won the award in 2005 - and ex-rugby union player Jodie Ounsley, both of whom are stars of the BBC television show Gladiators. Skateboarder Sky Brown, 16, and Para-swimmer William Ellard, 18, were also shortlisted for the award. Littler became a household name at the start of 2024 by reaching the World Championship final aged just 16 years and 347 days. 
That achievement was just the start of a trophy-laden year, with Littler winning the Premier League Darts, Grand Slam and World Series of Darts Finals among his haul of titles. Littler has gone from 164th to fourth in the world rankings and earned more than £1m in prize money in 2024. The judging panel for Young Sports Personality of the Year included Paralympic gold medallist Sammi Kinghorn, Olympic silver medal-winning BMX freestyler Keiran Reilly, television presenter Qasa Alom and Radio 1 DJ Jeremiah Asiamah, as well as representatives from the Youth Sport Trust, Blue Peter and BBC Sport.\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.5414, Text: Wright is the 17th seed at the World Championship\n", - "\n", - "Two-time champion Peter Wright won his opening game at the PDC World Championship, while Ryan Meikle edged out Fallon Sherrock to set up a match against teenage prodigy Luke Littler. Scotland's Wright, the 2020 and 2022 winner, has been out of form this year, but overcame Wesley Plaisier 3-1 in the second round at Alexandra Palace in London. \"It was this crowd that got me through, they wanted me to win. I thank you all,\" said Wright. Meikle came from a set down to claim a 3-2 victory in his first-round match against Sherrock, who was the first woman to win matches at the tournament five years ago. The 28-year-old will now play on Saturday against Littler, who was named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson on Tuesday night. Littler, 17, will be competing on the Ally Pally stage for the first time since his rise to stardom when finishing runner-up in January's world final to Luke Humphries. Earlier on Tuesday, World Grand Prix champion Mike de Decker – the 24th seed - suffered a surprise defeat to Luke Woodhouse in the second round. He is the second seed to exit following 16th seed James Wade's defeat on Monday to Jermaine Wattimena, who meets Wright in round three. Kevin Doets recovered from a set down to win 3-1 against Noa-Lynn van Leuven, who was making history as the first transgender woman to compete in the tournament.\n", - "\n", - "Sherrock drew level at 2-2 but lost the final set to Meikle\n", - "\n", - "The 54-year-old Wright only averaged 89.63 to his opponent's 93.77, but did enough to progress. Sporting a purple mohawk and festive outfit, crowd favourite 'Snakebite' showed glimpses of his best to win the first set and survived eight set darts to go 2-0 ahead. He lost the next but Dutchman Plaisier missed two more set darts in the fourth and Wright seized his opportunity. \"Wesley had his chances but he missed them and I took them,\" he said. \"He's got his tour card and he's going to be a dangerous player next year for all the players playing against him.\" Sherrock, 30, fought back from 2-1 down to force a decider against her English compatriot Meikle. She then narrowly missed the bull to take out 170 in the fourth leg before left-hander Meikle held his nerve to hit double 18 for a 96 finish to seal a hard-fought success. \"I felt under pressure from the start and to come through feels unbelievable,\" said Meikle. \"It's an unbelievable prize to play Luke here on this stage. It's the biggest stage of them all. I'm so happy.\" World number 81 Jeffrey de Graaf, who was born in the Netherlands but now represents Sweden, looked in trouble against Rashad Sweeting before prevailing 3-1. 
Sweeting, who was making history as the first player from the Bahamas to compete in the tournament, took the first set, but De Graaf fought back to clinch a second-round meeting with two-time champion Gary Anderson Germany's Ricardo Pietreczko, ranked 34, beat China's Xiaochen Zong 3-1 and will face Gian van Veen next.\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.5402, Text: Second seed Smith knocked out of Worlds by Doets\n", - "\n", - "Michael Smith was 2-1 ahead but fell to a shock exit\n", - "\n", - "Former champion Michael Smith has been sensationally knocked out of the PDC World Championship by Kevin Doets. Englishman Smith, seeded second, went down 3-2 after a pulsating second-round duel at Alexandra Palace in London. Dutchman Doets prevailed 6-4 in the deciding set, despite checkouts of 123, 84, 94 and 76 from 2023 champion Smith. \"This was the most stressful game of my life and I've won it, yes,\" said world number 51 Doets. \"I felt if I can keep my focus, I won't lose this. It was so very tight, to get over the line was amazing.\" Doets, 26, took the first set and fought back after going 2-1 down to avenge his narrow defeat to Smith at the same stage last year. Having lost in the second round of the tournament for the first time since 2020, the 34-year-old Smith will now drop to at least 15 in the rankings.\n", - "\n", - "Doets won in the first round against Noa-Lynn van Leuven, who was the first transgender woman to play in the tournament\n", - "\n", - "England's Scott Williams, who made a shock run to the semi-finals in the 2024 tournament before losing to eventual champion Luke Humphries, overcame Niko Springer 3-1 in a thriller. German debutant Springer, second on this year's development tour, won all three legs in the opening set before the match exploded into life. Williams hit back, showing his old swagger as he went ahead after a sensational third set which featured seven 180s. The 34-year-old edged the deciding leg in the fourth and will meet 2018 champion Rob Cross in round two on Monday. Nick Kenny delighted the Ally Pally crowd with a fabulous 170 finish to seal a 3-0 victory in round one over American Stowe Buntz. The Welshman, 31, will face five-time world champion Raymond Barneveld on Saturday evening on a bill which also features teenage star Luke Littler against Ryan Meikle. Canadian Matt Campbell set up a second-round match against Ryan Searle with a 3-2 defeat of Austrian Mensur Suljovic.\n", - "\n", - "England's Callan Rydz averaged 107.06 to book his place in the second round, before Gabriel Clemens was knocked out by Wales' Robert Owen on Thursday afternoon. Rydz beat Croatian Romeo Grbavac 3-0, recording the tournament's highest average first-round match average in its current 96-player format. It was the competition's 26th highest match average overall and comfortably the best so far at the 2025 event. The previous record was held by teenager Luke Littler, who scored 106.12 at this stage last year. Rydz, from Bedlington in Northumberland, meets Germany's Martin Schindler in the second round on Sunday evening. The afternoon session concluded with Germany's 27th seed Clemens being beaten by Owen, who is ranked 50 places below him. Owen recorded a 3-1 victory, his second in a matter of days, to reach the third round, which begins on 27 December. Hong Kong's Lok Yin Lee came from a set down to beat Chris Landman 3-1 after winning nine straight legs. 
Lee will face Northern Ireland's Brendan Dolan in round two on Saturday afternoon. Meanwhile, 2024 Grand Slam of Darts runner-up Martin Lukeman came from a set down to beat Indian qualifier Nitin Kumar 3-1. Lukeman meets 21st seed Andrew Gilding on Monday afternoon for a place in the last 32.\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.5328, Text: Cross loses as record number of seeds out of Worlds\n", - "\n", - "Rob Cross has suffered three second-round exits in his eight World Championships\n", - "\n", - "Former champion Rob Cross became the latest high-profile casualty as a record-breaking 14th seed exited the PDC World Darts Championship in the second round. The number five seed was beaten 3-1 by close friend Scott Williams, who will face Germany's Ricardo Pietreczko in round three. Cross, who won the event on his debut in 2018, took the opening set but failed to reach anywhere near his best as he suffered his third second-round exit. He was joined by number six seed David Chisnall, who was beaten 3-2 in a sudden-death leg by Ricky Evans, who came into the tournament 46th in the PDC's Order of Merit. The 2021 semi-finalist won the opening set, but then found himself 2-1 down to an inspired Evans, who was cheered on relentlessly by the Alexandra Palace crowd. He forced the game into a deciding set and faced match dart but Evans missed bullseye by the width of the wire. Chisnall then missed his own match dart on double tops, before he made a miscalculation when attempting to checkout 139 at 5-4 down. No real harm was done with a sudden-death leg forced but he was unable to hold off Evans, who reaches the third round for the third time in the last five years. \"It's not even what it is, again I've played a world-class darts player. I've played quite well and won,\" Evans told Sky Sports. \"Look at this [the crowd], wow. I don't understand it, why are they cheering me on? \"I don't get this reception in my household. Thank you very much. You've made a very fat guy very happy.\" Evans will face unseeded Welshman Robert Owen when the third round starts after the three-day Christmas break.\n", - "\n", - "World youth champion Gian van Veen had become the 12th seed to be knocked out when he lost 3-1 to Pietreczko. The 28th seed lost the opening set, having missed nine darts at double, but levelled. However, the Dutchman was unable to match Pietreczko, who closed out a comfortable win with a checkout percentage of 55.6%. Pietreczko said: \"I am over the moon to win. It is very important for me to be in the third round after Christmas. I love the big stage.\" The 26th seed trailed 1-0 and 2-1, and both players went on to miss match darts, before Gurney won the final set 3-1 on legs.\n", - "\n", - "Jonny Clayton is into the third round of the PDC World Darts Championship for a sixth consecutive year\n", - "\n", - "In the afternoon session, Welsh number seven seed Jonny Clayton also needed sudden death to pull off a sensational final-set comeback against Mickey Mansell in. He was a leg away from defeat twice to his Northern Irish opponent, but came from behind to win the final set 6-5 in a sudden-death leg to win 3-2. Clayton, who will play Gurney in round three, lost the opening set of the match, but fought back to lead 2-1, before being pegged back again by 51-year-old Mansell, who then missed match darts on double tops in the deciding set. \"I was very emotional. 
I've got to be honest, that meant a lot,\" said Clayton, who is in the favourable half of the draw following shock second-round exits for former world champions Michael Smith and Gary Anderson. \"I had chances before and Mickey definitely had chances before. It wasn't great to play in, not the best - I wouldn't wish that on my worst enemy. \"There is a lot of weight off my shoulders after that. I know there is another gear or two in the bank, but I'll be honest that meant a lot to me, it is a tester and will try and make me believe again.\" Clayton was 2-0 down in the fifth set after consecutive 136 and 154 checkouts from Mansell, but won three legs on the trot in 15, 12 and 10 darts to wrestle a 3-2 lead. He missed three darts for the match, before his unseeded opponent held and broke Clayton's throw to lead 4-3. Mansell missed a match dart at double 20, before Clayton won on double five after two missed checkouts. Elsewhere, Northern Ireland's Josh Rock booked his place in the third round against England's Chris Dobey with a 3-0 win over Wales' Rhys Griffin. Martin Lukeman, runner-up to Luke Littler at the Grand Slam of Darts last month, is out after a 3-1 loss to number 21 seed Andrew Gilding. The final day before the Christmas break started with Poland's number 31 seed Krzysztof Ratajski recording a 3-1 win over Alexis Toylo of the Philippines.\n", - "\n", - "All times are GMT and subject to change. Two fourth-round matches will also be played\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.5116, Text: Michael van Gerwen has made just one major ranking event final in 2024\n", - "\n", - "Michael van Gerwen enjoyed a comfortable 3-0 victory over English debutant James Hurrell in his opening match of the PDC World Darts Championship. The three-time world champion has had a tough year by his standards, having fallen behind Luke Littler and Luke Humphries, so a relatively stress-free opening match at Alexandra Palace was just what was needed. Hurrell, 40, offered some resistance early on when taking the opening leg of the match, but he would win just two more as Van Gerwen proved far too strong. The third-seeded Dutchman averaged 94.85, took out two three-figure checkouts and hit 50% of his doubles - with six of his nine misses coming in one scrappy leg. Van Gerwen, 35, will now face either Brendan Dolan or Lok Yin Lee in the third round.\n", - "\n", - "\"I think I played OK,\" Van Gerwen told Sky Sports after his match. \"Of course, I was a bit nervous. Like everyone knows it's been a tough year for me. \"Overall, it was a good performance. I was confident. I won the game, that's the main thing.\" Also on Friday night, Germany's Florian Hempel showed why he loves playing on the Alexandra Palace stage with a thrilling 3-1 victory in a high-quality contest against Jeffrey de Zwaan. Both men hit seven 180s in a match played at a fast and furious pace, but 34-year-old Hempel's superior doubles gave him a fourth straight first-round victory in the competition. Hempel moves on to a tie with 26th seed Daryl Gurney but it was a damaging loss for De Zwaan, 28, who came through a late qualifier in November and needed a good run here to keep his PDC tour card for next season. 
Mickey Mansell earned a second-round date with world number seven Jonny Clayton after a scrappy 3-1 win over Japan's Tomoya Goto, while Dylan Slevin came through an all-Irish tie against William O'Connor to progress to a meeting with Dimitri van den Bergh.\n", - "\n", - "Stephen Bunting is in the third round of the PDC World Darts Championship for a third consecutive year\n", - "\n", - "In the afternoon session, Stephen Bunting came from behind to beat Kai Gotthardt 3-1 and book his place in the third round. Englishman Bunting, ranked eighth in the world, dropped the first set and almost went 2-0 down in the match before staging an impressive recovery. Tournament debutant Gotthardt missed three darts at double eight to win the second set, allowing Bunting to take out double 10 to level the match before powering away to victory by winning the third and fourth sets without losing a leg. Victory for \"The Bullet\" sets up a last 32 meeting with the winner of Dirk van Duijvenbode's meeting with Madars Razma after Christmas. Should Bunting progress further, he is seeded to face world number one and defending world champion Luke Humphries in the quarter-finals on New Year's Day. Elsewhere in Friday afternoon's session, the Dutch duo of Alexander Merkx and Wessel Nijman advanced to the second round with wins over Stephen Burton and Cameron Carolissen respectively. England's Ian White was handed a walkover victory against Sandro Eric Sosing of the Philippines. Sosing withdrew from the competition on medical grounds and was taken to hospital following chest pains.\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.5113, Text: Gary Anderson was the fifth seed to be beaten on Sunday\n", - "\n", - "Two-time champion Gary Anderson has been dumped out of the PDC World Championship on his 54th birthday by Jeffrey de Graaf. The Scot, winner in 2015 and 2016, lost 3-0 to the Swede in a second-round shock at Alexandra Palace in London. \"Gary didn't really show up as he usually does. I'm very happy with the win,\" said De Graaf, 34, who had a 75% checkout success and began with an 11-dart finish. \"It's a dream come true for me. He's been my idol since I was 14 years old.\" Anderson, ranked 14th, became the 11th seed to be knocked out from the 24 who have played so far, and the fifth to fall on Sunday.\n", - "\n", - "He came into the competition with the year's highest overall three-dart average of 99.66 but hit just three of his 20 checkout attempts to lose his opening match of the tournament for the first time. De Graaf will now meet Filipino qualifier Paolo Nebrida after he stunned England's Ross Smith, the 19th seed, in straight sets. Ritchie Edhouse, Dirk van Duijvenbode and Martin Schindler were the other seeds beaten on day eight. England's Callan Rydz, who hit a record first-round average of 107.06 on Thursday, followed up with a 3-0 win over 23rd seed Schindler on Sunday. The German missed double 12 for a nine-darter in the first set – the third player to do so in 24 hours after Luke Littler and Damon Heta – and ended up losing the leg. Rydz next meets Belgian Dimitri van den Bergh, who hit six 180s and averaged 96 in a 3-0 win over Irishman Dylan Slevin.\n", - "\n", - "England's Joe Cullen abruptly left his post-match news conference and accused the media of not showing him respect after his 3-0 win over Dutchman Wessel Nijman. 
Nijman, who has previously served a ban for breaching betting and anti-corruption rules, had been billed as favourite beforehand to beat 23rd seed Cullen. \"Honestly, the media attention that Wessel's got, again this is not a reflection on him,\" Cullen said. \"He seems like a fantastic kid, he's been caught up in a few things beforehand, but he's served his time and he's held his hands up, like a lot haven't. \"I think the way I've been treated probably with the media and things like that - I know you guys have no control over the bookies - I've been shown no respect, so I won't be showing any respect to any of you guys tonight. \"I'm going to go home. Cheers.\" Ian 'Diamond' White beat European champion and 29th seed Edhouse 3-1 and will face teenage star Littler in the next round. White, born in the same Cheshire town as the 17-year-old, acknowledged he would need to up his game in round three. Asked if he knew who was waiting for him, White joked: \"Yeah, Runcorn's number two. I'm from Runcorn and I'm number one.\" Ryan Searle started Sunday afternoon's action off with a 10-dart leg and went on to beat Matt Campbell 3-0, while Latvian Madars Razma defeated 25th seed Van Duijvenbode 3-1. Seventh seed Jonny Clayton and 2018 champion Rob Cross are among the players in action on Monday as the second round concludes. The third round will start on Friday after a three-day break for Christmas.\n", - "--------------------------------------------------------------------------------\n", - "Score: 0.5105, Text: Christian Kist was sealing his first televised nine-darter\n", - "\n", - "Christian Kist hit a nine-darter but lost his PDC World Championship first-round match to Madars Razma. The Dutchman became the first player to seal a perfect leg in the tournament since Michael Smith did so on the way to beating Michael van Gerwen in the 2023 final. Kist, the 2012 BDO world champion at Lakeside, collects £60,000 for the feat, with the same amount being awarded by sponsors to a charity and to one spectator inside Alexandra Palace in London. The 38-year-old's brilliant finish sealed the opening set, but his Latvian opponent bounced back to win 3-1. Darts is one of the few sports that can measure perfection; snooker has the 147 maximum break, golf has the hole-in-one, darts has the nine-dart finish. Kist scored two maximum 180s to leave a 141 checkout which he completed with a double 12, to the delight of more than 3,000 spectators. The English 12th seed, who has been troubled by wrist and back injuries, could next play Andrew Gilding in the third round - which begins on 27 December - should Gilding beat the winner of Martin Lukeman's match against qualifier Nitin Kumar. Aspinall faces a tough task to reach the last four again, with 2018 champion Rob Cross and 2024 runner-up Luke Littler both in his side of the draw.\n", - "\n", - "Kist - who was knocked out of last year's tournament by teenager Littler - will still earn a bigger cheque than he would have got for a routine run to the quarter-finals. His nine-darter was the 15th in the history of the championship and first since the greatest leg in darts history when Smith struck, moments after Van Gerwen just missed his attempt. Darts fan Kris, a railway worker from Sutton in south London, was the random spectator picked out to receive £60,000, with Prostate Cancer UK getting the same sum from tournament sponsors Paddy Power. \"I'm speechless to be honest. I didn't expect it to happen to me,\" Kris said. 
\"This was a birthday present so it makes it even better. My grandad got me tickets. It was just a normal day - I came here after work.\" Kist said: \"Hitting the double 12 felt amazing. It was a lovely moment for everyone and I hope Kris enjoys the money. Maybe I will go on vacation next month.\" Earlier, Jim Williams was favourite against Paolo Nebrida but lost 3-2 in an epic lasting more than an hour. The Filipino took a surprise 2-1 lead and Williams only went ahead for the first time in the opening leg of the deciding set. The Welshman looked on course for victory but missed five match darts. UK Open semi-finalist Ricky Evans set up a second-round match against Dave Chisnall, checking out on 109 to edge past Gordon Mathers 3-2.\n", - "--------------------------------------------------------------------------------\n" - ] - } - ], - "source": [ - "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", - "\n", - "try:\n", - " # Perform the semantic search\n", - " start_time = time.time()\n", - " search_results = vector_store.similarity_search_with_score(query, k=10)\n", - " search_elapsed_time = time.time() - start_time\n", - "\n", - " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", - "\n", - " # Display search results\n", - " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", - " print(\"-\" * 80)\n", - "\n", - " for doc, score in search_results:\n", - " print(f\"Score: {score:.4f}, Text: {doc.page_content}\")\n", - " print(\"-\" * 80)\n", - "\n", - "except CouchbaseException as e:\n", - " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", - "except Exception as e:\n", - " raise RuntimeError(f\"Unexpected error: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RAG System Demo\n", - "\n", - "### Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", - "\n", - "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query's embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", - "\n", - "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase's efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-25 14:41:41,810 - INFO - Successfully created RAG chain\n" - ] - } - ], - "source": [ - "# Create RAG prompt template\n", - "rag_prompt = ChatPromptTemplate.from_messages([\n", - " (\"system\", \"You are a helpful assistant that answers questions based on the provided context.\"),\n", - " (\"human\", \"Context: {context}\\n\\nQuestion: {question}\")\n", - "])\n", - "\n", - "# Create RAG chain\n", - "rag_chain = (\n", - " {\"context\": vector_store.as_retriever(), \"question\": RunnablePassthrough()}\n", - " | rag_prompt\n", - " | llm\n", - " | StrOutputParser()\n", - ")\n", - "logging.info(\"Successfully created RAG chain\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "RAG Response: In his recent 2025 PDC World Championship second-round match against Ryan Meikle, **Luke Littler** achieved several notable milestones and records:\n", - "\n", - "1. **Tournament Record Set Average**: \n", - " - Littler hit a **140.91 set average** in the fourth set, the highest ever recorded in the tournament for a single set. This included three consecutive legs finished in 11, 10, and 11 darts.\n", - "\n", - "2. **Near Nine-Darter**: \n", - " - He narrowly missed a nine-dart finish (the pinnacle of darts perfection) by millimeters when he failed to land double 12 in the fourth set.\n", - "\n", - "3. **Overall Performance**: \n", - " - Despite a slow start and admitted nerves, he secured a **3-1 victory** with a dominant fourth set, hitting **four maximum 180s** and maintaining an overall match average of **100.85**.\n", - "\n", - "4. **Emotional Impact**: \n", - " - The 17-year-old became emotional post-match, cutting short his on-stage interview due to the intensity of the moment, later calling it the \"toughest game\" he’d ever played.\n", - "\n", - "These achievements highlight his resilience and skill, further cementing his status as a rising star in darts.\n", - "RAG response generated in 21.84 seconds\n" - ] - } - ], - "source": [ - "try:\n", - " start_time = time.time()\n", - " rag_response = rag_chain.invoke(query)\n", - " rag_elapsed_time = time.time() - start_time\n", - "\n", - " print(f\"RAG Response: {rag_response}\")\n", - " print(f\"RAG response generated in {rag_elapsed_time:.2f} seconds\")\n", - "except InternalServerFailureException as e:\n", - " if \"query request rejected\" in str(e):\n", - " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", - " else:\n", - " print(f\"Internal server error occurred: {str(e)}\")\n", - "except Exception as e:\n", - " print(f\"Unexpected error occurred: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using Couchbase as a Caching Mechanism\n", - "\n", - "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. 
When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", - "\n", - "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Query 1: What happened in the match between Fullham and Liverpool?\n", - "Response: In the match between Fulham and Liverpool, the game ended in a 2-2 draw. Key highlights include:\n", - "\n", - "1. **Red Card Incident**: Liverpool played most of the match with 10 men after Andy Robertson received a red card in the 17th minute for denying a goalscoring opportunity. He had earlier been injured in a tackle by Fulham's Issa Diop.\n", - "\n", - "2. **Comeback Resilience**: Despite the numerical disadvantage, Liverpool twice came from behind. Diogo Jota scored an 86th-minute equalizer to secure a point. Fulham's Antonee Robinson praised Liverpool, noting it \"didn’t feel like they had 10 men\" due to their aggressive, high-pressing approach.\n", - "\n", - "3. **Performance Metrics**: Liverpool dominated possession (over 60%) and led in key attacking stats (shots, big chances, touches in the opposition box), showcasing their determination even with a player deficit.\n", - "\n", - "4. **Manager & Player Reactions**: \n", - " - Manager Arne Slot commended his team’s \"outstanding\" character and resilience, particularly highlighting Robertson’s effort despite the red card.\n", - " - Captain Virgil van Dijk emphasized the team’s ability to \"stay calm\" and fight back under pressure.\n", - "\n", - "5. **League Impact**: The draw extended Liverpool’s lead at the top of the Premier League to five points, as rivals Arsenal also dropped points. Pundits, including Chris Sutton, lauded Liverpool’s \"phenomenal\" response to adversity. \n", - "\n", - "Fulham’s strong performance, described as \"brave,\" was also acknowledged, making the match a thrilling encounter between both sides.\n", - "Time taken: 14.14 seconds\n", - "\n", - "Query 2: What were Luke Littler's key achievements and records in his recent PDC World Championship match?\n", - "Response: In his recent 2025 PDC World Championship second-round match against Ryan Meikle, **Luke Littler** achieved several notable milestones and records:\n", - "\n", - "1. **Tournament Record Set Average**: \n", - " - Littler hit a **140.91 set average** in the fourth set, the highest ever recorded in the tournament for a single set. This included three consecutive legs finished in 11, 10, and 11 darts.\n", - "\n", - "2. **Near Nine-Darter**: \n", - " - He narrowly missed a nine-dart finish (the pinnacle of darts perfection) by millimeters when he failed to land double 12 in the fourth set.\n", - "\n", - "3. 
**Overall Performance**: \n", - " - Despite a slow start and admitted nerves, he secured a **3-1 victory** with a dominant fourth set, hitting **four maximum 180s** and maintaining an overall match average of **100.85**.\n", - "\n", - "4. **Emotional Impact**: \n", - " - The 17-year-old became emotional post-match, cutting short his on-stage interview due to the intensity of the moment, later calling it the \"toughest game\" he’d ever played.\n", - "\n", - "These achievements highlight his resilience and skill, further cementing his status as a rising star in darts.\n", - "Time taken: 1.82 seconds\n", - "\n", - "Query 3: What happened in the match between Fullham and Liverpool?\n", - "Response: In the match between Fulham and Liverpool, the game ended in a 2-2 draw. Key highlights include:\n", - "\n", - "1. **Red Card Incident**: Liverpool played most of the match with 10 men after Andy Robertson received a red card in the 17th minute for denying a goalscoring opportunity. He had earlier been injured in a tackle by Fulham's Issa Diop.\n", - "\n", - "2. **Comeback Resilience**: Despite the numerical disadvantage, Liverpool twice came from behind. Diogo Jota scored an 86th-minute equalizer to secure a point. Fulham's Antonee Robinson praised Liverpool, noting it \"didn’t feel like they had 10 men\" due to their aggressive, high-pressing approach.\n", - "\n", - "3. **Performance Metrics**: Liverpool dominated possession (over 60%) and led in key attacking stats (shots, big chances, touches in the opposition box), showcasing their determination even with a player deficit.\n", - "\n", - "4. **Manager & Player Reactions**: \n", - " - Manager Arne Slot commended his team’s \"outstanding\" character and resilience, particularly highlighting Robertson’s effort despite the red card.\n", - " - Captain Virgil van Dijk emphasized the team’s ability to \"stay calm\" and fight back under pressure.\n", - "\n", - "5. **League Impact**: The draw extended Liverpool’s lead at the top of the Premier League to five points, as rivals Arsenal also dropped points. Pundits, including Chris Sutton, lauded Liverpool’s \"phenomenal\" response to adversity. \n", - "\n", - "Fulham’s strong performance, described as \"brave,\" was also acknowledged, making the match a thrilling encounter between both sides.\n", - "Time taken: 1.52 seconds\n" - ] - } - ], - "source": [ - "try:\n", - " queries = [\n", - " \"What happened in the match between Fullham and Liverpool?\",\n", - " \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\", # Repeated query\n", - " \"What happened in the match between Fullham and Liverpool?\", # Repeated query\n", - " ]\n", - "\n", - " for i, query in enumerate(queries, 1):\n", - " print(f\"\\nQuery {i}: {query}\")\n", - " start_time = time.time()\n", - "\n", - " response = rag_chain.invoke(query)\n", - " elapsed_time = time.time() - start_time\n", - " print(f\"Response: {response}\")\n", - " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", - "\n", - "except InternalServerFailureException as e:\n", - " if \"query request rejected\" in str(e):\n", - " print(\"Error: Search request was rejected due to rate limiting. 
Please try again later.\")\n", - " else:\n", - " print(f\"Internal server error occurred: {str(e)}\")\n", - "except Exception as e:\n", - " print(f\"Unexpected error occurred: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "\n", - "By following these steps, you'll have a fully functional semantic search engine that leverages the strengths of Couchbase's Search Vector Index and Deepseek (via OpenRouter). This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and how to implement it effectively. Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/openrouter-deepseek/search_based/deepseek_index.json b/openrouter-deepseek/search_based/deepseek_index.json deleted file mode 100644 index a16840c8..00000000 --- a/openrouter-deepseek/search_based/deepseek_index.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "type": "fulltext-index", - "name": "vector_search_deepseek", - "uuid": "", - "sourceType": "gocbcore", - "sourceName": "vector-search-testing", - "planParams": { - "maxPartitionsPerPIndex": 64, - "indexPartitions": 16 - }, - "params": { - "doc_config": { - "docid_prefix_delim": "", - "docid_regexp": "", - "mode": "scope.collection.type_field", - "type_field": "type" - }, - "mapping": { - "analysis": {}, - "default_analyzer": "standard", - "default_datetime_parser": "dateTimeOptional", - "default_field": "_all", - "default_mapping": { - "dynamic": true, - "enabled": false - }, - "default_type": "_default", - "docvalues_dynamic": false, - "index_dynamic": true, - "store_dynamic": false, - "type_field": "_type", - "types": { - "shared.deepseek": { - "dynamic": true, - "enabled": true, - "properties": { - "embedding": { - "dynamic": false, - "enabled": true, - "fields": [ - { - "dims": 1536, - "index": true, - "name": "embedding", - "similarity": "dot_product", - "type": "vector", - "vector_index_optimized_for": "recall" - } - ] - }, - "text": { - "dynamic": false, - "enabled": true, - "fields": [ - { - "index": true, - "name": "text", - "store": true, - "type": "text" - } - ] - } - } - } - } - }, - "store": { - "indexType": "scorch", - "segmentVersion": 16 - } - }, - "sourceParams": {} -} diff --git a/openrouter-deepseek/search_based/frontmatter.md b/openrouter-deepseek/search_based/frontmatter.md deleted file mode 100644 index 1ce3d507..00000000 --- a/openrouter-deepseek/search_based/frontmatter.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -# frontmatter -path: "/tutorial-openrouter-deepseek-with-search-vector-index" -title: Retrieval-Augmented Generation (RAG) with OpenRouter Deepseek and Couchbase Search Vector Index -short_title: RAG with OpenRouter Deepseek and Couchbase Search Vector Index -description: - - Learn how to build a semantic search engine using OpenRouter Deepseek and Couchbase Search Vector 
Index. - - This tutorial demonstrates how to integrate Couchbase's Search Vector Index capabilities with OpenRouter Deepseek as both embeddings and language model provider. - - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain, OpenRouter Deepseek, and Couchbase Search Vector Index. -content_type: tutorial -filter: sdk -technology: - - vector search -tags: - - Search Vector Index - - Artificial Intelligence - - LangChain - - Deepseek - - OpenRouter -sdk_language: - - python -length: 60 Mins ----
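The caching walkthrough in the notebook above relies on an LLM cache having been registered in an earlier cell that is not shown in this hunk: the first Fulham and Littler queries take roughly 14–22 seconds, while the repeated ones return in under 2 seconds because the generated answer is read back from Couchbase. A minimal sketch of that setup with the `CouchbaseCache` class from `langchain-couchbase` is shown below; the connection string, credentials, and the `cache` collection name are placeholders, while the bucket and scope names are reused from the index definition deleted in this patch.

```python
# Minimal sketch: register Couchbase as the LLM response cache (placeholder names).
from datetime import timedelta

from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import ClusterOptions
from langchain_core.globals import set_llm_cache
from langchain_couchbase.cache import CouchbaseCache

auth = PasswordAuthenticator("Administrator", "password")          # placeholder credentials
cluster = Cluster("couchbase://localhost", ClusterOptions(auth))   # placeholder connection string
cluster.wait_until_ready(timedelta(seconds=5))

# Cached responses are keyed by the prompt, so repeating a query skips the LLM call.
set_llm_cache(
    CouchbaseCache(
        cluster=cluster,
        bucket_name="vector-search-testing",  # reused from the deleted index definition
        scope_name="shared",                  # reused from the deleted index definition
        collection_name="cache",              # assumed collection for cached responses
    )
)
```

With the cache registered, `rag_chain.invoke(query)` transparently checks Couchbase before calling the LLM, which is what produces the sub-two-second repeat timings shown in the notebook output above.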