diff --git a/index.toml b/index.toml index ccd03cc..50db52f 100644 --- a/index.toml +++ b/index.toml @@ -16,13 +16,13 @@ dependencies = ["datasets>=2.6.1", "sentence-transformers>=4.1.0"] featured = true [[tutorial]] -title = "Generating Structured Output with Loop-Based Auto-Correction" -description = "Learn how to extract structured data using an LLM, and to validate the generated output against a predefined schema." -level = "intermediate" +title = "Generating Structured Output with OpenAIChatGenerator" +description = "Learn how to generate structured output using OpenAIChatGenerator, and to validate the generated output against a predefined schema." +level = "beginner" weight = 71 -notebook = "28_Structured_Output_With_Loop.ipynb" +notebook = "28_Structured_Output_With_OpenAI.ipynb" aliases = [] -completion_time = "15 min" +completion_time = "10 min" created_at = 2023-11-30 dependencies = ["colorama"] diff --git a/tutorials/28_Structured_Output_With_Loop.ipynb b/tutorials/28_Structured_Output_With_Loop.ipynb deleted file mode 100644 index 5041dbc..0000000 --- a/tutorials/28_Structured_Output_With_Loop.ipynb +++ /dev/null @@ -1,511 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "AVBtOVlNJ51C" - }, - "source": [ - "# Tutorial: Generating Structured Output with Loop-Based Auto-Correction\n", - "\n", - "- **Level**: Intermediate\n", - "- **Time to complete**: 15 minutes\n", - "- **Prerequisites**: You must have an API key from an active OpenAI account as this tutorial is using the gpt-4o-mini model by OpenAI.\n", - "- **Components Used**: `PromptBuilder`, `OpenAIChatGenerator`, `OutputValidator` (Custom component)\n", - "- **Goal**: After completing this tutorial, you will have built a system that extracts unstructured data, puts it in a JSON schema, and automatically corrects errors in the JSON output from a large language model (LLM) to make sure it follows the specified structure.\n", - "\n", - "## Overview\n", - "This tutorial demonstrates how to use Haystack's advanced [looping pipelines](https://docs.haystack.deepset.ai/docs/pipelines#loops) with LLMs for more dynamic and flexible data processing. You'll learn how to extract structured data from unstructured data using an LLM, and to validate the generated output against a predefined schema.\n", - "\n", - "This tutorial uses `gpt-4o-mini` to change unstructured passages into JSON outputs that follow the [Pydantic](https://github.com/pydantic/pydantic) schema. It uses a custom OutputValidator component to validate the JSON and loop back to make corrections, if necessary." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jmiAHh1oGsKI" - }, - "source": [ - "## Preparing the Colab Environment\n", - "\n", - "Enable the debug mode of logging:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "Vor9IHuNRvEh" - }, - "outputs": [], - "source": [ - "import logging\n", - "\n", - "logging.basicConfig()\n", - "logging.getLogger(\"canals.pipeline.pipeline\").setLevel(logging.DEBUG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ljbWiyJkKiPw" - }, - "source": [ - "## Installing Dependencies\n", - "Install Haystack and [colorama](https://pypi.org/project/colorama/) with pip:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kcc1AlLQd_jI", - "outputId": "efc4bbab-a9fe-46ee-d8af-9d86edacaf04" - }, - "outputs": [], - "source": [ - "%%bash\n", - "\n", - "pip install haystack-ai\n", - "pip install colorama" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Cmjfa8CiCeFl" - }, - "source": [ - "## Defining a Schema to Parse the JSON Object\n", - "\n", - "Define a simple JSON schema for the data you want to extract from a text passsage using the LLM. As the first step, define two [Pydantic models](https://docs.pydantic.dev/1.10/usage/models/), `City` and `CitiesData`, with suitable fields and types." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "xwKrDOOGdaAz" - }, - "outputs": [], - "source": [ - "from typing import List\n", - "from pydantic import BaseModel\n", - "\n", - "\n", - "class City(BaseModel):\n", - " name: str\n", - " country: str\n", - " population: int\n", - "\n", - "\n", - "class CitiesData(BaseModel):\n", - " cities: List[City]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zv-6-l_PCeFl" - }, - "source": [ - "> You can change these models according to the format you wish to extract from the text." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ouk1mAOUCeFl" - }, - "source": [ - "Then, generate a JSON schema from Pydantic models using `schema_json()`. You will later on use this schema in the prompt to instruct the LLM.\n", - "\n", - "To learn more about the JSON schemas, visit [Pydantic Schema](https://docs.pydantic.dev/1.10/usage/schema/). " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "8Lg9_72jCeFl" - }, - "outputs": [], - "source": [ - "json_schema = CitiesData.schema_json(indent=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KvNhg0bP7kfg" - }, - "source": [ - "## Creating a Custom Component: OutputValidator\n", - "\n", - "`OutputValidator` is a custom component that validates if the JSON object the LLM generates complies with the provided [Pydantic model](https://docs.pydantic.dev/1.10/usage/models/). If it doesn't, OutputValidator returns an error message along with the incorrect JSON object to get it fixed in the next loop.\n", - "\n", - "For more details about custom components, see [Creating Custom Components](https://docs.haystack.deepset.ai/docs/custom-components)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yr6D8RN2d7Vy" - }, - "outputs": [], - "source": [ - "import json\n", - "import random\n", - "import pydantic\n", - "from pydantic import ValidationError\n", - "from typing import Optional, List\n", - "from colorama import Fore\n", - "from haystack import component\n", - "from haystack.dataclasses import ChatMessage\n", - "\n", - "\n", - "# Define the component input parameters\n", - "@component\n", - "class OutputValidator:\n", - " def __init__(self, pydantic_model: pydantic.BaseModel):\n", - " self.pydantic_model = pydantic_model\n", - " self.iteration_counter = 0\n", - "\n", - " # Define the component output\n", - " @component.output_types(valid_replies=List[str], invalid_replies=Optional[List[str]], error_message=Optional[str])\n", - " def run(self, replies: List[ChatMessage]):\n", - "\n", - " self.iteration_counter += 1\n", - "\n", - " ## Try to parse the LLM's reply ##\n", - " # If the LLM's reply is a valid object, return `\"valid_replies\"`\n", - " try:\n", - " output_dict = json.loads(replies[0].text)\n", - " self.pydantic_model.parse_obj(output_dict)\n", - " print(\n", - " Fore.GREEN\n", - " + f\"OutputValidator at Iteration {self.iteration_counter}: Valid JSON from LLM - No need for looping: {replies[0]}\"\n", - " )\n", - " return {\"valid_replies\": replies}\n", - "\n", - " # If the LLM's reply is corrupted or not valid, return \"invalid_replies\" and the \"error_message\" for LLM to try again\n", - " except (ValueError, ValidationError) as e:\n", - " print(\n", - " Fore.RED\n", - " + f\"OutputValidator at Iteration {self.iteration_counter}: Invalid JSON from LLM - Let's try again.\\n\"\n", - " f\"Output from LLM:\\n {replies[0]} \\n\"\n", - " f\"Error from OutputValidator: {e}\"\n", - " )\n", - " return {\"invalid_replies\": replies, \"error_message\": str(e)}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vQ_TfSBkCeFm" - }, - "source": [ - "Then, create an OutputValidator instance with `CitiesData` that you have created before." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "bhPCLCBCCeFm" - }, - "outputs": [], - "source": [ - "output_validator = OutputValidator(pydantic_model=CitiesData)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xcIWKjW4k42r" - }, - "source": [ - "## Creating the Prompt\n", - "\n", - "Write instructions for the LLM for converting a passage into a JSON format. Ensure the instructions explain how to identify and correct errors if the JSON doesn't match the required schema. Once you create the prompt, initialize PromptBuilder to use it. \n", - "\n", - "For information about Jinja2 template and ChatPromptBuilder, see [ChatPromptBuilder](https://docs.haystack.deepset.ai/docs/chatpromptbuilder)." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "ohPpNALjdVKt" - }, - "outputs": [], - "source": [ - "from haystack.components.builders import ChatPromptBuilder\n", - "\n", - "\n", - "prompt_template = [\n", - " ChatMessage.from_user(\n", - " \"\"\"\n", - "Create a JSON object from the information present in this passage: {{passage}}.\n", - "Only use information that is present in the passage. 
Follow this JSON schema, but only return the actual instances without any additional schema definition:\n", - "{{schema}}\n", - "Make sure your response is a dict and not a list.\n", - "{% if invalid_replies and error_message %}\n", - " You already created the following output in a previous attempt: {{invalid_replies}}\n", - " However, this doesn't comply with the format requirements from above and triggered this Python exception: {{error_message}}\n", - " Correct the output and try again. Just return the corrected output without any extra explanations.\n", - "{% endif %}\n", - "\"\"\"\n", - " )\n", - "]\n", - "prompt_builder = ChatPromptBuilder(template=prompt_template)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KM9-Zq2FL7Nn" - }, - "source": [ - "## Initalizing the ChatGenerator\n", - "\n", - "[OpenAIChatGenerator](https://docs.haystack.deepset.ai/docs/openaichatgenerator) generates\n", - "text using OpenAI's `gpt-4o-mini` model by default. Set the `OPENAI_API_KEY` variable and provide a model name to the ChatGenerator." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "Z4cQteIgunUR" - }, - "outputs": [], - "source": [ - "import os\n", - "from getpass import getpass\n", - "\n", - "from haystack.components.generators.chat import OpenAIChatGenerator\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n", - "chat_generator = OpenAIChatGenerator()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zbotIOgXHkC5" - }, - "source": [ - "## Building the Pipeline\n", - "\n", - "Add all components to your pipeline and connect them. Add connections from `output_validator` back to the `prompt_builder` for cases where the produced JSON doesn't comply with the JSON schema. Set `max_runs_per_component` to avoid infinite looping." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "eFglN9YEv-1W" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "\n", - "๐Ÿš… Components\n", - " - prompt_builder: ChatPromptBuilder\n", - " - llm: OpenAIChatGenerator\n", - " - output_validator: OutputValidator\n", - "๐Ÿ›ค๏ธ Connections\n", - " - prompt_builder.prompt -> llm.messages (List[ChatMessage])\n", - " - llm.replies -> output_validator.replies (List[ChatMessage])\n", - " - output_validator.invalid_replies -> prompt_builder.invalid_replies (Optional[List[str]])\n", - " - output_validator.error_message -> prompt_builder.error_message (Optional[str])" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from haystack import Pipeline\n", - "\n", - "pipeline = Pipeline(max_runs_per_component=5)\n", - "\n", - "# Add components to your pipeline\n", - "pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n", - "pipeline.add_component(instance=chat_generator, name=\"llm\")\n", - "pipeline.add_component(instance=output_validator, name=\"output_validator\")\n", - "\n", - "# Now, connect the components to each other\n", - "pipeline.connect(\"prompt_builder.prompt\", \"llm.messages\")\n", - "pipeline.connect(\"llm.replies\", \"output_validator\")\n", - "# If a component has more than one output or input, explicitly specify the connections:\n", - "pipeline.connect(\"output_validator.invalid_replies\", \"prompt_builder.invalid_replies\")\n", - "pipeline.connect(\"output_validator.error_message\", \"prompt_builder.error_message\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-UKW5wtIIT7w" - }, - "source": [ - "### Visualize the Pipeline\n", - "\n", - "Draw the pipeline with the [`draw()`](https://docs.haystack.deepset.ai/docs/drawing-pipeline-graphs) method to confirm the connections are correct. You can find the diagram in the Files section of this Colab." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "RZJg6YHId300" - }, - "outputs": [], - "source": [ - "# pipeline.draw(\"auto-correct-pipeline.png\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kV_kexTjImpo" - }, - "source": [ - "## Testing the Pipeline\n", - "\n", - "Run the pipeline with an example passage that you want to convert into a JSON format and the `json_schema` you have created for `CitiesData`. For the given example passage, the generated JSON object should be like:\n", - "```json\n", - "{\n", - " \"cities\": [\n", - " {\n", - " \"name\": \"Berlin\",\n", - " \"country\": \"Germany\",\n", - " \"population\": 3850809\n", - " },\n", - " {\n", - " \"name\": \"Paris\",\n", - " \"country\": \"France\",\n", - " \"population\": 2161000\n", - " },\n", - " {\n", - " \"name\": \"Lisbon\",\n", - " \"country\": \"Portugal\",\n", - " \"population\": 504718\n", - " }\n", - " ]\n", - "}\n", - "```\n", - "The output of the LLM should be compliant with the `json_schema`. If the LLM doesn't generate the correct JSON object, it will loop back and try again." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yIoMedb6eKia", - "outputId": "4a9ef924-cf26-4908-d83f-b0bc0dc03b54" - }, - "outputs": [], - "source": [ - "passage = \"Berlin is the capital of Germany. It has a population of 3,850,809. Paris, France's capital, has 2.161 million residents. 
Lisbon is the capital and the largest city of Portugal with the population of 504,718.\"\n", - "result = pipeline.run({\"prompt_builder\": {\"passage\": passage, \"schema\": json_schema}})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WWxmPgADS_Fa" - }, - "source": [ - "> If you encounter `PipelineMaxLoops: Maximum loops count (5) exceeded for component 'prompt_builder'.` error, consider increasing the maximum loop count or simply rerun the pipeline." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eWPawSjgSJAM" - }, - "source": [ - "### Print the Correct JSON\n", - "If you didn't get any error, you can now print the corrected JSON." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "BVO47gXQQnDC", - "outputId": "460a10d4-a69a-49cd-bbb2-fc4980907299" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'cities': [{'name': 'Berlin', 'country': 'Germany', 'population': 3850809}, {'name': 'Paris', 'country': 'France', 'population': 2161000}, {'name': 'Lisbon', 'country': 'Portugal', 'population': 504718}]}\n" - ] - } - ], - "source": [ - "valid_reply = result[\"output_validator\"][\"valid_replies\"][0].text\n", - "valid_json = json.loads(valid_reply)\n", - "print(valid_json)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Egz_4h2vI_QL" - }, - "source": [ - "## What's next\n", - "\n", - "๐ŸŽ‰ Congratulations! You've built a system that generates structured JSON out of unstructured text passages, and auto-corrects it by using the looping functionality of Haystack pipelines.\n", - "\n", - "To stay up to date on the latest Haystack developments, you can [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates) and [join Haystack discord community](https://discord.gg/haystack).\n", - "\n", - "Thanks for reading!" 
- ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/28_Structured_Output_With_OpenAI.ipynb b/tutorials/28_Structured_Output_With_OpenAI.ipynb new file mode 100644 index 0000000..c6e9ebf --- /dev/null +++ b/tutorials/28_Structured_Output_With_OpenAI.ipynb @@ -0,0 +1,616 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "AVBtOVlNJ51C" + }, + "source": [ + "# Tutorial: Generating Structured Output with OpenAIChatGenerator\n", + "\n", + "- **Level**: Beginner\n", + "- **Time to complete**: 10 minutes\n", + "- **Prerequisites**: You must have an API key from an active OpenAI account, as this tutorial uses OpenAI's gpt-4o-mini and gpt-5-mini models.\n", + "- **Components Used**: `ChatPromptBuilder`, `OpenAIChatGenerator`, `OpenAIResponsesChatGenerator`\n", + "- **Goal**: Learn how to generate structured outputs with `OpenAIChatGenerator` or `OpenAIResponsesChatGenerator` using a Pydantic model or a JSON schema.\n", + "\n", + "## Overview\n", + "This tutorial shows how to produce structured outputs by providing either a [Pydantic](https://github.com/pydantic/pydantic) model or a JSON schema to `OpenAIChatGenerator` and `OpenAIResponsesChatGenerator`.\n", + "\n", + "Note: Only newer OpenAI models, starting with `gpt-4o-mini`, support this feature.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jmiAHh1oGsKI" + }, + "source": [ + "## Preparing the Colab Environment\n", + "\n", + "Enable the debug mode of logging:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "Vor9IHuNRvEh" + }, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "logging.basicConfig()\n", + "logging.getLogger(\"canals.pipeline.pipeline\").setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ljbWiyJkKiPw" + }, + "source": [ + "## Installing Dependencies\n", + "Install Haystack and [colorama](https://pypi.org/project/colorama/) with pip:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [],
+ "source": [ + "%%bash\n", + "\n", + "pip install \"haystack-ai==2.20.0\"\n", + "pip install colorama" + ] + },
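+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optionally, confirm that the pinned version was installed (a quick sanity check):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Quick sanity check: the version printed here should be 2.20.0\n", + "import haystack\n", + "\n", + "print(haystack.__version__)" + ] + },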
+ { + "cell_type": "markdown", + "metadata": { + "id": "Cmjfa8CiCeFl" + }, + "source": [ + "## Structured outputs in OpenAIChatGenerator\n", + "\n", + "### Using Pydantic Models\n", + "First, we'll see how to pass a Pydantic model to `OpenAIChatGenerator`. For this purpose, we define two [Pydantic models](https://docs.pydantic.dev/1.10/usage/models/), `City` and `CitiesData`. These models specify the fields and types that represent the data structure we want." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "xwKrDOOGdaAz" + }, + "outputs": [], + "source": [ + "from typing import List\n", + "from pydantic import BaseModel\n", + "\n", + "\n", + "class City(BaseModel):\n", + "    name: str\n", + "    country: str\n", + "    population: int\n", + "\n", + "\n", + "class CitiesData(BaseModel):\n", + "    cities: List[City]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zv-6-l_PCeFl" + }, + "source": [ + "> You can change these models according to the format you wish to extract from the text." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xcIWKjW4k42r" + }, + "source": [ + "\n", + "Use `ChatPromptBuilder` in the pipeline to pass the user's message to `OpenAIChatGenerator`.\n", + "For information about Jinja2 templates and `ChatPromptBuilder`, see [ChatPromptBuilder](https://docs.haystack.deepset.ai/docs/chatpromptbuilder)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "ohPpNALjdVKt" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:haystack.components.builders.chat_prompt_builder:ChatPromptBuilder has 1 prompt variables, but `required_variables` is not set. By default, all prompt variables are treated as optional, which may lead to unintended behavior in multi-branch pipelines. To avoid unexpected execution, ensure that variables intended to be required are explicitly set in `required_variables`.\n" + ] + } + ], + "source": [ + "from haystack.components.builders import ChatPromptBuilder\n", + "from haystack.dataclasses import ChatMessage\n", + "\n", + "\n", + "prompt_template = [ChatMessage.from_user(\"User Input: {{passage}}\")]\n", + "prompt_builder = ChatPromptBuilder(template=prompt_template)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KM9-Zq2FL7Nn" + }, + "source": [ + "\n", + "[OpenAIChatGenerator](https://docs.haystack.deepset.ai/docs/openaichatgenerator) generates\n", + "text using OpenAI's `gpt-4o-mini` model by default. We pass our Pydantic model to the `response_format` parameter in `generation_kwargs`.\n", + "\n", + "We also need to set the `OPENAI_API_KEY` variable.\n", + "\n", + "Note: You can also set `response_format` in the `generation_kwargs` parameter of the chat generator's `run` method, as shown in the sketch below." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "Z4cQteIgunUR" + }, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "from haystack.components.generators.chat import OpenAIChatGenerator\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n", + "chat_generator = OpenAIChatGenerator(generation_kwargs={\"response_format\": CitiesData})" + ] + },
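+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As mentioned in the note above, `response_format` can also be supplied at run time. Here is a minimal sketch of that variant; run-time `generation_kwargs` take precedence over the ones set in the constructor, and the message text is only an illustrative example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: pass response_format in the run method instead of the constructor\n", + "ad_hoc_generator = OpenAIChatGenerator()\n", + "ad_hoc_result = ad_hoc_generator.run(\n", + "    messages=[ChatMessage.from_user(\"Berlin is the capital of Germany. It has a population of 3,850,809.\")],\n", + "    generation_kwargs={\"response_format\": CitiesData},\n", + ")" + ] + },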
+ { + "cell_type": "markdown", + "metadata": { + "id": "zbotIOgXHkC5" + }, + "source": [ + "\n", + "Add all components to your pipeline and connect them." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "eFglN9YEv-1W" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "🚅 Components\n", + "  - prompt_builder: ChatPromptBuilder\n", + "  - llm: OpenAIChatGenerator\n", + "🛤️ Connections\n", + "  - prompt_builder.prompt -> llm.messages (list[ChatMessage])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from haystack import Pipeline\n", + "\n", + "pipeline = Pipeline(max_runs_per_component=5)\n", + "\n", + "# Add components to your pipeline\n", + "pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n", + "pipeline.add_component(instance=chat_generator, name=\"llm\")\n", + "\n", + "# Now, connect the components to each other\n", + "pipeline.connect(\"prompt_builder.prompt\", \"llm.messages\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kV_kexTjImpo" + }, + "source": [ + "## Testing the Pipeline\n", + "\n", + "Run the pipeline with an example passage that you want to convert into JSON. Because the generator is constrained by the `CitiesData` model you passed to `response_format`, the generated JSON object for the given example passage should look like this:\n", + "```json\n", + "{\n", + "  \"cities\": [\n", + "    {\n", + "      \"name\": \"Berlin\",\n", + "      \"country\": \"Germany\",\n", + "      \"population\": 3850809\n", + "    },\n", + "    {\n", + "      \"name\": \"Paris\",\n", + "      \"country\": \"France\",\n", + "      \"population\": 2161000\n", + "    },\n", + "    {\n", + "      \"name\": \"Lisbon\",\n", + "      \"country\": \"Portugal\",\n", + "      \"population\": 504718\n", + "    }\n", + "  ]\n", + "}\n", + "```\n", + "The output of the LLM should be compliant with the schema." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yIoMedb6eKia", + "outputId": "4a9ef924-cf26-4908-d83f-b0bc0dc03b54" + }, + "outputs": [], + "source": [ + "passage = \"Berlin is the capital of Germany. It has a population of 3,850,809. Paris, France's capital, has 2.161 million residents. Lisbon is the capital and the largest city of Portugal with the population of 504,718.\"\n", + "result = pipeline.run({\"prompt_builder\": {\"passage\": passage}})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eWPawSjgSJAM" + }, + "source": [ + "### Print the Generated JSON\n", + "If you didn't get any errors, you can now print the generated JSON." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BVO47gXQQnDC", + "outputId": "460a10d4-a69a-49cd-bbb2-fc4980907299" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'cities': [{'name': 'Berlin', 'country': 'Germany', 'population': 3850809}, {'name': 'Paris', 'country': 'France', 'population': 2161000}, {'name': 'Lisbon', 'country': 'Portugal', 'population': 504718}]}\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "valid_reply = result[\"llm\"][\"replies\"][0].text\n", + "valid_json = json.loads(valid_reply)\n", + "print(valid_json)" + ] + },
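+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because the reply was generated against the `CitiesData` model, you can also parse it straight back into typed Pydantic objects. A small sketch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: turn the JSON reply back into Pydantic objects for typed access\n", + "cities_data = CitiesData.model_validate_json(valid_reply)\n", + "for city in cities_data.cities:\n", + "    print(f\"{city.name} ({city.country}): {city.population:,} inhabitants\")" + ] + },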
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using a JSON Schema\n", + "\n", + "Now, we'll create a JSON schema for the `CitiesData` model and pass it to `OpenAIChatGenerator`. OpenAI expects schemas in a specific format, so the schema generated with `model_json_schema()` cannot be used directly.\n", + "\n", + "For details on how to create schemas for OpenAI, see the [OpenAI Structured Outputs guide](https://platform.openai.com/docs/guides/structured-outputs#supported-schemas)." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "cities_data_schema = {\n", + "  \"type\": \"json_schema\",\n", + "  \"json_schema\": {\n", + "    \"name\": \"CitiesData\",\n", + "    \"schema\": {\n", + "      \"type\": \"object\",\n", + "      \"properties\": {\n", + "        \"cities\": {\n", + "          \"type\": \"array\",\n", + "          \"items\": {\n", + "            \"type\": \"object\",\n", + "            \"properties\": {\n", + "              \"name\": { \"type\": \"string\" },\n", + "              \"country\": { \"type\": \"string\" },\n", + "              \"population\": { \"type\": \"integer\" }\n", + "            },\n", + "            \"required\": [\"name\", \"country\", \"population\"],\n", + "            \"additionalProperties\": False\n", + "          }\n", + "        }\n", + "      },\n", + "      \"required\": [\"cities\"],\n", + "      \"additionalProperties\": False\n", + "    },\n", + "    \"strict\": True\n", + "  }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pass this JSON schema to the `response_format` parameter of the chat generator. We run the generator individually to see the output." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"cities\":[{\"name\":\"Berlin\",\"country\":\"Germany\",\"population\":3850809},{\"name\":\"Paris\",\"country\":\"France\",\"population\":2161000},{\"name\":\"Lisbon\",\"country\":\"Portugal\",\"population\":504718}]}\n" + ] + } + ], + "source": [ + "chat_generator = OpenAIChatGenerator(generation_kwargs={\"response_format\": cities_data_schema})\n", + "\n", + "messages = [ChatMessage.from_user(\"Berlin is the capital of Germany. It has a population of 3,850,809. Paris, France's capital, has 2.161 million residents. Lisbon is the capital and the largest city of Portugal with the population of 504,718.\")]\n", + "\n", + "result = chat_generator.run(messages=messages)\n", + "\n", + "print(result[\"replies\"][0].text)" + ] + },
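+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you'd rather not write this envelope by hand, you can derive it from the Pydantic model. The helper below is a rough sketch, assuming your model only uses JSON Schema features that OpenAI structured outputs support; `to_openai_schema` is just an illustrative name:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: build the OpenAI response_format envelope from a Pydantic model\n", + "def to_openai_schema(model):\n", + "    schema = model.model_json_schema()\n", + "\n", + "    # OpenAI strict mode requires additionalProperties: false on every object\n", + "    def forbid_extras(node):\n", + "        if isinstance(node, dict):\n", + "            if node.get(\"type\") == \"object\":\n", + "                node.setdefault(\"additionalProperties\", False)\n", + "            for value in node.values():\n", + "                forbid_extras(value)\n", + "        elif isinstance(node, list):\n", + "            for item in node:\n", + "                forbid_extras(item)\n", + "\n", + "    forbid_extras(schema)\n", + "    return {\"type\": \"json_schema\", \"json_schema\": {\"name\": model.__name__, \"schema\": schema, \"strict\": True}}\n", + "\n", + "\n", + "generated_schema = to_openai_schema(CitiesData)" + ] + },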
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Structured outputs in OpenAIResponsesChatGenerator\n", + "\n", + "### Using Pydantic Models\n", + "We'll use the models `City` and `CitiesData` defined above.\n", + "[OpenAIResponsesChatGenerator](https://docs.haystack.deepset.ai/docs/openairesponseschatgenerator) generates\n", + "text using OpenAI's `gpt-5-mini` model by default. We pass our Pydantic model to the `text_format` parameter in `generation_kwargs`.\n", + "\n", + "We also need to set the `OPENAI_API_KEY` variable.\n", + "\n", + "Note: Here we set `text_format` in the constructor's `generation_kwargs`; you can also pass it when calling the `run` method, which is recommended for better compatibility.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "from haystack.components.generators.chat import OpenAIResponsesChatGenerator\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n", + "responses_generator = OpenAIResponsesChatGenerator(generation_kwargs={\"text_format\": CitiesData})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check the structured output with a simple user message." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'replies': [ChatMessage(_role=<ChatRole.ASSISTANT: 'assistant'>, _content=[ReasoningContent(reasoning_text='', extra={'id': 'rs_02308545114ab79f00691dee6614f08198a6a7a20c09e0071e', 'type': 'reasoning'}), TextContent(text='{\\n  \"cities\": [\\n    {\\n      \"name\": \"Berlin\",\\n      \"country\": \"Germany\",\\n      \"population\": 3850809\\n    },\\n    {\\n      \"name\": \"Paris\",\\n      \"country\": \"France\",\\n      \"population\": 2161000\\n    }\\n  ]\\n}')], _name=None, _meta={'id': 'resp_02308545114ab79f00691dee6528d08198bf3e9e82903570a4', 'created_at': 1763569253.0, 'error': None, 'incomplete_details': None, 'instructions': None, 'metadata': {}, 'model': 'gpt-5-mini-2025-08-07', 'object': 'response', 'parallel_tool_calls': True, 'temperature': 1.0, 'tool_choice': 'auto', 'tools': [], 'top_p': 1.0, 'background': False, 'max_output_tokens': None, 'max_tool_calls': None, 'previous_response_id': None, 'prompt_cache_key': None, 'reasoning': {'effort': 'medium', 'summary': None}, 'safety_identifier': None, 'service_tier': 'default', 'status': 'completed', 'text': {'format': {'name': 'CitiesData', 'schema': {'$defs': {'City': {'properties': {'name': {'title': 'Name', 'type': 'string'}, 'country': {'title': 'Country', 'type': 'string'}, 'population': {'title': 'Population', 'type': 'integer'}}, 'required': ['name', 'country', 'population'], 'title': 'City', 'type': 'object', 'additionalProperties': False}}, 'properties': {'cities': {'items': {'$ref': '#/$defs/City'}, 'title': 'Cities', 'type': 'array'}}, 'required': ['cities'], 'title': 'CitiesData', 'type': 'object', 'additionalProperties': False}, 'type': 'json_schema', 'description': None, 'strict': True}, 'verbosity': 'medium'}, 'top_logprobs': 0, 'truncation': 'disabled', 'usage': {'input_tokens': 131, 'input_tokens_details': {'cached_tokens': 0}, 'output_tokens': 265, 'output_tokens_details': {'reasoning_tokens': 192}, 'total_tokens': 396}, 'user': None, 'billing': {'payer': 'developer'}, 'prompt_cache_retention': None, 'store': True, 'logprobs': [[]]})]}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "responses_generator.run(messages=[ChatMessage.from_user(\"Berlin is the capital of Germany. It has a population of 3,850,809. Paris, France's capital, has 2.161 million residents.\")])" + ] + },
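+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In practice it is worth guarding the parsing step: even with a strict schema, the model can return a refusal as plain text instead of schema-compliant JSON. A defensive sketch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: validate the structured reply and handle non-compliant output\n", + "from pydantic import ValidationError\n", + "\n", + "reply = responses_generator.run(messages=[ChatMessage.from_user(\"Berlin has a population of 3,850,809.\")])[\"replies\"][0]\n", + "try:\n", + "    cities = CitiesData.model_validate_json(reply.text)\n", + "    print(cities)\n", + "except ValidationError as err:\n", + "    print(f\"Reply did not match the schema: {err}\")" + ] + },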
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using a JSON Schema\n", + "\n", + "Now, we'll create a JSON schema for the `CitiesData` model and pass it to `OpenAIResponsesChatGenerator`. We cannot use the same schema we defined for `OpenAIChatGenerator`, as the OpenAI Responses API expects a different schema format.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "cities_data_schema_responses = {\n", + "  \"format\": {\n", + "    \"type\": \"json_schema\",\n", + "    \"name\": \"CitiesData\",\n", + "    \"schema\": {\n", + "      \"type\": \"object\",\n", + "      \"properties\": {\n", + "        \"cities\": {\n", + "          \"type\": \"array\",\n", + "          \"items\": {\n", + "            \"type\": \"object\",\n", + "            \"properties\": {\n", + "              \"name\": { \"type\": \"string\" },\n", + "              \"country\": { \"type\": \"string\" },\n", + "              \"population\": { \"type\": \"integer\" }\n", + "            },\n", + "            \"required\": [\"name\", \"country\", \"population\"],\n", + "            \"additionalProperties\": False\n", + "          }\n", + "        }\n", + "      },\n", + "      \"required\": [\"cities\"],\n", + "      \"additionalProperties\": False\n", + "    },\n", + "    \"strict\": True\n", + "  }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We pass our JSON schema to the `text` parameter in `generation_kwargs`.\n", + "\n", + "Note: You can also pass `text` in the `generation_kwargs` parameter of the chat generator's `run` method." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'cities': [{'name': 'Berlin', 'country': 'Germany', 'population': 3850809}, {'name': 'Paris', 'country': 'France', 'population': 2161000}]}\n" + ] + } + ], + "source": [ + "chat_generator = OpenAIResponsesChatGenerator(generation_kwargs={\"text\": cities_data_schema_responses})\n", + "\n", + "result = chat_generator.run(messages=[ChatMessage.from_user(\"Berlin is the capital of Germany. It has a population of 3,850,809. Paris, France's capital, has 2.161 million residents.\")])\n", + "parsed = json.loads(result[\"replies\"][0].text)\n", + "\n", + "print(parsed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Egz_4h2vI_QL" + }, + "source": [ + "## What's next\n", + "\n", + "🎉 Congratulations! You've learned how to produce structured outputs with `OpenAIChatGenerator` and `OpenAIResponsesChatGenerator` using Pydantic models and JSON schemas.\n", + "\n", + "Other chat generators that also support structured outputs: `MistralChatGenerator`, `OpenRouterChatGenerator`, `NvidiaChatGenerator`, `MetaLlamaChatGenerator`.\n", + "\n", + "To stay up to date on the latest Haystack developments, you can [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates) and [join the Haystack Discord community](https://discord.gg/haystack).\n", + "\n", + "Thanks for reading!" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}