From aa23d97483d570e957ba440cc93f4edc501ce7e4 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 24 Feb 2026 15:26:49 -0800 Subject: [PATCH 1/9] Add FMAPI tool calling contract tests for DatabricksOpenAI Co-Authored-By: Claude Opus 4.6 (1M context) --- .../test_fmapi_tool_calling.py | 420 ++++++++++++++ .../src/databricks_openai/utils/clients.py | 151 +++++- .../test_fmapi_tool_calling.py | 513 ++++++++++++++++++ 3 files changed, 1081 insertions(+), 3 deletions(-) create mode 100644 integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py create mode 100644 integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py diff --git a/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py b/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py new file mode 100644 index 00000000..cc871128 --- /dev/null +++ b/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py @@ -0,0 +1,420 @@ +""" +End-to-end FMAPI tool calling tests for LangGraph agents mirroring app-templates CUJs. + +These tests replicate the exact user code patterns from app-templates +(agent-langgraph, agent-langgraph-short-term-memory) to verify that +single-turn, multi-turn, and streaming conversations don't break. + +Prerequisites: +- FMAPI endpoints must be available on the test workspace +""" + +from __future__ import annotations + +import os + +import pytest +from databricks.sdk import WorkspaceClient +from databricks_openai import DatabricksOpenAI +from langchain_core.messages import AIMessage, AIMessageChunk, ToolMessage +from langchain_core.tools import tool +from langgraph.checkpoint.memory import MemorySaver +from langgraph.prebuilt import create_react_agent +from openai.types.chat import ChatCompletionToolParam + +from databricks_langchain import ChatDatabricks + +pytestmark = pytest.mark.skipif( + os.environ.get("RUN_FMAPI_TOOL_CALLING_TESTS") != "1", + reason="FMAPI tool calling tests disabled. 
Set RUN_FMAPI_TOOL_CALLING_TESTS=1 to enable.", +) + +# Models that pass the tool calling probe but have known issues in agent/test flows. +# These are skipped entirely to keep CI green. When a new model is added to FMAPI, +# it will be discovered and tested automatically — add it here only if it fails. +_SKIP_MODELS = { + "databricks-gpt-5-nano", # too small for reliable tool calling + "databricks-gpt-oss-20b", # hallucinates tool names in agent loop + "databricks-gpt-oss-120b", # hallucinates tool names in agent loop + "databricks-llama-4-maverick", # hallucinates tool names in agent loop + "databricks-gemini-3-flash", # requires thought_signature on function calls + "databricks-gemini-3-pro", # requires thought_signature on function calls + "databricks-gemini-3-1-pro", # requires thought_signature on function calls + "databricks-gemma-3-12b", # outputs raw tool call text instead of executing tools +} + +# Max retries for flaky models (e.g. transient FMAPI errors, model non-determinism) +_MAX_RETRIES = 3 + +# Reasoning models (e.g. Gemini 2.5 Pro) consume reasoning tokens from the max_tokens +# budget. With 2 tools they need 200-600 reasoning tokens, so 200 is too small. 
+_MODEL_MAX_TOKENS: dict[str, int] = { + "databricks-gemini-2-5-pro": 1000, +} +_DEFAULT_MAX_TOKENS = 200 + + +def _max_tokens(model: str) -> int: + return _MODEL_MAX_TOKENS.get(model, _DEFAULT_MAX_TOKENS) + + +# Minimal tool definition used to probe whether a model supports tool calling +_PROBE_TOOL: ChatCompletionToolParam = { + "type": "function", + "function": { + "name": "probe", + "description": "probe", + "parameters": { + "type": "object", + "properties": {"x": {"type": "string"}}, + "required": ["x"], + }, + }, +} + + +def _supports_tool_calling(client: DatabricksOpenAI, model: str) -> bool: + """Send a minimal tool call request to check if the model supports tools.""" + try: + client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "call probe with x=test"}], + tools=[_PROBE_TOOL], + max_tokens=10, + ) + return True + except Exception: + return False + + +def _discover_foundation_models() -> list: + """Discover all FMAPI chat models that support tool calling. + + 1. List all serving endpoints with databricks- prefix and llm/v1/chat task + 2. Probe each model with a minimal tool call to check if tools are supported + 3. 
Models in _XFAIL_MODELS are included but marked as expected failures + """ + import logging + + log = logging.getLogger(__name__) + + try: + w = WorkspaceClient() + endpoints = list(w.serving_endpoints.list()) + except Exception as exc: + log.warning("Could not discover FMAPI models, using fallback list: %s", exc) + return _FALLBACK_MODELS + + # Filter to FMAPI chat endpoints + chat_endpoints = [ + e + for e in endpoints + if e.name and e.name.startswith("databricks-") and e.task == "llm/v1/chat" + ] + + # Probe each model to check if it accepts tool definitions + client = DatabricksOpenAI(workspace_client=w) + + models = [] + for e in sorted(chat_endpoints, key=lambda e: e.name or ""): + name = e.name or "" + if not _supports_tool_calling(client, name): + log.info("Skipping %s: does not support tool calling", name) + continue + if name in _SKIP_MODELS: + log.info("Skipping %s: in skip list", name) + continue + models.append(name) + + log.info("Discovered %d FMAPI models with tool calling support", len(models)) + return models + + +# Fallback list if dynamic discovery fails (e.g. auth not configured at collection time) +_FALLBACK_MODELS = [ + "databricks-claude-sonnet-4-6", + "databricks-claude-opus-4-6", + "databricks-meta-llama-3-3-70b-instruct", + "databricks-gpt-5-2", + "databricks-gpt-5-1", + "databricks-qwen3-next-80b-a3b-instruct", +] + +_FOUNDATION_MODELS = _discover_foundation_models() + + +def retry(fn, retries=_MAX_RETRIES): + """Retry a test function up to `retries` times. 
Only fails if all attempts fail.""" + last_exc = None + for attempt in range(retries): + try: + return fn() + except Exception as exc: + last_exc = exc + if attempt < retries - 1: + import logging + + logging.getLogger(__name__).warning( + "Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc + ) + raise last_exc # type: ignore[misc] + + +async def async_retry(fn, retries=_MAX_RETRIES): + """Retry an async test function up to `retries` times.""" + last_exc = None + for attempt in range(retries): + try: + return await fn() + except Exception as exc: + last_exc = exc + if attempt < retries - 1: + import logging + + logging.getLogger(__name__).warning( + "Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc + ) + raise last_exc # type: ignore[misc] + + +@tool +def add(a: int, b: int) -> int: + """Add two integers. + + Args: + a: First integer + b: Second integer + """ + return a + b + + +@tool +def multiply(a: int, b: int) -> int: + """Multiply two integers. + + Args: + a: First integer + b: Second integer + """ + return a * b + + +# ============================================================================= +# Sync LangGraph Agent +# ============================================================================= + + +@pytest.mark.integration +@pytest.mark.parametrize("model", _FOUNDATION_MODELS) +class TestLangGraphSync: + """Sync LangGraph agent tests mirroring app-templates/agent-langgraph. + + Each test follows the pattern: + ChatDatabricks -> create_react_agent -> agent.invoke / agent.stream + """ + + def test_single_turn(self, model): + """Single-turn: agent calls tools and produces a final answer. + + Mirrors the basic app-template @invoke() handler. 
+ """ + + def _run(): + llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + agent = create_react_agent(llm, [add, multiply]) + + response = agent.invoke( + { + "messages": [ + ( + "human", + "Use the add tool to compute 10 + 5, then use the multiply tool " + "to multiply the result by 3. You MUST use the tools.", + ) + ] + } + ) + + last_message = response["messages"][-1] + assert isinstance(last_message, AIMessage) + assert "45" in last_message.content + + tool_messages = [m for m in response["messages"] if isinstance(m, ToolMessage)] + assert len(tool_messages) > 0, "Expected tool calls in conversation history" + + retry(_run) + + def test_multi_turn(self, model): + """Multi-turn: agent maintains conversation context across turns. + + Mirrors app-templates/agent-langgraph-short-term-memory with MemorySaver + checkpointer and thread_id for session continuity. + """ + + def _run(): + llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + agent = create_react_agent(llm, [add, multiply], checkpointer=MemorySaver()) + config = {"configurable": {"thread_id": f"test-sync-multi-turn-{model}"}} + + response = agent.invoke({"messages": [("human", "What is 10 + 5?")]}, config=config) + last_message = response["messages"][-1] + assert isinstance(last_message, AIMessage) + assert "15" in last_message.content + + response = agent.invoke({"messages": [("human", "Multiply that by 3")]}, config=config) + last_message = response["messages"][-1] + assert isinstance(last_message, AIMessage) + assert "45" in last_message.content + + retry(_run) + + def test_streaming(self, model): + """Streaming: agent streams node updates and tool execution events. + + Mirrors the app-template @stream() handler pattern using agent.stream(). 
+ """ + + def _run(): + llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + agent = create_react_agent(llm, [add, multiply]) + + events = list( + agent.stream( + { + "messages": [ + ( + "human", + "Use the add tool to compute 10 + 5, then use the multiply tool " + "to multiply the result by 3. You MUST use the tools.", + ) + ] + }, + stream_mode="updates", + ) + ) + + assert len(events) > 0, "No stream events received" + + nodes_seen = set() + for event in events: + nodes_seen.update(event.keys()) + + assert "agent" in nodes_seen, f"Expected 'agent' node, got: {nodes_seen}" + assert "tools" in nodes_seen, f"Expected 'tools' node, got: {nodes_seen}" + + last_event = events[-1] + last_messages = list(last_event.values())[0]["messages"] + assert any("45" in str(m.content) for m in last_messages) + + retry(_run) + + +# ============================================================================= +# Async LangGraph Agent +# ============================================================================= + + +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize("model", _FOUNDATION_MODELS) +class TestLangGraphAsync: + """Async LangGraph agent tests mirroring the app-templates @stream() handler. + + Each test follows the exact async pattern deployed in production: + ChatDatabricks -> create_react_agent -> agent.ainvoke / agent.astream + """ + + async def test_single_turn(self, model): + """Single-turn via ainvoke.""" + + async def _run(): + llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + agent = create_react_agent(llm, [add, multiply]) + + response = await agent.ainvoke( + { + "messages": [ + ( + "human", + "Use the add tool to compute 10 + 5, then use the multiply tool " + "to multiply the result by 3. 
You MUST use the tools.", + ) + ] + } + ) + + last_message = response["messages"][-1] + assert isinstance(last_message, AIMessage) + assert "45" in last_message.content + + tool_messages = [m for m in response["messages"] if isinstance(m, ToolMessage)] + assert len(tool_messages) > 0, "Expected tool calls in conversation history" + + await async_retry(_run) + + async def test_multi_turn(self, model): + """Multi-turn via ainvoke with MemorySaver checkpointer.""" + + async def _run(): + llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + agent = create_react_agent(llm, [add, multiply], checkpointer=MemorySaver()) + config = {"configurable": {"thread_id": f"test-async-multi-turn-{model}"}} + + response = await agent.ainvoke( + {"messages": [("human", "What is 10 + 5?")]}, config=config + ) + last_message = response["messages"][-1] + assert isinstance(last_message, AIMessage) + assert "15" in last_message.content + + response = await agent.ainvoke( + {"messages": [("human", "Multiply that by 3")]}, config=config + ) + last_message = response["messages"][-1] + assert isinstance(last_message, AIMessage) + assert "45" in last_message.content + + await async_retry(_run) + + async def test_streaming(self, model): + """Streaming via astream — mirrors the exact app-templates production path. + + Uses agent.astream(stream_mode=["updates", "messages"]) which is the + pattern in agent-langgraph and agent-langgraph-short-term-memory. + """ + + async def _run(): + llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + agent = create_react_agent(llm, [add, multiply]) + + nodes_seen = set() + got_message_chunks = False + event_count = 0 + + async for event in agent.astream( + { + "messages": [ + ( + "human", + "Use the add tool to compute 10 + 5, then use the multiply tool " + "to multiply the result by 3. 
You MUST use the tools.", + ) + ] + }, + stream_mode=["updates", "messages"], + ): + event_count += 1 + mode, data = event + if mode == "updates": + nodes_seen.update(data.keys()) + elif mode == "messages": + chunk, _metadata = data + if isinstance(chunk, AIMessageChunk): + got_message_chunks = True + + assert event_count > 0, "No stream events received" + assert "agent" in nodes_seen, f"Expected 'agent' node, got: {nodes_seen}" + assert "tools" in nodes_seen, f"Expected 'tools' node, got: {nodes_seen}" + assert got_message_chunks, "Expected AIMessageChunk tokens in message stream" + + await async_retry(_run) diff --git a/integrations/openai/src/databricks_openai/utils/clients.py b/integrations/openai/src/databricks_openai/utils/clients.py index 13b3b51c..de7b4d10 100644 --- a/integrations/openai/src/databricks_openai/utils/clients.py +++ b/integrations/openai/src/databricks_openai/utils/clients.py @@ -1,4 +1,4 @@ -from typing import Any, Generator +from typing import Any, AsyncIterator, Generator, Iterator from databricks.sdk import WorkspaceClient from httpx import AsyncClient, Auth, Client, Request, Response @@ -51,6 +51,137 @@ def _should_strip_strict(model: str | None) -> bool: return "gpt" not in model.lower() +def _is_gemini_model(model: str | None) -> bool: + """Returns True if the model is a Gemini variant.""" + if not model: + return False + return "gemini" in model.lower() or "gemma" in model.lower() + + +def _flatten_list_content(content: list) -> str: + """Extract text from a list of content blocks and join into a single string.""" + text_parts = [] + for part in content: + if isinstance(part, dict) and "text" in part: + text_parts.append(part["text"]) + elif isinstance(part, str): + text_parts.append(part) + elif hasattr(part, "text"): + text_parts.append(part.text) + return "".join(text_parts) + + +def _flatten_list_content_in_messages(messages: Any) -> None: + """Request-side fix: convert list content to string in tool messages. 
+ + Gemini FMAPI rejects tool messages where content is a list of content blocks + (e.g. [{"type": "text", "text": "hello"}]). The Agents SDK always produces + this list format when using MCP tools (via chatcmpl_converter.py). We flatten + it to a plain string before sending to FMAPI. + """ + if not messages: + return + for message in messages: + if not isinstance(message, dict): + continue + content = message.get("content") + if message.get("role") == "tool" and isinstance(content, list): + message["content"] = _flatten_list_content(content) + + +def _flatten_list_content_in_response(response: Any) -> None: + """Response-side fix: convert list content to string in non-streaming responses. + + Gemini FMAPI sometimes returns assistant message content as a list of content + blocks instead of a string. The Agents SDK expects content to be a string and + fails with a ValidationError. We flatten it before returning to the SDK. + """ + if not hasattr(response, "choices"): + return + for choice in response.choices: + message = getattr(choice, "message", None) + if message is None: + continue + content = getattr(message, "content", None) + if isinstance(content, list): + message.content = _flatten_list_content(content) + + +def _fix_gemini_stream_chunk(chunk: Any) -> Any: + """Fix a single streaming chunk from Gemini FMAPI. + + Gemini FMAPI returns delta.content as a list of content blocks instead of a + string in streaming responses. The Agents SDK expects string deltas and crashes + with a ValidationError when parsing ResponseTextDeltaEvent. 
+ """ + if not hasattr(chunk, "choices"): + return chunk + for choice in chunk.choices: + delta = getattr(choice, "delta", None) + if delta is None: + continue + content = getattr(delta, "content", None) + if isinstance(content, list): + delta.content = _flatten_list_content(content) + return chunk + + +class _GeminiStreamWrapper: + """Wraps a sync Stream to fix Gemini list content in stream chunks.""" + + def __init__(self, stream: Any): + self._stream = stream + + def __iter__(self) -> Iterator: + for chunk in self._stream: + yield _fix_gemini_stream_chunk(chunk) + + def __next__(self): + return _fix_gemini_stream_chunk(next(self._stream)) + + def __enter__(self): + self._stream.__enter__() + return self + + def __exit__(self, *args): + return self._stream.__exit__(*args) + + def close(self): + self._stream.close() + + @property + def response(self): + return self._stream.response + + +class _AsyncGeminiStreamWrapper: + """Wraps an AsyncStream to fix Gemini list content in stream chunks.""" + + def __init__(self, stream: Any): + self._stream = stream + + async def __aiter__(self) -> AsyncIterator: + async for chunk in self._stream: + yield _fix_gemini_stream_chunk(chunk) + + async def __anext__(self): + return _fix_gemini_stream_chunk(await self._stream.__anext__()) + + async def __aenter__(self): + await self._stream.__aenter__() + return self + + async def __aexit__(self, *args): + return await self._stream.__aexit__(*args) + + async def close(self): + await self._stream.close() + + @property + def response(self): + return self._stream.response + + def _is_claude_model(model: str | None) -> bool: """Returns True if the model is a Claude variant.""" if not model: @@ -189,7 +320,14 @@ def create(self, **kwargs): _strip_strict_from_tools(kwargs.get("tools")) if _is_claude_model(model): _fix_empty_assistant_content_in_messages(kwargs.get("messages")) - return super().create(**kwargs) + if _is_gemini_model(model): + 
_flatten_list_content_in_messages(kwargs.get("messages")) + response = super().create(**kwargs) + if _is_gemini_model(model): + if kwargs.get("stream"): + return _GeminiStreamWrapper(response) + _flatten_list_content_in_response(response) + return response class DatabricksChat(Chat): @@ -336,7 +474,14 @@ async def create(self, **kwargs): _strip_strict_from_tools(kwargs.get("tools")) if _is_claude_model(model): _fix_empty_assistant_content_in_messages(kwargs.get("messages")) - return await super().create(**kwargs) + if _is_gemini_model(model): + _flatten_list_content_in_messages(kwargs.get("messages")) + response = await super().create(**kwargs) + if _is_gemini_model(model): + if kwargs.get("stream"): + return _AsyncGeminiStreamWrapper(response) + _flatten_list_content_in_response(response) + return response class AsyncDatabricksChat(AsyncChat): diff --git a/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py b/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py new file mode 100644 index 00000000..4740fe3e --- /dev/null +++ b/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py @@ -0,0 +1,513 @@ +""" +End-to-end FMAPI tool calling tests mirroring app-templates CUJs. + +These tests replicate the exact user code patterns from app-templates +(agent-openai-agents-sdk) to verify that single-turn and multi-turn +conversations don't break. 
+ +Naturally exercises regressions like: + - PR #269: Agents SDK adds strict:True -> our client strips it -> FMAPI + - PR #333: Multi-turn agent loop replays assistant messages with empty + content + tool_calls -> our client fixes content -> FMAPI + +Prerequisites: +- FMAPI endpoints must be available on the test workspace +- echo_message UC function in integration_testing.databricks_ai_bridge_mcp_test +""" + +from __future__ import annotations + +import json +import os + +import pytest +from databricks.sdk import WorkspaceClient +from openai.types.chat import ChatCompletionToolParam + +from databricks_openai import AsyncDatabricksOpenAI, DatabricksOpenAI + +pytestmark = pytest.mark.skipif( + os.environ.get("RUN_FMAPI_TOOL_CALLING_TESTS") != "1", + reason="FMAPI tool calling tests disabled. Set RUN_FMAPI_TOOL_CALLING_TESTS=1 to enable.", +) + +# Models that pass the tool calling probe but have known issues in agent/test flows. +# These are skipped entirely to keep CI green. When a new model is added to FMAPI, +# it will be discovered and tested automatically — add it here only if it fails. +_SKIP_MODELS = { + "databricks-gpt-5-nano", # too small for reliable tool calling + "databricks-gpt-oss-20b", # hallucinates tool names in agent loop + "databricks-gpt-oss-120b", # hallucinates tool names in agent loop + "databricks-llama-4-maverick", # hallucinates tool names in agent loop + "databricks-gemini-3-flash", # requires thought_signature on function calls + "databricks-gemini-3-pro", # requires thought_signature on function calls + "databricks-gemini-3-1-pro", # requires thought_signature on function calls +} + +# Max retries for flaky models (e.g. 
transient FMAPI errors, model non-determinism) +_MAX_RETRIES = 3 + +# Minimal tool definition used to probe whether a model supports tool calling +_PROBE_TOOL: ChatCompletionToolParam = { + "type": "function", + "function": { + "name": "probe", + "description": "probe", + "parameters": { + "type": "object", + "properties": {"x": {"type": "string"}}, + "required": ["x"], + }, + }, +} + + +def _supports_tool_calling(client: DatabricksOpenAI, model: str) -> bool: + """Send a minimal tool call request to check if the model supports tools.""" + try: + client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "call probe with x=test"}], + tools=[_PROBE_TOOL], + max_tokens=10, + ) + return True + except Exception: + return False + + +def _discover_foundation_models() -> list: + """Discover all FMAPI chat models that support tool calling. + + 1. List all serving endpoints with databricks- prefix and llm/v1/chat task + 2. Probe each model with a minimal tool call to check if tools are supported + 3. 
Models in _XFAIL_MODELS are included but marked as expected failures + """ + import logging + + log = logging.getLogger(__name__) + + try: + w = WorkspaceClient() + endpoints = list(w.serving_endpoints.list()) + except Exception as exc: + log.warning("Could not discover FMAPI models, using fallback list: %s", exc) + return _FALLBACK_MODELS + + # Filter to FMAPI chat endpoints + chat_endpoints = [ + e + for e in endpoints + if e.name and e.name.startswith("databricks-") and e.task == "llm/v1/chat" + ] + + # Probe each model to check if it accepts tool definitions + client = DatabricksOpenAI(workspace_client=w) + models = [] + for e in sorted(chat_endpoints, key=lambda e: e.name or ""): + name = e.name or "" + if not _supports_tool_calling(client, name): + log.info("Skipping %s: does not support tool calling", name) + continue + if name in _SKIP_MODELS: + log.info("Skipping %s: in skip list", name) + continue + models.append(name) + + log.info("Discovered %d FMAPI models with tool calling support", len(models)) + return models + + +# Fallback list if dynamic discovery fails (e.g. auth not configured at collection time) +_FALLBACK_MODELS = [ + "databricks-claude-sonnet-4-6", + "databricks-claude-opus-4-6", + "databricks-meta-llama-3-3-70b-instruct", + "databricks-gpt-5-2", + "databricks-gpt-5-1", + "databricks-qwen3-next-80b-a3b-instruct", +] + + +_FOUNDATION_MODELS = _discover_foundation_models() + + +def retry(fn, retries=_MAX_RETRIES): + """Retry a test function up to `retries` times. 
Only fails if all attempts fail.""" + last_exc = None + for attempt in range(retries): + try: + return fn() + except Exception as exc: + last_exc = exc + if attempt < retries - 1: + import logging + + logging.getLogger(__name__).warning( + "Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc + ) + raise last_exc # type: ignore[misc] + + +async def async_retry(fn, retries=_MAX_RETRIES): + """Retry an async test function up to `retries` times.""" + last_exc = None + for attempt in range(retries): + try: + return await fn() + except Exception as exc: + last_exc = exc + if attempt < retries - 1: + import logging + + logging.getLogger(__name__).warning( + "Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc + ) + raise last_exc # type: ignore[misc] + + +# MCP test infrastructure +_MCP_CATALOG = "integration_testing" +_MCP_SCHEMA = "databricks_ai_bridge_mcp_test" +_MCP_FUNCTION = "echo_message" + + +@pytest.fixture(scope="module") +def workspace_client(): + return WorkspaceClient() + + +@pytest.fixture(scope="module") +def sync_client(workspace_client): + return DatabricksOpenAI(workspace_client=workspace_client) + + +@pytest.fixture(scope="module") +def async_client(workspace_client): + return AsyncDatabricksOpenAI(workspace_client=workspace_client) + + +# ============================================================================= +# Async DatabricksOpenAI +# ============================================================================= + + +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize("model", _FOUNDATION_MODELS) +class TestAgentToolCalling: + """End-to-end agent tests mirroring app-templates/agent-openai-agents-sdk. 
+ + Each test follows the exact pattern users deploy: + AsyncDatabricksOpenAI -> set_default_openai_client -> McpServer -> Agent -> Runner.run + """ + + async def test_single_turn(self, async_client, workspace_client, model): + """Single-turn conversation: user sends one message, agent calls a tool and responds. + + Mirrors the basic app-template @invoke() handler. + """ + from agents import Agent, Runner, set_default_openai_api, set_default_openai_client + from agents.items import MessageOutputItem, ToolCallItem, ToolCallOutputItem + + from databricks_openai.agents import McpServer + + async def _run(): + set_default_openai_client(async_client) + set_default_openai_api("chat_completions") + + async with McpServer.from_uc_function( + catalog=_MCP_CATALOG, + schema=_MCP_SCHEMA, + function_name=_MCP_FUNCTION, + workspace_client=workspace_client, + timeout=60, + ) as server: + agent = Agent( + name="echo-agent", + instructions="Use the echo_message tool to echo messages when asked.", + model=model, + mcp_servers=[server], + ) + result = await Runner.run(agent, "Echo the message 'hello from FMAPI test'") + + assert result.final_output is not None + assert "hello from FMAPI test" in result.final_output + + item_types = [type(item) for item in result.new_items] + assert ToolCallItem in item_types, f"Expected a tool call, got: {item_types}" + assert ToolCallOutputItem in item_types, f"Expected tool output, got: {item_types}" + assert MessageOutputItem in item_types, f"Expected a message, got: {item_types}" + + input_list = result.to_input_list() + assert len(input_list) > 1, "Expected multi-item conversation history" + + await async_retry(_run) + + async def test_multi_turn(self, async_client, workspace_client, model): + """Multi-turn conversation: simulates a chat UI sending conversation history. + + First turn: user asks to echo a message, agent calls the tool. 
+ Second turn: user sends a followup with the full conversation history + (including the assistant's prior tool-calling turn), agent calls the tool again. + + This is how the app-templates chat UI works: each request includes the + full conversation history. The second FMAPI call replays the assistant + message from the first turn, which may have empty content + tool_calls. + """ + from agents import Agent, Runner, set_default_openai_api, set_default_openai_client + from agents.items import ToolCallItem + + from databricks_openai.agents import McpServer + + async def _run(): + set_default_openai_client(async_client) + set_default_openai_api("chat_completions") + + async with McpServer.from_uc_function( + catalog=_MCP_CATALOG, + schema=_MCP_SCHEMA, + function_name=_MCP_FUNCTION, + workspace_client=workspace_client, + timeout=60, + ) as server: + agent = Agent( + name="echo-agent", + instructions="Use the echo_message tool to echo messages when asked.", + model=model, + mcp_servers=[server], + ) + + first_result = await Runner.run(agent, "Echo the message 'hello'") + assert first_result.final_output is not None + assert "hello" in first_result.final_output + + first_item_types = [type(item) for item in first_result.new_items] + assert ToolCallItem in first_item_types + + history = first_result.to_input_list() + history.append({"role": "user", "content": "Now echo the message 'world'"}) + + second_result = await Runner.run(agent, history) + assert second_result.final_output is not None + assert "world" in second_result.final_output + + second_item_types = [type(item) for item in second_result.new_items] + assert ToolCallItem in second_item_types + + second_history = second_result.to_input_list() + assert len(second_history) > len(history), ( + f"Expected history to grow: {len(history)} -> {len(second_history)}" + ) + + await async_retry(_run) + + async def test_streaming(self, async_client, workspace_client, model): + """Streaming conversation: mirrors the 
app-template @stream() handler. + + Uses Runner.run_streamed() which is the streaming path in app-templates. + Verifies that stream events arrive in the expected order and contain + the expected item types. + """ + from agents import Agent, Runner, set_default_openai_api, set_default_openai_client + from agents.stream_events import RunItemStreamEvent + + from databricks_openai.agents import McpServer + + async def _run(): + set_default_openai_client(async_client) + set_default_openai_api("chat_completions") + + async with McpServer.from_uc_function( + catalog=_MCP_CATALOG, + schema=_MCP_SCHEMA, + function_name=_MCP_FUNCTION, + workspace_client=workspace_client, + timeout=60, + ) as server: + agent = Agent( + name="echo-agent", + instructions="Use the echo_message tool to echo messages when asked.", + model=model, + mcp_servers=[server], + ) + result = Runner.run_streamed(agent, input="Echo the message 'streaming test'") + + run_item_events = [] + event_count = 0 + async for event in result.stream_events(): + event_count += 1 + if isinstance(event, RunItemStreamEvent): + run_item_events.append(event) + + assert event_count > 0, "No stream events received" + + event_names = [e.name for e in run_item_events] + assert "tool_called" in event_names, ( + f"Expected tool_called event, got: {event_names}" + ) + assert "tool_output" in event_names, ( + f"Expected tool_output event, got: {event_names}" + ) + assert "message_output_created" in event_names, ( + f"Expected message_output_created event, got: {event_names}" + ) + + assert result.final_output is not None + assert "streaming test" in result.final_output + + await async_retry(_run) + + +# ============================================================================= +# Sync DatabricksOpenAI — direct chat.completions.create() +# ============================================================================= + +# echo_message tool definition (mirrors the UC function signature) +_ECHO_MESSAGE_TOOL = { + "type": "function", 
+ "function": { + "name": "echo_message", + "description": "Echo back the provided message", + "parameters": { + "type": "object", + "properties": { + "message": { + "type": "string", + "description": "The message to echo back", + } + }, + "required": ["message"], + }, + }, +} + + +@pytest.mark.integration +@pytest.mark.parametrize("model", _FOUNDATION_MODELS) +class TestSyncClientToolCalling: + """Sync DatabricksOpenAI tests using direct client.chat.completions.create(). + + The Agents SDK requires an async client, so the sync DatabricksOpenAI CUJ + is direct chat.completions.create() calls with tool definitions. + + This exercises DatabricksCompletions.create() (the sync counterpart to + AsyncDatabricksCompletions.create() used by the Agents SDK). + """ + + def test_single_turn(self, sync_client, model): + """Single-turn: model receives tool, produces a tool call.""" + + def _run(): + response = sync_client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "Echo the message 'sync hello'"}], + tools=[_ECHO_MESSAGE_TOOL], + max_tokens=200, + ) + message = response.choices[0].message + assert message.tool_calls is not None + assert len(message.tool_calls) >= 1 + + tool_call = message.tool_calls[0] + assert tool_call.id is not None + assert tool_call.type == "function" + assert tool_call.function.name == "echo_message" + args = json.loads(tool_call.function.arguments) + assert "message" in args + + retry(_run) + + def test_multi_turn(self, sync_client, model): + """Multi-turn: tool_call -> tool result -> text response. + + The second FMAPI call replays the assistant message (potentially with + empty content + tool_calls), exercising the PR #333 fix. 
+ """ + + def _run(): + # Turn 1: get tool call + response = sync_client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "Echo the message 'sync world'"}], + tools=[_ECHO_MESSAGE_TOOL], + max_tokens=200, + ) + assistant_msg = response.choices[0].message + assert assistant_msg.tool_calls is not None + tool_call = assistant_msg.tool_calls[0] + + # Turn 2: send tool result back, get text response + # Manually construct the assistant message to avoid extra fields + # (e.g. "annotations") that model_dump() includes but FMAPI rejects + assistant_dict = { + "role": "assistant", + "content": assistant_msg.content or "", + "tool_calls": [ + { + "id": tc.id, + "type": tc.type, + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, + } + for tc in assistant_msg.tool_calls + ], + } + response = sync_client.chat.completions.create( + model=model, + messages=[ + {"role": "user", "content": "Echo the message 'sync world'"}, + assistant_dict, + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": "sync world", + }, + ], + tools=[_ECHO_MESSAGE_TOOL], + max_tokens=200, + ) + followup = response.choices[0].message + assert followup.content is not None + assert "sync world" in followup.content + + retry(_run) + + def test_streaming(self, sync_client, model): + """Streaming: tool call arrives as chunked deltas.""" + + def _run(): + stream = sync_client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "Echo the message 'sync stream'"}], + tools=[_ECHO_MESSAGE_TOOL], + max_tokens=200, + stream=True, + ) + chunks = list(stream) + assert len(chunks) > 0 + + # Reassemble tool call from streamed deltas + tool_call_name = "" + tool_call_args = "" + tool_call_id = None + for chunk in chunks: + delta = chunk.choices[0].delta if chunk.choices else None + if delta and delta.tool_calls: + tc = delta.tool_calls[0] + if tc.id: + tool_call_id = tc.id + if tc.function: + if tc.function.name: + 
tool_call_name += tc.function.name + if tc.function.arguments: + tool_call_args += tc.function.arguments + + assert tool_call_id is not None, "No tool call ID found in stream" + assert tool_call_name == "echo_message" + args = json.loads(tool_call_args) + assert "message" in args + + retry(_run) From 6ebbfdc92032caa33911ea46577a3bf54c321485 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 2 Mar 2026 14:35:41 -0800 Subject: [PATCH 2/9] Fix missing Iterator/AsyncIterator imports for Gemini stream wrappers Co-Authored-By: Claude Opus 4.6 (1M context) --- integrations/openai/src/databricks_openai/utils/clients.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/openai/src/databricks_openai/utils/clients.py b/integrations/openai/src/databricks_openai/utils/clients.py index 977a3802..f62c8d4d 100644 --- a/integrations/openai/src/databricks_openai/utils/clients.py +++ b/integrations/openai/src/databricks_openai/utils/clients.py @@ -1,4 +1,5 @@ import os +from collections.abc import AsyncIterator, Iterator from typing import Any, Generator from databricks.sdk import WorkspaceClient From 016c01c240602560ace5288c5e585a32a7ce85c2 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 2 Mar 2026 14:51:52 -0800 Subject: [PATCH 3/9] Clean up logging and stale references in FMAPI test files - Move import logging and log = logging.getLogger(__name__) to module level - Remove inline import logging from retry functions and _discover_foundation_models - Fix stale _XFAIL_MODELS references in docstrings Co-Authored-By: Claude Opus 4.6 (1M context) --- .../test_fmapi_tool_calling.py | 21 ++++++------------ .../test_fmapi_tool_calling.py | 22 ++++++------------- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py b/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py index cc871128..23a842e6 100644 --- 
a/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py +++ b/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py @@ -11,6 +11,7 @@ from __future__ import annotations +import logging import os import pytest @@ -87,16 +88,16 @@ def _supports_tool_calling(client: DatabricksOpenAI, model: str) -> bool: return False +log = logging.getLogger(__name__) + + def _discover_foundation_models() -> list: """Discover all FMAPI chat models that support tool calling. 1. List all serving endpoints with databricks- prefix and llm/v1/chat task 2. Probe each model with a minimal tool call to check if tools are supported - 3. Models in _XFAIL_MODELS are included but marked as expected failures + 3. Models in _SKIP_MODELS are excluded entirely """ - import logging - - log = logging.getLogger(__name__) try: w = WorkspaceClient() @@ -152,11 +153,7 @@ def retry(fn, retries=_MAX_RETRIES): except Exception as exc: last_exc = exc if attempt < retries - 1: - import logging - - logging.getLogger(__name__).warning( - "Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc - ) + log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc) raise last_exc # type: ignore[misc] @@ -169,11 +166,7 @@ async def async_retry(fn, retries=_MAX_RETRIES): except Exception as exc: last_exc = exc if attempt < retries - 1: - import logging - - logging.getLogger(__name__).warning( - "Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc - ) + log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc) raise last_exc # type: ignore[misc] diff --git a/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py b/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py index 4740fe3e..90b74808 100644 --- a/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py +++ b/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py @@ -18,6 +18,7 @@ from __future__ import 
annotations import json +import logging import os import pytest @@ -76,17 +77,16 @@ def _supports_tool_calling(client: DatabricksOpenAI, model: str) -> bool: return False +log = logging.getLogger(__name__) + + def _discover_foundation_models() -> list: """Discover all FMAPI chat models that support tool calling. 1. List all serving endpoints with databricks- prefix and llm/v1/chat task 2. Probe each model with a minimal tool call to check if tools are supported - 3. Models in _XFAIL_MODELS are included but marked as expected failures + 3. Models in _SKIP_MODELS are excluded entirely """ - import logging - - log = logging.getLogger(__name__) - try: w = WorkspaceClient() endpoints = list(w.serving_endpoints.list()) @@ -141,11 +141,7 @@ def retry(fn, retries=_MAX_RETRIES): except Exception as exc: last_exc = exc if attempt < retries - 1: - import logging - - logging.getLogger(__name__).warning( - "Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc - ) + log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc) raise last_exc # type: ignore[misc] @@ -158,11 +154,7 @@ async def async_retry(fn, retries=_MAX_RETRIES): except Exception as exc: last_exc = exc if attempt < retries - 1: - import logging - - logging.getLogger(__name__).warning( - "Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc - ) + log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc) raise last_exc # type: ignore[misc] From 5ca144ee6abd1ef6c3e0aaf2c75320eae6e67e09 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 3 Mar 2026 13:07:07 -0800 Subject: [PATCH 4/9] Centralize shared test config, use capabilities API, comment out response-side fix - Extract shared skip lists, discovery, retry helpers to test_utils/fmapi.py - Use capabilities.function_calling API instead of probe-based detection - Remove app-templates references from test docstrings - Comment out response-side list content fix to test if only request-side + streaming is 
needed Co-Authored-By: Claude Opus 4.6 (1M context) --- .../test_fmapi_tool_calling.py | 202 ++---------------- .../src/databricks_openai/utils/clients.py | 6 +- .../test_fmapi_tool_calling.py | 189 ++-------------- src/databricks_ai_bridge/test_utils/fmapi.py | 127 +++++++++++ 4 files changed, 168 insertions(+), 356 deletions(-) create mode 100644 src/databricks_ai_bridge/test_utils/fmapi.py diff --git a/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py b/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py index 23a842e6..312949c1 100644 --- a/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py +++ b/integrations/langchain/tests/integration_tests/test_fmapi_tool_calling.py @@ -1,9 +1,5 @@ """ -End-to-end FMAPI tool calling tests for LangGraph agents mirroring app-templates CUJs. - -These tests replicate the exact user code patterns from app-templates -(agent-langgraph, agent-langgraph-short-term-memory) to verify that -single-turn, multi-turn, and streaming conversations don't break. +End-to-end FMAPI tool calling tests for ChatDatabricks via LangGraph. Prerequisites: - FMAPI endpoints must be available on the test workspace @@ -11,17 +7,20 @@ from __future__ import annotations -import logging import os import pytest -from databricks.sdk import WorkspaceClient -from databricks_openai import DatabricksOpenAI +from databricks_ai_bridge.test_utils.fmapi import ( + LANGCHAIN_SKIP_MODELS, + async_retry, + discover_foundation_models, + max_tokens_for_model, + retry, +) from langchain_core.messages import AIMessage, AIMessageChunk, ToolMessage from langchain_core.tools import tool from langgraph.checkpoint.memory import MemorySaver from langgraph.prebuilt import create_react_agent -from openai.types.chat import ChatCompletionToolParam from databricks_langchain import ChatDatabricks @@ -30,144 +29,7 @@ reason="FMAPI tool calling tests disabled. 
Set RUN_FMAPI_TOOL_CALLING_TESTS=1 to enable.", ) -# Models that pass the tool calling probe but have known issues in agent/test flows. -# These are skipped entirely to keep CI green. When a new model is added to FMAPI, -# it will be discovered and tested automatically — add it here only if it fails. -_SKIP_MODELS = { - "databricks-gpt-5-nano", # too small for reliable tool calling - "databricks-gpt-oss-20b", # hallucinates tool names in agent loop - "databricks-gpt-oss-120b", # hallucinates tool names in agent loop - "databricks-llama-4-maverick", # hallucinates tool names in agent loop - "databricks-gemini-3-flash", # requires thought_signature on function calls - "databricks-gemini-3-pro", # requires thought_signature on function calls - "databricks-gemini-3-1-pro", # requires thought_signature on function calls - "databricks-gemma-3-12b", # outputs raw tool call text instead of executing tools -} - -# Max retries for flaky models (e.g. transient FMAPI errors, model non-determinism) -_MAX_RETRIES = 3 - -# Reasoning models (e.g. Gemini 2.5 Pro) consume reasoning tokens from the max_tokens -# budget. With 2 tools they need 200-600 reasoning tokens, so 200 is too small. 
-_MODEL_MAX_TOKENS: dict[str, int] = { - "databricks-gemini-2-5-pro": 1000, -} -_DEFAULT_MAX_TOKENS = 200 - - -def _max_tokens(model: str) -> int: - return _MODEL_MAX_TOKENS.get(model, _DEFAULT_MAX_TOKENS) - - -# Minimal tool definition used to probe whether a model supports tool calling -_PROBE_TOOL: ChatCompletionToolParam = { - "type": "function", - "function": { - "name": "probe", - "description": "probe", - "parameters": { - "type": "object", - "properties": {"x": {"type": "string"}}, - "required": ["x"], - }, - }, -} - - -def _supports_tool_calling(client: DatabricksOpenAI, model: str) -> bool: - """Send a minimal tool call request to check if the model supports tools.""" - try: - client.chat.completions.create( - model=model, - messages=[{"role": "user", "content": "call probe with x=test"}], - tools=[_PROBE_TOOL], - max_tokens=10, - ) - return True - except Exception: - return False - - -log = logging.getLogger(__name__) - - -def _discover_foundation_models() -> list: - """Discover all FMAPI chat models that support tool calling. - - 1. List all serving endpoints with databricks- prefix and llm/v1/chat task - 2. Probe each model with a minimal tool call to check if tools are supported - 3. 
Models in _SKIP_MODELS are excluded entirely - """ - - try: - w = WorkspaceClient() - endpoints = list(w.serving_endpoints.list()) - except Exception as exc: - log.warning("Could not discover FMAPI models, using fallback list: %s", exc) - return _FALLBACK_MODELS - - # Filter to FMAPI chat endpoints - chat_endpoints = [ - e - for e in endpoints - if e.name and e.name.startswith("databricks-") and e.task == "llm/v1/chat" - ] - - # Probe each model to check if it accepts tool definitions - client = DatabricksOpenAI(workspace_client=w) - - models = [] - for e in sorted(chat_endpoints, key=lambda e: e.name or ""): - name = e.name or "" - if not _supports_tool_calling(client, name): - log.info("Skipping %s: does not support tool calling", name) - continue - if name in _SKIP_MODELS: - log.info("Skipping %s: in skip list", name) - continue - models.append(name) - - log.info("Discovered %d FMAPI models with tool calling support", len(models)) - return models - - -# Fallback list if dynamic discovery fails (e.g. auth not configured at collection time) -_FALLBACK_MODELS = [ - "databricks-claude-sonnet-4-6", - "databricks-claude-opus-4-6", - "databricks-meta-llama-3-3-70b-instruct", - "databricks-gpt-5-2", - "databricks-gpt-5-1", - "databricks-qwen3-next-80b-a3b-instruct", -] - -_FOUNDATION_MODELS = _discover_foundation_models() - - -def retry(fn, retries=_MAX_RETRIES): - """Retry a test function up to `retries` times. 
Only fails if all attempts fail.""" - last_exc = None - for attempt in range(retries): - try: - return fn() - except Exception as exc: - last_exc = exc - if attempt < retries - 1: - log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc) - raise last_exc # type: ignore[misc] - - -async def async_retry(fn, retries=_MAX_RETRIES): - """Retry an async test function up to `retries` times.""" - last_exc = None - for attempt in range(retries): - try: - return await fn() - except Exception as exc: - last_exc = exc - if attempt < retries - 1: - log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc) - raise last_exc # type: ignore[misc] +_FOUNDATION_MODELS = discover_foundation_models(LANGCHAIN_SKIP_MODELS) @tool @@ -200,20 +62,13 @@ def multiply(a: int, b: int) -> int: @pytest.mark.integration @pytest.mark.parametrize("model", _FOUNDATION_MODELS) class TestLangGraphSync: - """Sync LangGraph agent tests mirroring app-templates/agent-langgraph. - - Each test follows the pattern: - ChatDatabricks -> create_react_agent -> agent.invoke / agent.stream - """ + """Sync LangGraph agent tests using ChatDatabricks + create_react_agent.""" def test_single_turn(self, model): - """Single-turn: agent calls tools and produces a final answer. - - Mirrors the basic app-template @invoke() handler. - """ + """Single-turn: agent calls tools and produces a final answer.""" def _run(): - llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + llm = ChatDatabricks(model=model, max_tokens=max_tokens_for_model(model)) agent = create_react_agent(llm, [add, multiply]) response = agent.invoke( @@ -238,14 +93,10 @@ def _run(): retry(_run) def test_multi_turn(self, model): - """Multi-turn: agent maintains conversation context across turns. - - Mirrors app-templates/agent-langgraph-short-term-memory with MemorySaver - checkpointer and thread_id for session continuity. 
- """ + """Multi-turn: agent maintains conversation context across turns.""" def _run(): - llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + llm = ChatDatabricks(model=model, max_tokens=max_tokens_for_model(model)) agent = create_react_agent(llm, [add, multiply], checkpointer=MemorySaver()) config = {"configurable": {"thread_id": f"test-sync-multi-turn-{model}"}} @@ -262,13 +113,10 @@ def _run(): retry(_run) def test_streaming(self, model): - """Streaming: agent streams node updates and tool execution events. - - Mirrors the app-template @stream() handler pattern using agent.stream(). - """ + """Streaming: agent streams node updates and tool execution events.""" def _run(): - llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + llm = ChatDatabricks(model=model, max_tokens=max_tokens_for_model(model)) agent = create_react_agent(llm, [add, multiply]) events = list( @@ -311,17 +159,13 @@ def _run(): @pytest.mark.asyncio @pytest.mark.parametrize("model", _FOUNDATION_MODELS) class TestLangGraphAsync: - """Async LangGraph agent tests mirroring the app-templates @stream() handler. 
- - Each test follows the exact async pattern deployed in production: - ChatDatabricks -> create_react_agent -> agent.ainvoke / agent.astream - """ + """Async LangGraph agent tests using ChatDatabricks + create_react_agent.""" async def test_single_turn(self, model): """Single-turn via ainvoke.""" async def _run(): - llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + llm = ChatDatabricks(model=model, max_tokens=max_tokens_for_model(model)) agent = create_react_agent(llm, [add, multiply]) response = await agent.ainvoke( @@ -349,7 +193,7 @@ async def test_multi_turn(self, model): """Multi-turn via ainvoke with MemorySaver checkpointer.""" async def _run(): - llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + llm = ChatDatabricks(model=model, max_tokens=max_tokens_for_model(model)) agent = create_react_agent(llm, [add, multiply], checkpointer=MemorySaver()) config = {"configurable": {"thread_id": f"test-async-multi-turn-{model}"}} @@ -370,14 +214,10 @@ async def _run(): await async_retry(_run) async def test_streaming(self, model): - """Streaming via astream — mirrors the exact app-templates production path. - - Uses agent.astream(stream_mode=["updates", "messages"]) which is the - pattern in agent-langgraph and agent-langgraph-short-term-memory. 
- """ + """Streaming via astream with updates + messages stream modes.""" async def _run(): - llm = ChatDatabricks(model=model, max_tokens=_max_tokens(model)) + llm = ChatDatabricks(model=model, max_tokens=max_tokens_for_model(model)) agent = create_react_agent(llm, [add, multiply]) nodes_seen = set() diff --git a/integrations/openai/src/databricks_openai/utils/clients.py b/integrations/openai/src/databricks_openai/utils/clients.py index f62c8d4d..4788de26 100644 --- a/integrations/openai/src/databricks_openai/utils/clients.py +++ b/integrations/openai/src/databricks_openai/utils/clients.py @@ -337,7 +337,8 @@ def create(self, **kwargs): if _is_gemini_model(model): if kwargs.get("stream"): return _GeminiStreamWrapper(response) - _flatten_list_content_in_response(response) + # TODO: re-enable if non-streaming list content issues surface + # _flatten_list_content_in_response(response) return response @@ -491,7 +492,8 @@ async def create(self, **kwargs): if _is_gemini_model(model): if kwargs.get("stream"): return _AsyncGeminiStreamWrapper(response) - _flatten_list_content_in_response(response) + # TODO: re-enable if non-streaming list content issues surface + # _flatten_list_content_in_response(response) return response diff --git a/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py b/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py index 90b74808..2207239a 100644 --- a/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py +++ b/integrations/openai/tests/integration_tests/test_fmapi_tool_calling.py @@ -1,14 +1,5 @@ """ -End-to-end FMAPI tool calling tests mirroring app-templates CUJs. - -These tests replicate the exact user code patterns from app-templates -(agent-openai-agents-sdk) to verify that single-turn and multi-turn -conversations don't break. 
- -Naturally exercises regressions like: - - PR #269: Agents SDK adds strict:True -> our client strips it -> FMAPI - - PR #333: Multi-turn agent loop replays assistant messages with empty - content + tool_calls -> our client fixes content -> FMAPI +End-to-end FMAPI tool calling tests for DatabricksOpenAI (sync + async via Agents SDK). Prerequisites: - FMAPI endpoints must be available on the test workspace @@ -18,12 +9,16 @@ from __future__ import annotations import json -import logging import os import pytest from databricks.sdk import WorkspaceClient -from openai.types.chat import ChatCompletionToolParam +from databricks_ai_bridge.test_utils.fmapi import ( + COMMON_SKIP_MODELS, + async_retry, + discover_foundation_models, + retry, +) from databricks_openai import AsyncDatabricksOpenAI, DatabricksOpenAI @@ -32,130 +27,7 @@ reason="FMAPI tool calling tests disabled. Set RUN_FMAPI_TOOL_CALLING_TESTS=1 to enable.", ) -# Models that pass the tool calling probe but have known issues in agent/test flows. -# These are skipped entirely to keep CI green. When a new model is added to FMAPI, -# it will be discovered and tested automatically — add it here only if it fails. -_SKIP_MODELS = { - "databricks-gpt-5-nano", # too small for reliable tool calling - "databricks-gpt-oss-20b", # hallucinates tool names in agent loop - "databricks-gpt-oss-120b", # hallucinates tool names in agent loop - "databricks-llama-4-maverick", # hallucinates tool names in agent loop - "databricks-gemini-3-flash", # requires thought_signature on function calls - "databricks-gemini-3-pro", # requires thought_signature on function calls - "databricks-gemini-3-1-pro", # requires thought_signature on function calls -} - -# Max retries for flaky models (e.g. 
transient FMAPI errors, model non-determinism) -_MAX_RETRIES = 3 - -# Minimal tool definition used to probe whether a model supports tool calling -_PROBE_TOOL: ChatCompletionToolParam = { - "type": "function", - "function": { - "name": "probe", - "description": "probe", - "parameters": { - "type": "object", - "properties": {"x": {"type": "string"}}, - "required": ["x"], - }, - }, -} - - -def _supports_tool_calling(client: DatabricksOpenAI, model: str) -> bool: - """Send a minimal tool call request to check if the model supports tools.""" - try: - client.chat.completions.create( - model=model, - messages=[{"role": "user", "content": "call probe with x=test"}], - tools=[_PROBE_TOOL], - max_tokens=10, - ) - return True - except Exception: - return False - - -log = logging.getLogger(__name__) - - -def _discover_foundation_models() -> list: - """Discover all FMAPI chat models that support tool calling. - - 1. List all serving endpoints with databricks- prefix and llm/v1/chat task - 2. Probe each model with a minimal tool call to check if tools are supported - 3. 
Models in _SKIP_MODELS are excluded entirely - """ - try: - w = WorkspaceClient() - endpoints = list(w.serving_endpoints.list()) - except Exception as exc: - log.warning("Could not discover FMAPI models, using fallback list: %s", exc) - return _FALLBACK_MODELS - - # Filter to FMAPI chat endpoints - chat_endpoints = [ - e - for e in endpoints - if e.name and e.name.startswith("databricks-") and e.task == "llm/v1/chat" - ] - - # Probe each model to check if it accepts tool definitions - client = DatabricksOpenAI(workspace_client=w) - models = [] - for e in sorted(chat_endpoints, key=lambda e: e.name or ""): - name = e.name or "" - if not _supports_tool_calling(client, name): - log.info("Skipping %s: does not support tool calling", name) - continue - if name in _SKIP_MODELS: - log.info("Skipping %s: in skip list", name) - continue - models.append(name) - - log.info("Discovered %d FMAPI models with tool calling support", len(models)) - return models - - -# Fallback list if dynamic discovery fails (e.g. auth not configured at collection time) -_FALLBACK_MODELS = [ - "databricks-claude-sonnet-4-6", - "databricks-claude-opus-4-6", - "databricks-meta-llama-3-3-70b-instruct", - "databricks-gpt-5-2", - "databricks-gpt-5-1", - "databricks-qwen3-next-80b-a3b-instruct", -] - - -_FOUNDATION_MODELS = _discover_foundation_models() - - -def retry(fn, retries=_MAX_RETRIES): - """Retry a test function up to `retries` times. 
Only fails if all attempts fail.""" - last_exc = None - for attempt in range(retries): - try: - return fn() - except Exception as exc: - last_exc = exc - if attempt < retries - 1: - log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc) - raise last_exc # type: ignore[misc] - - -async def async_retry(fn, retries=_MAX_RETRIES): - """Retry an async test function up to `retries` times.""" - last_exc = None - for attempt in range(retries): - try: - return await fn() - except Exception as exc: - last_exc = exc - if attempt < retries - 1: - log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc) - raise last_exc # type: ignore[misc] +_FOUNDATION_MODELS = discover_foundation_models(COMMON_SKIP_MODELS) # MCP test infrastructure @@ -188,17 +60,13 @@ def async_client(workspace_client): @pytest.mark.asyncio @pytest.mark.parametrize("model", _FOUNDATION_MODELS) class TestAgentToolCalling: - """End-to-end agent tests mirroring app-templates/agent-openai-agents-sdk. + """Async agent tests using the OpenAI Agents SDK with MCP tools. - Each test follows the exact pattern users deploy: - AsyncDatabricksOpenAI -> set_default_openai_client -> McpServer -> Agent -> Runner.run + Pattern: AsyncDatabricksOpenAI -> McpServer -> Agent -> Runner.run """ async def test_single_turn(self, async_client, workspace_client, model): - """Single-turn conversation: user sends one message, agent calls a tool and responds. - - Mirrors the basic app-template @invoke() handler. - """ + """Single-turn: user sends one message, agent calls a tool and responds.""" from agents import Agent, Runner, set_default_openai_api, set_default_openai_client from agents.items import MessageOutputItem, ToolCallItem, ToolCallOutputItem @@ -237,16 +105,7 @@ async def _run(): await async_retry(_run) async def test_multi_turn(self, async_client, workspace_client, model): - """Multi-turn conversation: simulates a chat UI sending conversation history. 
- - First turn: user asks to echo a message, agent calls the tool. - Second turn: user sends a followup with the full conversation history - (including the assistant's prior tool-calling turn), agent calls the tool again. - - This is how the app-templates chat UI works: each request includes the - full conversation history. The second FMAPI call replays the assistant - message from the first turn, which may have empty content + tool_calls. - """ + """Multi-turn: two-turn conversation with full history replay.""" from agents import Agent, Runner, set_default_openai_api, set_default_openai_client from agents.items import ToolCallItem @@ -295,12 +154,7 @@ async def _run(): await async_retry(_run) async def test_streaming(self, async_client, workspace_client, model): - """Streaming conversation: mirrors the app-template @stream() handler. - - Uses Runner.run_streamed() which is the streaming path in app-templates. - Verifies that stream events arrive in the expected order and contain - the expected item types. - """ + """Streaming: verify stream events via Runner.run_streamed().""" from agents import Agent, Runner, set_default_openai_api, set_default_openai_client from agents.stream_events import RunItemStreamEvent @@ -355,7 +209,7 @@ async def _run(): # Sync DatabricksOpenAI — direct chat.completions.create() # ============================================================================= -# echo_message tool definition (mirrors the UC function signature) +# echo_message tool definition for direct chat.completions.create() tests _ECHO_MESSAGE_TOOL = { "type": "function", "function": { @@ -378,14 +232,7 @@ async def _run(): @pytest.mark.integration @pytest.mark.parametrize("model", _FOUNDATION_MODELS) class TestSyncClientToolCalling: - """Sync DatabricksOpenAI tests using direct client.chat.completions.create(). - - The Agents SDK requires an async client, so the sync DatabricksOpenAI CUJ - is direct chat.completions.create() calls with tool definitions. 
- - This exercises DatabricksCompletions.create() (the sync counterpart to - AsyncDatabricksCompletions.create() used by the Agents SDK). - """ + """Sync DatabricksOpenAI tests using direct chat.completions.create().""" def test_single_turn(self, sync_client, model): """Single-turn: model receives tool, produces a tool call.""" @@ -411,11 +258,7 @@ def _run(): retry(_run) def test_multi_turn(self, sync_client, model): - """Multi-turn: tool_call -> tool result -> text response. - - The second FMAPI call replays the assistant message (potentially with - empty content + tool_calls), exercising the PR #333 fix. - """ + """Multi-turn: tool_call -> tool result -> text response.""" def _run(): # Turn 1: get tool call diff --git a/src/databricks_ai_bridge/test_utils/fmapi.py b/src/databricks_ai_bridge/test_utils/fmapi.py new file mode 100644 index 00000000..e0b5343f --- /dev/null +++ b/src/databricks_ai_bridge/test_utils/fmapi.py @@ -0,0 +1,127 @@ +"""Shared test utilities for FMAPI tool calling integration tests. + +Used by both databricks-openai and databricks-langchain test suites. +""" + +from __future__ import annotations + +import logging +from typing import Any + +from databricks.sdk import WorkspaceClient + +log = logging.getLogger(__name__) + +# Max retries for flaky models (e.g. 
transient FMAPI errors, model non-determinism) +MAX_RETRIES = 3 + +# Default max_tokens for test requests +DEFAULT_MAX_TOKENS = 200 + +# Models shared across both OpenAI and LangChain skip lists +COMMON_SKIP_MODELS = { + "databricks-gpt-5-nano", # too small for reliable tool calling + "databricks-gpt-oss-20b", # hallucinates tool names in agent loop + "databricks-gpt-oss-120b", # hallucinates tool names in agent loop + "databricks-llama-4-maverick", # hallucinates tool names in agent loop + "databricks-gemini-3-flash", # requires thought_signature on function calls + "databricks-gemini-3-pro", # requires thought_signature on function calls + "databricks-gemini-3-1-pro", # requires thought_signature on function calls +} + +# Additional models skipped only in LangChain tests +LANGCHAIN_SKIP_MODELS = COMMON_SKIP_MODELS | { + "databricks-gemma-3-12b", # outputs raw tool call text instead of executing tools +} + +# Reasoning models consume reasoning tokens from the max_tokens budget. +# Gemini 2.5 Pro needs 200-600 reasoning tokens with 2 tools, so 200 is too small. +MODEL_MAX_TOKENS: dict[str, int] = { + "databricks-gemini-2-5-pro": 1000, +} + + +def max_tokens_for_model(model: str) -> int: + """Return appropriate max_tokens for a model, accounting for reasoning token overhead.""" + return MODEL_MAX_TOKENS.get(model, DEFAULT_MAX_TOKENS) + + +# Fallback list if dynamic discovery fails (e.g. 
# Fallback list if dynamic discovery fails (e.g. auth not configured at
# collection time).
FALLBACK_MODELS = [
    "databricks-claude-sonnet-4-6",
    "databricks-claude-opus-4-6",
    "databricks-meta-llama-3-3-70b-instruct",
    "databricks-gpt-5-2",
    "databricks-gpt-5-1",
    "databricks-qwen3-next-80b-a3b-instruct",
]


def has_function_calling(w: WorkspaceClient, endpoint_name: str) -> bool:
    """Check if an endpoint supports function calling via the capabilities API.

    Reads ``capabilities.function_calling`` from the serving-endpoints REST
    API. Any failure (auth, network, unknown endpoint, unexpected payload)
    is treated as "not supported" so discovery stays best-effort.
    """
    try:
        resp = w.api_client.do("GET", f"/api/2.0/serving-endpoints/{endpoint_name}")
        return resp.get("capabilities", {}).get("function_calling", False)
    except Exception:
        # Best-effort probe: a failed capabilities lookup means "skip", not "crash".
        return False


def discover_foundation_models(skip_models: set[str]) -> list[str]:
    """Discover all FMAPI chat models that support tool calling.

    1. List all serving endpoints with the ``databricks-`` prefix and the
       ``llm/v1/chat`` task.
    2. Drop anything in *skip_models* (checked first to avoid a per-endpoint
       HTTP capabilities probe for models we will not run anyway).
    3. Keep only endpoints whose ``capabilities.function_calling`` flag is set.

    Falls back to ``FALLBACK_MODELS`` when the workspace cannot be reached.
    """
    try:
        w = WorkspaceClient()
        endpoints = list(w.serving_endpoints.list())
    except Exception as exc:
        log.warning("Could not discover FMAPI models, using fallback list: %s", exc)
        return FALLBACK_MODELS

    chat_endpoints = [
        e
        for e in endpoints
        if e.name and e.name.startswith("databricks-") and e.task == "llm/v1/chat"
    ]

    models = []
    for e in sorted(chat_endpoints, key=lambda e: e.name or ""):
        name = e.name or ""
        if name in skip_models:
            log.info("Skipping %s: in skip list", name)
            continue
        if not has_function_calling(w, name):
            log.info("Skipping %s: does not support function calling", name)
            continue
        models.append(name)

    log.info("Discovered %d FMAPI models with function calling support", len(models))
    return models


def retry(fn, retries=None):
    """Retry a test function up to `retries` times. Only fails if all attempts fail.

    ``retries`` defaults to the module-level ``MAX_RETRIES``, resolved at call
    time (a def-time default would freeze the constant and ignore test
    monkeypatching).

    Raises:
        ValueError: if ``retries`` < 1. (Previously this path executed
        ``raise None`` — a confusing ``TypeError: exceptions must derive
        from BaseException``.)
    """
    if retries is None:
        retries = MAX_RETRIES
    if retries < 1:
        raise ValueError("retries must be >= 1")
    last_exc = None
    for attempt in range(retries):
        try:
            return fn()
        except Exception as exc:
            last_exc = exc
            if attempt < retries - 1:
                log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc)
    raise last_exc


async def async_retry(fn, retries=None):
    """Retry an async test function up to `retries` times.

    Same contract as :func:`retry`; ``fn`` is awaited on each attempt.

    Raises:
        ValueError: if ``retries`` < 1.
    """
    if retries is None:
        retries = MAX_RETRIES
    if retries < 1:
        raise ValueError("retries must be >= 1")
    last_exc = None
    for attempt in range(retries):
        try:
            return await fn()
        except Exception as exc:
            last_exc = exc
            if attempt < retries - 1:
                log.warning("Attempt %d/%d failed: %s — retrying", attempt + 1, retries, exc)
    raise last_exc
def _is_gemini_model(model: str | None) -> bool:
    """Returns True if the model is a Gemini (or Gemma) variant."""
    if not model:
        return False
    lowered = model.lower()
    return "gemini" in lowered or "gemma" in lowered


def _fix_gemini_messages(messages: Any) -> None:
    """Flatten list content in outbound tool messages for Gemini.

    Gemini FMAPI rejects tool messages whose ``content`` is a list of
    content blocks; this joins the text of each block into one plain
    string, mutating the message dicts in place.
    """
    if not messages:
        return
    for entry in messages:
        if not isinstance(entry, dict) or entry.get("role") != "tool":
            continue
        blocks = entry.get("content")
        if not isinstance(blocks, list):
            continue
        pieces = []
        for part in blocks:
            if isinstance(part, dict):
                pieces.append(part.get("text", ""))
            else:
                # Content-block objects expose .text; bare strings pass through.
                pieces.append(getattr(part, "text", part))
        entry["content"] = "".join(pieces)
def _fix_gemini_content(response: Any) -> None:
    """Flatten list content in response messages/deltas for Gemini.

    Gemini FMAPI can return assistant content as a list of content blocks;
    the Agents SDK expects a plain string and fails validation otherwise.
    Mutates the response in place; objects without ``choices`` are ignored.
    """
    if not hasattr(response, "choices"):
        return
    for choice in response.choices:
        # Non-streaming choices carry `message`; streaming chunks carry `delta`.
        target = getattr(choice, "message", None) or getattr(choice, "delta", None)
        if target is None:
            continue
        content = getattr(target, "content", None)
        if isinstance(content, list):
            target.content = "".join(
                part.get("text", "") if isinstance(part, dict) else getattr(part, "text", part)
                for part in content
            )


class _GeminiStreamWrapper:
    """Wraps a sync Stream, flattening list content in each chunk.

    Special methods are implemented explicitly: Python resolves dunders on
    the *type*, bypassing ``__getattr__``, so with only the fallback,
    ``next(stream)`` and ``with stream:`` would raise TypeError even though
    the wrapped Stream supports both.
    """

    def __init__(self, stream: Any):
        self._stream = stream

    def __iter__(self) -> Iterator:
        for chunk in self._stream:
            _fix_gemini_content(chunk)
            yield chunk

    def __next__(self):
        chunk = self._stream.__next__()
        _fix_gemini_content(chunk)
        return chunk

    def __enter__(self):
        self._stream.__enter__()
        return self

    def __exit__(self, *args):
        return self._stream.__exit__(*args)

    def __getattr__(self, name: str):
        # Everything else (close, response, ...) delegates to the wrapped stream.
        return getattr(self._stream, name)


class _AsyncGeminiStreamWrapper:
    """Wraps an AsyncStream, flattening list content in each chunk.

    Same explicit-dunder rationale as _GeminiStreamWrapper: ``async with``
    and ``anext()`` look methods up on the type and bypass ``__getattr__``.
    """

    def __init__(self, stream: Any):
        self._stream = stream

    async def __aiter__(self) -> AsyncIterator:
        async for chunk in self._stream:
            _fix_gemini_content(chunk)
            yield chunk

    async def __anext__(self):
        chunk = await self._stream.__anext__()
        _fix_gemini_content(chunk)
        return chunk

    async def __aenter__(self):
        await self._stream.__aenter__()
        return self

    async def __aexit__(self, *args):
        return await self._stream.__aexit__(*args)

    def __getattr__(self, name: str):
        return getattr(self._stream, name)
def has_function_calling(w: WorkspaceClient, endpoint_name: str) -> bool:
    """Check if an endpoint supports function calling via the capabilities API."""
    try:
        # The serving-endpoints API reports capabilities per endpoint;
        # any failure (auth, network, payload shape) counts as "unsupported".
        info = w.api_client.do("GET", f"/api/2.0/serving-endpoints/{endpoint_name}")
        capabilities = info.get("capabilities", {})  # type: ignore[invalid-assignment]
        return capabilities.get("function_calling", False)
    except Exception:
        return False
We flatten both. -# Note: Gemini 3.x requires thought_signature which is an Agents SDK issue, not fixable here. - - -def _is_gemini_model(model: str | None) -> bool: - """Returns True if the model is a Gemini variant.""" - if not model: - return False - return "gemini" in model.lower() or "gemma" in model.lower() - - -def _fix_gemini_messages(messages: Any) -> None: - """Flatten list content in outbound tool messages for Gemini.""" - if not messages: - return - for msg in messages: - if ( - isinstance(msg, dict) - and msg.get("role") == "tool" - and isinstance(msg.get("content"), list) - ): - msg["content"] = "".join( - p.get("text", "") if isinstance(p, dict) else getattr(p, "text", p) - for p in msg["content"] - ) - - -def _fix_gemini_content(response: Any) -> None: - """Flatten list content in response messages/deltas for Gemini.""" - if not hasattr(response, "choices"): - return - for choice in response.choices: - obj = getattr(choice, "message", None) or getattr(choice, "delta", None) - if obj is not None and isinstance(getattr(obj, "content", None), list): - obj.content = "".join( - p.get("text", "") if isinstance(p, dict) else getattr(p, "text", p) - for p in obj.content - ) - - -class _GeminiStreamWrapper: - """Wraps a sync Stream, flattening list content in each chunk.""" - - def __init__(self, stream: Any): - self._stream = stream - - def __iter__(self) -> Iterator: - for chunk in self._stream: - _fix_gemini_content(chunk) - yield chunk - - def __getattr__(self, name: str): - return getattr(self._stream, name) - - -class _AsyncGeminiStreamWrapper: - """Wraps an AsyncStream, flattening list content in each chunk.""" - - def __init__(self, stream: Any): - self._stream = stream - - async def __aiter__(self) -> AsyncIterator: - async for chunk in self._stream: - _fix_gemini_content(chunk) - yield chunk - - def __getattr__(self, name: str): - return getattr(self._stream, name) - - def _is_claude_model(model: str | None) -> bool: """Returns True if the model is 
a Claude variant.""" if not model: @@ -271,14 +189,7 @@ def create(self, **kwargs): _strip_strict_from_tools(kwargs.get("tools")) if _is_claude_model(model): _fix_empty_assistant_content_in_messages(kwargs.get("messages")) - if _is_gemini_model(model): - _fix_gemini_messages(kwargs.get("messages")) - response = super().create(**kwargs) - if _is_gemini_model(model): - if kwargs.get("stream"): - return _GeminiStreamWrapper(response) - _fix_gemini_content(response) - return response + return super().create(**kwargs) class DatabricksChat(Chat): @@ -303,7 +214,7 @@ def _get_app_client(self, app_name: str) -> OpenAI: # Authentication is handled via http_client, not api_key self._app_clients_cache[app_name] = OpenAI( base_url=app_url, - api_key=_get_openai_api_key(), + api_key="no-token", http_client=_get_authorized_http_client(self._workspace_client), ) return self._app_clients_cache[app_name] @@ -392,7 +303,7 @@ def __init__( # Authentication is handled via http_client, not api_key super().__init__( base_url=target_base_url, - api_key=_get_openai_api_key(), + api_key="no-token", http_client=_get_authorized_http_client(workspace_client), ) @@ -425,14 +336,7 @@ async def create(self, **kwargs): _strip_strict_from_tools(kwargs.get("tools")) if _is_claude_model(model): _fix_empty_assistant_content_in_messages(kwargs.get("messages")) - if _is_gemini_model(model): - _fix_gemini_messages(kwargs.get("messages")) - response = await super().create(**kwargs) - if _is_gemini_model(model): - if kwargs.get("stream"): - return _AsyncGeminiStreamWrapper(response) - _fix_gemini_content(response) - return response + return await super().create(**kwargs) class AsyncDatabricksChat(AsyncChat): @@ -457,7 +361,7 @@ def _get_app_client(self, app_name: str) -> AsyncOpenAI: # Authentication is handled via http_client, not api_key self._app_clients_cache[app_name] = AsyncOpenAI( base_url=app_url, - api_key=_get_openai_api_key(), + api_key="no-token", 
http_client=_get_authorized_async_http_client(self._workspace_client), ) return self._app_clients_cache[app_name] @@ -546,7 +450,7 @@ def __init__( # Authentication is handled via http_client, not api_key super().__init__( base_url=target_base_url, - api_key=_get_openai_api_key(), + api_key="no-token", http_client=_get_authorized_async_http_client(workspace_client), ) diff --git a/src/databricks_ai_bridge/test_utils/fmapi.py b/src/databricks_ai_bridge/test_utils/fmapi.py index 55c3c235..cca1e513 100644 --- a/src/databricks_ai_bridge/test_utils/fmapi.py +++ b/src/databricks_ai_bridge/test_utils/fmapi.py @@ -26,6 +26,8 @@ "databricks-gemini-3-flash", # requires thought_signature on function calls "databricks-gemini-3-pro", # requires thought_signature on function calls "databricks-gemini-3-1-pro", # requires thought_signature on function calls + "databricks-gemini-2-5-pro", # returns list content that breaks Agents SDK parsing + "databricks-gemini-2-5-flash", # returns list content that breaks Agents SDK parsing "databricks-gpt-5-1-codex-max", # Responses API only, no Chat Completions support "databricks-gpt-5-1-codex-mini", # Responses API only, no Chat Completions support "databricks-gpt-5-2-codex", # Responses API only, no Chat Completions support From 7344fcb4c0a88e02862666360e47a9aeda4275a3 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 3 Mar 2026 21:04:48 -0800 Subject: [PATCH 8/9] Revert test_clients.py to main (removed _get_openai_api_key import) --- .../openai/tests/unit_tests/test_clients.py | 42 +------------------ 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/integrations/openai/tests/unit_tests/test_clients.py b/integrations/openai/tests/unit_tests/test_clients.py index add39671..6d874447 100644 --- a/integrations/openai/tests/unit_tests/test_clients.py +++ b/integrations/openai/tests/unit_tests/test_clients.py @@ -1,4 +1,3 @@ -import os from typing import Any, cast from unittest.mock import AsyncMock, MagicMock, patch @@ -16,7 
+15,6 @@ _get_app_url, _get_authorized_async_http_client, _get_authorized_http_client, - _get_openai_api_key, _should_strip_strict, _strip_strict_from_tools, _validate_oauth_for_apps, @@ -68,11 +66,7 @@ class TestDatabricksOpenAI: def test_init_with_default_workspace_client(self): """Test initialization with default WorkspaceClient.""" - env = {k: v for k, v in os.environ.items() if k != "OPENAI_API_KEY"} - with ( - patch.dict("os.environ", env, clear=True), - patch("databricks_openai.utils.clients.WorkspaceClient") as mock_ws_client_class, - ): + with patch("databricks_openai.utils.clients.WorkspaceClient") as mock_ws_client_class: mock_client = MagicMock(spec=WorkspaceClient) mock_client.config.host = "https://default.databricks.com" mock_client.config.authenticate.return_value = {"Authorization": "Bearer default-token"} @@ -89,19 +83,6 @@ def test_init_with_default_workspace_client(self): assert "default.databricks.com" in str(client.base_url) assert client.api_key == "no-token" - def test_init_uses_openai_api_key_env_var(self): - with ( - patch.dict("os.environ", {"OPENAI_API_KEY": "sk-from-env"}), - patch("databricks_openai.utils.clients.WorkspaceClient") as mock_ws_client_class, - ): - mock_client = MagicMock(spec=WorkspaceClient) - mock_client.config.host = "https://default.databricks.com" - mock_client.config.authenticate.return_value = {"Authorization": "Bearer token"} - mock_ws_client_class.return_value = mock_client - - client = DatabricksOpenAI() - assert client.api_key == "sk-from-env" - def test_bearer_auth_flow(self, mock_workspace_client): """Test that BearerAuth correctly adds Authorization header.""" @@ -128,11 +109,7 @@ class TestAsyncDatabricksOpenAI: def test_init_with_default_workspace_client(self): """Test initialization with default WorkspaceClient.""" - env = {k: v for k, v in os.environ.items() if k != "OPENAI_API_KEY"} - with ( - patch.dict("os.environ", env, clear=True), - patch("databricks_openai.utils.clients.WorkspaceClient") as 
def _get_openai_api_key():
    """Return OPENAI_API_KEY from env if set, otherwise 'no-token'.

    Passed through to the OpenAI client so that the agents SDK tracing
    client can authenticate with the OpenAI API.
    """
    key = os.environ.get("OPENAI_API_KEY")
    # An unset or empty variable both fall back to the placeholder token.
    return key if key else "no-token"
_get_authorized_async_http_client, _get_authorized_http_client, + _get_openai_api_key, _should_strip_strict, _strip_strict_from_tools, _validate_oauth_for_apps, @@ -66,7 +68,11 @@ class TestDatabricksOpenAI: def test_init_with_default_workspace_client(self): """Test initialization with default WorkspaceClient.""" - with patch("databricks_openai.utils.clients.WorkspaceClient") as mock_ws_client_class: + env = {k: v for k, v in os.environ.items() if k != "OPENAI_API_KEY"} + with ( + patch.dict("os.environ", env, clear=True), + patch("databricks_openai.utils.clients.WorkspaceClient") as mock_ws_client_class, + ): mock_client = MagicMock(spec=WorkspaceClient) mock_client.config.host = "https://default.databricks.com" mock_client.config.authenticate.return_value = {"Authorization": "Bearer default-token"} @@ -83,6 +89,19 @@ def test_init_with_default_workspace_client(self): assert "default.databricks.com" in str(client.base_url) assert client.api_key == "no-token" + def test_init_uses_openai_api_key_env_var(self): + with ( + patch.dict("os.environ", {"OPENAI_API_KEY": "sk-from-env"}), + patch("databricks_openai.utils.clients.WorkspaceClient") as mock_ws_client_class, + ): + mock_client = MagicMock(spec=WorkspaceClient) + mock_client.config.host = "https://default.databricks.com" + mock_client.config.authenticate.return_value = {"Authorization": "Bearer token"} + mock_ws_client_class.return_value = mock_client + + client = DatabricksOpenAI() + assert client.api_key == "sk-from-env" + def test_bearer_auth_flow(self, mock_workspace_client): """Test that BearerAuth correctly adds Authorization header.""" @@ -109,7 +128,11 @@ class TestAsyncDatabricksOpenAI: def test_init_with_default_workspace_client(self): """Test initialization with default WorkspaceClient.""" - with patch("databricks_openai.utils.clients.WorkspaceClient") as mock_ws_client_class: + env = {k: v for k, v in os.environ.items() if k != "OPENAI_API_KEY"} + with ( + patch.dict("os.environ", env, clear=True), + 
class TestOpenAIApiKey:
    """Unit tests for the _get_openai_api_key environment fallback."""

    def test_uses_env_var_when_set(self):
        with patch.dict("os.environ", {"OPENAI_API_KEY": "sk-test-key"}):
            assert _get_openai_api_key() == "sk-test-key"

    def test_falls_back_to_no_token_when_unset(self):
        # Drop only OPENAI_API_KEY, keeping the rest of the environment intact.
        scrubbed = {key: val for key, val in os.environ.items() if key != "OPENAI_API_KEY"}
        with patch.dict("os.environ", scrubbed, clear=True):
            assert _get_openai_api_key() == "no-token"

    def test_falls_back_to_no_token_when_empty_string(self):
        with patch.dict("os.environ", {"OPENAI_API_KEY": ""}):
            assert _get_openai_api_key() == "no-token"