From 0e0fa63ae617814fc08569f97bbd66e3e3368379 Mon Sep 17 00:00:00 2001
From: Sureel Bhurat
Date: Sun, 23 Nov 2025 09:21:37 -0500
Subject: [PATCH 1/3] Fix: Map finish_reason for LiteLLM streaming responses

Fixes #3665

Streaming responses from LiteLLM models (Claude, GPT, etc.) were not setting
finish_reason on aggregated LlmResponse objects, so agent runners could not
properly recognize completion states.

This fix mirrors the finish_reason mapping logic from the non-streaming path
(lines 776-784) and applies it to both streaming code paths:
- Tool call responses (lines 1340-1368)
- Text-only responses (lines 1369-1390)

Without this fix, agents using Claude or GPT via LiteLLM would encounter stop
conditions that couldn't be properly handled, leading to incomplete responses
or unexpected agent behavior.

Tested with Claude Sonnet 4.5 and GPT-5 via Azure OpenAI in a production
multi-agent system with MCP tools.
---
 src/google/adk/models/lite_llm.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/google/adk/models/lite_llm.py b/src/google/adk/models/lite_llm.py
index 140473982f..92a9de4024 100644
--- a/src/google/adk/models/lite_llm.py
+++ b/src/google/adk/models/lite_llm.py
@@ -1655,6 +1655,20 @@ async def generate_content_async(
               else None,
           )
       )
+      # FIX: Map finish_reason to FinishReason enum for streaming responses.
+      # Previously, streaming responses did not set finish_reason on aggregated
+      # LlmResponse objects, causing the ADK agent runner to not properly recognize
+      # completion states. This mirrors the logic from non-streaming path (lines 776-784)
+      # to ensure consistent behavior across both streaming and non-streaming modes.
+      # Without this, Claude and other models via LiteLLM would hit stop conditions
+      # that the agent couldn't properly handle.
+      if isinstance(finish_reason, types.FinishReason):
+        aggregated_llm_response_with_tool_call.finish_reason = finish_reason
+      else:
+        finish_reason_str = str(finish_reason).lower()
+        aggregated_llm_response_with_tool_call.finish_reason = _FINISH_REASON_MAPPING.get(
+            finish_reason_str, types.FinishReason.OTHER
+        )
       text = ""
       reasoning_parts = []
       function_calls.clear()
@@ -1669,6 +1683,20 @@ async def generate_content_async(
               if reasoning_parts
               else None,
           )
+      # FIX: Map finish_reason to FinishReason enum for streaming text-only responses.
+      # Previously, streaming responses did not set finish_reason on aggregated
+      # LlmResponse objects, causing the ADK agent runner to not properly recognize
+      # completion states. This mirrors the logic from non-streaming path (lines 776-784)
+      # to ensure consistent behavior across both streaming and non-streaming modes.
+      # Without this, Claude and other models via LiteLLM would hit stop conditions
+      # that the agent couldn't properly handle.
+      if isinstance(finish_reason, types.FinishReason):
+        aggregated_llm_response.finish_reason = finish_reason
+      else:
+        finish_reason_str = str(finish_reason).lower()
+        aggregated_llm_response.finish_reason = _FINISH_REASON_MAPPING.get(
+            finish_reason_str, types.FinishReason.OTHER
+        )
       text = ""
       reasoning_parts = []

From 22a627b95be5fa74fd43073c3075d7193e7e5aac Mon Sep 17 00:00:00 2001
From: Sureel Bhurat
Date: Mon, 24 Nov 2025 16:22:27 -0500
Subject: [PATCH 2/3] fix: Set content=None for tool-only messages in
 streaming to avoid duplication

- Fixes content duplication where planning/reasoning text appears twice
  (once during streaming, again in the aggregated tool-call message)
- Aligns with OpenAI/LiteLLM conventions for tool-call messages
- Planning text is preserved in thought_parts and already streamed to users
- Resolves semantic confusion where tool-call messages contained text content

Fixes #3697
---
 src/google/adk/models/lite_llm.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/models/lite_llm.py b/src/google/adk/models/lite_llm.py
index 92a9de4024..0cbff47c2f 100644
--- a/src/google/adk/models/lite_llm.py
+++ b/src/google/adk/models/lite_llm.py
@@ -1646,7 +1646,13 @@ async def generate_content_async(
           _message_to_generate_content_response(
               ChatCompletionAssistantMessage(
                   role="assistant",
-                  content=text,
+                  # FIX: Set content=None for tool-only messages to avoid duplication
+                  # and follow OpenAI/LiteLLM conventions. Planning/reasoning text is
+                  # already streamed (lines 1288-1296) and preserved in thought_parts
+                  # (line 1357). Including it again in content causes duplication and
+                  # violates API specifications for tool-call messages.
+                  # See: https://github.com/google/adk-python/issues/3697
+                  content=None,
                   tool_calls=tool_calls,
               ),
               model_version=part.model,

From d2640149239303db590f0e21929129866699f4b3 Mon Sep 17 00:00:00 2001
From: Sureel Bhurat
Date: Tue, 25 Nov 2025 07:41:03 -0500
Subject: [PATCH 3/3] refactor: Extract _set_finish_reason helper to eliminate
 code duplication

Extract the finish_reason mapping logic into a reusable helper function to
address code duplication feedback from the Gemini Code Assist review on PR
#3698.

Changes:
- Added _set_finish_reason(response, finish_reason) helper function
- Replaced three duplicate mapping blocks with a single helper call:
  * Non-streaming path (line ~880)
  * Streaming tool-call path (line ~1387)
  * Streaming text-only path (line ~1409)
- Preserved all existing comments and behavior
- Improved maintainability: single source of truth for the mapping logic

Addresses: https://github.com/google/adk-python/pull/3698#discussion_r18xxxxx
---
 src/google/adk/models/lite_llm.py | 43 ++++++++++++++++---------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/google/adk/models/lite_llm.py b/src/google/adk/models/lite_llm.py
index 0cbff47c2f..a0d7ca8a54 100644
--- a/src/google/adk/models/lite_llm.py
+++ b/src/google/adk/models/lite_llm.py
@@ -176,6 +176,25 @@ def _infer_mime_type_from_uri(uri: str) -> Optional[str]:
   return None
 
 
+def _set_finish_reason(
+    response: types.LlmResponse, finish_reason: Any
+) -> None:
+  """Sets the finish reason on the LlmResponse, mapping from string if necessary.
+
+  Args:
+    response: The LlmResponse object to update.
+    finish_reason: The finish reason value, either a FinishReason enum or a
+      string that needs to be mapped.
+ """ + if isinstance(finish_reason, types.FinishReason): + response.finish_reason = finish_reason + else: + finish_reason_str = str(finish_reason).lower() + response.finish_reason = _FINISH_REASON_MAPPING.get( + finish_reason_str, types.FinishReason.OTHER + ) + + def _decode_inline_text_data(raw_bytes: bytes) -> str: """Decodes inline file bytes that represent textual content.""" try: @@ -1081,13 +1100,7 @@ def _model_response_to_generate_content_response( if finish_reason: # If LiteLLM already provides a FinishReason enum (e.g., for Gemini), use # it directly. Otherwise, map the finish_reason string to the enum. - if isinstance(finish_reason, types.FinishReason): - llm_response.finish_reason = finish_reason - else: - finish_reason_str = str(finish_reason).lower() - llm_response.finish_reason = _FINISH_REASON_MAPPING.get( - finish_reason_str, types.FinishReason.OTHER - ) + _set_finish_reason(llm_response, finish_reason) if response.get("usage", None): llm_response.usage_metadata = types.GenerateContentResponseUsageMetadata( prompt_token_count=response["usage"].get("prompt_tokens", 0), @@ -1668,13 +1681,7 @@ async def generate_content_async( # to ensure consistent behavior across both streaming and non-streaming modes. # Without this, Claude and other models via LiteLLM would hit stop conditions # that the agent couldn't properly handle. - if isinstance(finish_reason, types.FinishReason): - aggregated_llm_response_with_tool_call.finish_reason = finish_reason - else: - finish_reason_str = str(finish_reason).lower() - aggregated_llm_response_with_tool_call.finish_reason = _FINISH_REASON_MAPPING.get( - finish_reason_str, types.FinishReason.OTHER - ) + _set_finish_reason(aggregated_llm_response_with_tool_call, finish_reason) text = "" reasoning_parts = [] function_calls.clear() @@ -1696,13 +1703,7 @@ async def generate_content_async( # to ensure consistent behavior across both streaming and non-streaming modes. # Without this, Claude and other models via LiteLLM would hit stop conditions # that the agent couldn't properly handle. - if isinstance(finish_reason, types.FinishReason): - aggregated_llm_response.finish_reason = finish_reason - else: - finish_reason_str = str(finish_reason).lower() - aggregated_llm_response.finish_reason = _FINISH_REASON_MAPPING.get( - finish_reason_str, types.FinishReason.OTHER - ) + _set_finish_reason(aggregated_llm_response, finish_reason) text = "" reasoning_parts = []