From bcb0a114d629248bc1189540d403076ba650a7c1 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 10 Mar 2026 00:29:27 +0000 Subject: [PATCH 1/5] Add gpt-5.4 to resolve_model_config.py --- .github/run-eval/resolve_model_config.py | 5 +++++ .../openhands/sdk/llm/utils/model_prompt_spec.py | 2 +- tests/github_workflows/test_resolve_model_config.py | 9 +++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index c21ecc8888..dba4d416ca 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -140,6 +140,11 @@ "reasoning_effort": "high", }, }, + "gpt-5.4": { + "id": "gpt-5.4", + "display_name": "GPT-5.4", + "llm_config": {"model": "litellm_proxy/openai/gpt-5.4"}, + }, "minimax-m2": { "id": "minimax-m2", "display_name": "MiniMax M2", diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py b/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py index da226703a3..a3fe71fffe 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py @@ -40,7 +40,7 @@ class ModelPromptSpec(BaseModel): "gpt-5-codex", ("gpt-5-codex", "gpt-5.1-codex", "gpt-5.2-codex", "gpt-5.3-codex"), ), - ("gpt-5", ("gpt-5", "gpt-5.1", "gpt-5.2")), + ("gpt-5", ("gpt-5", "gpt-5.1", "gpt-5.2", "gpt-5.4")), ), } diff --git a/tests/github_workflows/test_resolve_model_config.py b/tests/github_workflows/test_resolve_model_config.py index 2ed496df68..d1e756a98a 100644 --- a/tests/github_workflows/test_resolve_model_config.py +++ b/tests/github_workflows/test_resolve_model_config.py @@ -491,3 +491,12 @@ def test_models_importable_without_litellm(): f"stderr: {result.stderr}" ) assert "SUCCESS" in result.stdout + +def test_gpt_5_4_config(): + """Test that gpt-5.4 has correct configuration.""" + model = MODELS["gpt-5.4"] + + assert model["id"] == "gpt-5.4" + assert 
model["display_name"] == "GPT-5.4" + assert model["llm_config"]["model"] == "litellm_proxy/openai/gpt-5.4" + From 3cc370f1fd615ef42db2a15e7a68474537ce067c Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 10 Mar 2026 00:35:22 +0000 Subject: [PATCH 2/5] Fix formatting --- tests/github_workflows/test_resolve_model_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/github_workflows/test_resolve_model_config.py b/tests/github_workflows/test_resolve_model_config.py index d1e756a98a..55f66c3505 100644 --- a/tests/github_workflows/test_resolve_model_config.py +++ b/tests/github_workflows/test_resolve_model_config.py @@ -492,6 +492,7 @@ def test_models_importable_without_litellm(): ) assert "SUCCESS" in result.stdout + def test_gpt_5_4_config(): """Test that gpt-5.4 has correct configuration.""" model = MODELS["gpt-5.4"] @@ -499,4 +500,3 @@ def test_gpt_5_4_config(): assert model["id"] == "gpt-5.4" assert model["display_name"] == "GPT-5.4" assert model["llm_config"]["model"] == "litellm_proxy/openai/gpt-5.4" - From 897362cc011a0cfdd5244530b390acaf58230f48 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 10 Mar 2026 21:53:46 +0000 Subject: [PATCH 3/5] Treat gpt-5.4 as reasoning model Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 5 ++++- tests/github_workflows/test_resolve_model_config.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index dba4d416ca..0705f9ccb6 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -143,7 +143,10 @@ "gpt-5.4": { "id": "gpt-5.4", "display_name": "GPT-5.4", - "llm_config": {"model": "litellm_proxy/openai/gpt-5.4"}, + "llm_config": { + "model": "litellm_proxy/openai/gpt-5.4", + "reasoning_effort": "high", + }, }, "minimax-m2": { "id": "minimax-m2", diff --git a/tests/github_workflows/test_resolve_model_config.py 
b/tests/github_workflows/test_resolve_model_config.py index 55f66c3505..7e4c2ce145 100644 --- a/tests/github_workflows/test_resolve_model_config.py +++ b/tests/github_workflows/test_resolve_model_config.py @@ -500,3 +500,4 @@ def test_gpt_5_4_config(): assert model["id"] == "gpt-5.4" assert model["display_name"] == "GPT-5.4" assert model["llm_config"]["model"] == "litellm_proxy/openai/gpt-5.4" + assert model["llm_config"]["reasoning_effort"] == "high" From 91f248877b801cfd50e85c3b9c722716226111d7 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 10 Mar 2026 22:59:56 +0000 Subject: [PATCH 4/5] Add gpt-5.4 to reasoning effort model patterns Co-authored-by: openhands --- openhands-sdk/openhands/sdk/llm/utils/model_features.py | 1 + tests/sdk/llm/test_model_features.py | 1 + 2 files changed, 2 insertions(+) diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index 2b1fec563f..6ca4b1badd 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -69,6 +69,7 @@ class ModelFeatures: "gemini-3.1-pro-preview", # OpenAI GPT-5 family (includes mini variants) "gpt-5", + "gpt-5.4", # Anthropic Opus 4.5 and 4.6 "claude-opus-4-5", "claude-opus-4-6", diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py index 9032c8dd39..13c5f8e668 100644 --- a/tests/sdk/llm/test_model_features.py +++ b/tests/sdk/llm/test_model_features.py @@ -39,6 +39,7 @@ def test_model_matches(name, pattern, expected): # GPT-5 family ("gpt-5.2", True), ("gpt-5.2-codex", True), + ("gpt-5.4", True), ("gpt-4o", False), ("claude-3-5-sonnet", False), ("gemini-1.5-pro", False), From 71b1dfadb068c39f701e8d6c2d1035067818784c Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 11 Mar 2026 15:57:24 +0000 Subject: [PATCH 5/5] Fix c01_thinking_block_condenser to skip models without extended thinking The test was failing for GPT-5.4 because 
it only checks for thinking_blocks (Anthropic-specific) but did not skip models configured with reasoning_effort (such as OpenAI models), which produce reasoning items instead. Changes: - Import get_features to check model capabilities - Update skip logic to only run for models with extended_thinking support - Update docstrings to clarify test scope Co-authored-by: openhands --- .../tests/c01_thinking_block_condenser.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/tests/integration/tests/c01_thinking_block_condenser.py b/tests/integration/tests/c01_thinking_block_condenser.py index 7d806ee2f9..7cef276893 100644 --- a/tests/integration/tests/c01_thinking_block_condenser.py +++ b/tests/integration/tests/c01_thinking_block_condenser.py @@ -1,9 +1,13 @@ """ Integration test for thinking block handling during condensation. -This test validates that Claude Opus's thinking blocks are properly handled +This test validates that Anthropic Claude's thinking blocks are properly handled during conversation condensation, preventing malformed signature errors that can occur when thinking blocks are included in conversation history. + +Note: This test only applies to models that support extended_thinking (Anthropic +Claude models). Models with reasoning_effort (like OpenAI o-series and GPT-5.x) +produce reasoning items instead of thinking blocks, and are skipped. 
""" from openhands.sdk import LLM, Message, TextContent, Tool @@ -11,6 +15,7 @@ from openhands.sdk.context.view import View from openhands.sdk.conversation.impl.local_conversation import LocalConversation from openhands.sdk.event import ActionEvent, Condensation +from openhands.sdk.llm.utils.model_features import get_features from openhands.sdk.tool import register_tool from openhands.tools.terminal import TerminalTool from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult @@ -135,16 +140,16 @@ def setup(self) -> None: """ Validate that the model supports extended thinking. - Thinking blocks are primarily supported by: - - Anthropic Claude models (extended_thinking) - - Some Gemini models (extended_thinking) - - Some other models (reasoning_effort) + Thinking blocks are specifically supported by Anthropic Claude models + with extended_thinking enabled. Models that only support reasoning_effort + (like OpenAI o-series and GPT-5.x) produce reasoning items instead of + thinking blocks, so they should be skipped. """ model = self.llm_config.get("model", "") + features = get_features(model) - # Check if model has extended thinking or reasoning effort configured + # Check if model has extended thinking configured has_extended_thinking = self.llm_config.get("extended_thinking", False) - has_reasoning_effort = "reasoning_effort" in self.llm_config # For Claude Opus, automatically enable extended thinking if not set if "opus" in model.lower() and not has_extended_thinking: @@ -154,11 +159,15 @@ def setup(self) -> None: **{**self.llm.model_dump(), **self.llm_config} ) self.agent.llm = self.llm + has_extended_thinking = True - # Skip test if model doesn't support thinking blocks - if not has_extended_thinking and not has_reasoning_effort: + # Skip test if model doesn't support extended thinking (which produces + # thinking_blocks). Models that only support reasoning_effort produce + # responses_reasoning_item instead, which is a different mechanism. 
+ if not has_extended_thinking and not features.supports_extended_thinking: raise SkipTest( - f"Model {model} does not support extended thinking or reasoning effort" + f"Model {model} does not support extended thinking " + "(produces reasoning items instead of thinking blocks)" ) def conversation_callback(self, event):