From bcb0a114d629248bc1189540d403076ba650a7c1 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 10 Mar 2026 00:29:27 +0000
Subject: [PATCH 1/5] Add gpt-5.4 to resolve_model_config.py

---
 .github/run-eval/resolve_model_config.py                 | 5 +++++
 .../openhands/sdk/llm/utils/model_prompt_spec.py         | 2 +-
 tests/github_workflows/test_resolve_model_config.py      | 9 +++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py
index c21ecc8888..dba4d416ca 100755
--- a/.github/run-eval/resolve_model_config.py
+++ b/.github/run-eval/resolve_model_config.py
@@ -140,6 +140,11 @@
             "reasoning_effort": "high",
         },
     },
+    "gpt-5.4": {
+        "id": "gpt-5.4",
+        "display_name": "GPT-5.4",
+        "llm_config": {"model": "litellm_proxy/openai/gpt-5.4"},
+    },
     "minimax-m2": {
         "id": "minimax-m2",
         "display_name": "MiniMax M2",
diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py b/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py
index da226703a3..a3fe71fffe 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py
@@ -40,7 +40,7 @@ class ModelPromptSpec(BaseModel):
             "gpt-5-codex",
             ("gpt-5-codex", "gpt-5.1-codex", "gpt-5.2-codex", "gpt-5.3-codex"),
         ),
-        ("gpt-5", ("gpt-5", "gpt-5.1", "gpt-5.2")),
+        ("gpt-5", ("gpt-5", "gpt-5.1", "gpt-5.2", "gpt-5.4")),
     ),
 }
 
diff --git a/tests/github_workflows/test_resolve_model_config.py b/tests/github_workflows/test_resolve_model_config.py
index 2ed496df68..d1e756a98a 100644
--- a/tests/github_workflows/test_resolve_model_config.py
+++ b/tests/github_workflows/test_resolve_model_config.py
@@ -491,3 +491,12 @@ def test_models_importable_without_litellm():
         f"stderr: {result.stderr}"
     )
     assert "SUCCESS" in result.stdout
+
+def test_gpt_5_4_config():
+    """Test that gpt-5.4 has correct configuration."""
+    model = MODELS["gpt-5.4"]
+
+    assert model["id"] == "gpt-5.4"
+    assert model["display_name"] == "GPT-5.4"
+    assert model["llm_config"]["model"] == "litellm_proxy/openai/gpt-5.4"
+

From 3cc370f1fd615ef42db2a15e7a68474537ce067c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 10 Mar 2026 00:35:22 +0000
Subject: [PATCH 2/5] Fix formatting

---
 tests/github_workflows/test_resolve_model_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/github_workflows/test_resolve_model_config.py b/tests/github_workflows/test_resolve_model_config.py
index d1e756a98a..55f66c3505 100644
--- a/tests/github_workflows/test_resolve_model_config.py
+++ b/tests/github_workflows/test_resolve_model_config.py
@@ -492,6 +492,7 @@ def test_models_importable_without_litellm():
     )
     assert "SUCCESS" in result.stdout
 
+
 def test_gpt_5_4_config():
     """Test that gpt-5.4 has correct configuration."""
     model = MODELS["gpt-5.4"]
@@ -499,4 +500,3 @@ def test_gpt_5_4_config():
     assert model["id"] == "gpt-5.4"
     assert model["display_name"] == "GPT-5.4"
     assert model["llm_config"]["model"] == "litellm_proxy/openai/gpt-5.4"
-

From 897362cc011a0cfdd5244530b390acaf58230f48 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 10 Mar 2026 21:53:46 +0000
Subject: [PATCH 3/5] Treat gpt-5.4 as reasoning model

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/run-eval/resolve_model_config.py            | 5 ++++-
 tests/github_workflows/test_resolve_model_config.py | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py
index dba4d416ca..0705f9ccb6 100755
--- a/.github/run-eval/resolve_model_config.py
+++ b/.github/run-eval/resolve_model_config.py
@@ -143,7 +143,10 @@
     "gpt-5.4": {
         "id": "gpt-5.4",
         "display_name": "GPT-5.4",
-        "llm_config": {"model": "litellm_proxy/openai/gpt-5.4"},
+        "llm_config": {
+            "model": "litellm_proxy/openai/gpt-5.4",
+            "reasoning_effort": "high",
+        },
     },
     "minimax-m2": {
         "id": "minimax-m2",
diff --git a/tests/github_workflows/test_resolve_model_config.py b/tests/github_workflows/test_resolve_model_config.py
index 55f66c3505..7e4c2ce145 100644
--- a/tests/github_workflows/test_resolve_model_config.py
+++ b/tests/github_workflows/test_resolve_model_config.py
@@ -500,3 +500,4 @@ def test_gpt_5_4_config():
     assert model["id"] == "gpt-5.4"
     assert model["display_name"] == "GPT-5.4"
     assert model["llm_config"]["model"] == "litellm_proxy/openai/gpt-5.4"
+    assert model["llm_config"]["reasoning_effort"] == "high"

From 91f248877b801cfd50e85c3b9c722716226111d7 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 10 Mar 2026 22:59:56 +0000
Subject: [PATCH 4/5] Add gpt-5.4 to reasoning effort model patterns

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/llm/utils/model_features.py | 1 +
 tests/sdk/llm/test_model_features.py                    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
index 2b1fec563f..6ca4b1badd 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -69,6 +69,7 @@ class ModelFeatures:
     "gemini-3.1-pro-preview",
     # OpenAI GPT-5 family (includes mini variants)
     "gpt-5",
+    "gpt-5.4",
     # Anthropic Opus 4.5 and 4.6
     "claude-opus-4-5",
     "claude-opus-4-6",
diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py
index 9032c8dd39..13c5f8e668 100644
--- a/tests/sdk/llm/test_model_features.py
+++ b/tests/sdk/llm/test_model_features.py
@@ -39,6 +39,7 @@ def test_model_matches(name, pattern, expected):
         # GPT-5 family
         ("gpt-5.2", True),
         ("gpt-5.2-codex", True),
+        ("gpt-5.4", True),
         ("gpt-4o", False),
         ("claude-3-5-sonnet", False),
         ("gemini-1.5-pro", False),

From 71b1dfadb068c39f701e8d6c2d1035067818784c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 11 Mar 2026 15:57:24 +0000
Subject: [PATCH 5/5] Fix c01_thinking_block_condenser to skip models without
 extended thinking

The test was failing for GPT-5.4: it only checks for thinking_blocks
(Anthropic-specific), yet it did not skip models that use reasoning_effort
(such as OpenAI models), which produce reasoning items instead.

Changes:
- Import get_features to check model capabilities
- Update skip logic so the test runs only when extended_thinking is configured or supported
- Update docstrings to clarify test scope

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../tests/c01_thinking_block_condenser.py     | 29 ++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/tests/integration/tests/c01_thinking_block_condenser.py b/tests/integration/tests/c01_thinking_block_condenser.py
index 7d806ee2f9..7cef276893 100644
--- a/tests/integration/tests/c01_thinking_block_condenser.py
+++ b/tests/integration/tests/c01_thinking_block_condenser.py
@@ -1,9 +1,13 @@
 """
 Integration test for thinking block handling during condensation.
 
-This test validates that Claude Opus's thinking blocks are properly handled
+This test validates that Anthropic Claude's thinking blocks are properly handled
 during conversation condensation, preventing malformed signature errors that
 can occur when thinking blocks are included in conversation history.
+
+Note: This test only applies to models that support extended_thinking (Anthropic
+Claude models). Models with reasoning_effort (like OpenAI o-series and GPT-5.x)
+produce reasoning items instead of thinking blocks, and are skipped.
 """
 
 from openhands.sdk import LLM, Message, TextContent, Tool
@@ -11,6 +15,7 @@
 from openhands.sdk.context.view import View
 from openhands.sdk.conversation.impl.local_conversation import LocalConversation
 from openhands.sdk.event import ActionEvent, Condensation
+from openhands.sdk.llm.utils.model_features import get_features
 from openhands.sdk.tool import register_tool
 from openhands.tools.terminal import TerminalTool
 from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
@@ -135,16 +140,16 @@ def setup(self) -> None:
         """
         Validate that the model supports extended thinking.
 
-        Thinking blocks are primarily supported by:
-        - Anthropic Claude models (extended_thinking)
-        - Some Gemini models (extended_thinking)
-        - Some other models (reasoning_effort)
+        Thinking blocks are specifically supported by Anthropic Claude models
+        with extended_thinking enabled. Models that only support reasoning_effort
+        (like OpenAI o-series and GPT-5.x) produce reasoning items instead of
+        thinking blocks, so they should be skipped.
         """
         model = self.llm_config.get("model", "")
+        features = get_features(model)
 
-        # Check if model has extended thinking or reasoning effort configured
+        # Check if model has extended thinking configured
         has_extended_thinking = self.llm_config.get("extended_thinking", False)
-        has_reasoning_effort = "reasoning_effort" in self.llm_config
 
         # For Claude Opus, automatically enable extended thinking if not set
         if "opus" in model.lower() and not has_extended_thinking:
@@ -154,11 +159,15 @@ def setup(self) -> None:
                 **{**self.llm.model_dump(), **self.llm_config}
             )
             self.agent.llm = self.llm
+            has_extended_thinking = True
 
-        # Skip test if model doesn't support thinking blocks
-        if not has_extended_thinking and not has_reasoning_effort:
+        # Skip test if model doesn't support extended thinking (which produces
+        # thinking_blocks). Models that only support reasoning_effort produce
+        # responses_reasoning_item instead, which is a different mechanism.
+        if not has_extended_thinking and not features.supports_extended_thinking:
             raise SkipTest(
-                f"Model {model} does not support extended thinking or reasoning effort"
+                f"Model {model} does not support extended thinking "
+                "(produces reasoning items instead of thinking blocks)"
             )
 
     def conversation_callback(self, event):