From 96db2446ceea3c0114e1a126ef0124dcea9fa6b6 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 25 Feb 2026 20:57:11 +0000 Subject: [PATCH 01/13] feat: Add API compliance monitoring for conversation events Implements an APIComplianceMonitor that watches events as they're added to conversations and detects API compliance violations. Currently operates in observation mode (violations are logged but events continue processing). New components: - APICompliancePropertyBase: Base class for compliance properties - ComplianceState: Shared state tracking pending/completed tool calls - ComplianceViolation: Represents detected violations with context - APIComplianceMonitor: Orchestrates checking across properties Four properties covering all 8 API compliance patterns: - InterleavedMessageProperty: a01, a03, a04, a07 - UnmatchedToolResultProperty: a02, a06 - DuplicateToolResultProperty: a05 - ToolResultOrderProperty: a08 Integration: - Added add_event() to ConversationState with compliance checking - Updated LocalConversation callback to use add_event() The design supports future per-property reconciliation strategies. Co-authored-by: openhands --- .../sdk/conversation/compliance/__init__.py | 31 ++ .../sdk/conversation/compliance/base.py | 92 +++++ .../sdk/conversation/compliance/monitor.py | 114 +++++++ .../sdk/conversation/compliance/properties.py | 188 ++++++++++ .../conversation/impl/local_conversation.py | 7 +- .../openhands/sdk/conversation/state.py | 49 ++- tests/sdk/conversation/compliance/__init__.py | 1 + tests/sdk/conversation/compliance/conftest.py | 116 +++++++ .../conversation/compliance/test_monitor.py | 157 +++++++++ .../compliance/test_properties.py | 321 ++++++++++++++++++ .../compliance/test_state_integration.py | 136 ++++++++ 11 files changed, 1207 insertions(+), 5 deletions(-) create mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/__init__.py create mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/base.py create mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/monitor.py create mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties.py create mode 100644 tests/sdk/conversation/compliance/__init__.py create mode 100644 tests/sdk/conversation/compliance/conftest.py create mode 100644 tests/sdk/conversation/compliance/test_monitor.py create mode 100644 tests/sdk/conversation/compliance/test_properties.py create mode 100644 tests/sdk/conversation/compliance/test_state_integration.py diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py new file mode 100644 index 0000000000..11c398ef97 --- /dev/null +++ b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py @@ -0,0 +1,31 @@ +"""API Compliance monitoring for conversation events. + +This module provides monitors that detect violations of LLM API requirements +in the event stream. Violations are currently logged for data gathering; +future versions may support per-property reconciliation strategies. +""" + +from openhands.sdk.conversation.compliance.base import ( + APICompliancePropertyBase, + ComplianceState, + ComplianceViolation, +) +from openhands.sdk.conversation.compliance.monitor import APIComplianceMonitor +from openhands.sdk.conversation.compliance.properties import ( + DuplicateToolResultProperty, + InterleavedMessageProperty, + ToolResultOrderProperty, + UnmatchedToolResultProperty, +) + + +__all__ = [ + "APICompliancePropertyBase", + "APIComplianceMonitor", + "ComplianceState", + "ComplianceViolation", + "DuplicateToolResultProperty", + "InterleavedMessageProperty", + "ToolResultOrderProperty", + "UnmatchedToolResultProperty", +] diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/base.py b/openhands-sdk/openhands/sdk/conversation/compliance/base.py new file mode 100644 index 0000000000..d303733e02 --- /dev/null +++ b/openhands-sdk/openhands/sdk/conversation/compliance/base.py @@ -0,0 +1,92 @@ +"""Base classes for API compliance monitoring.""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field + +from openhands.sdk.event import Event +from openhands.sdk.event.types import EventID, ToolCallID + + +@dataclass +class ComplianceViolation: + """Represents an API compliance violation. + + Attributes: + property_name: Name of the property that was violated. + event_id: ID of the event that caused the violation. + description: Human-readable description of the violation. + context: Optional additional context (e.g., related tool_call_ids). + """ + + property_name: str + event_id: EventID + description: str + context: dict[str, object] | None = None + + +@dataclass +class ComplianceState: + """Shared state for tracking API compliance across properties. + + This state is updated as events are processed and provides the context + needed by individual properties to detect violations. + + Attributes: + pending_tool_call_ids: Tool calls that have been made but not yet + received results. Maps tool_call_id to the ActionEvent id. + completed_tool_call_ids: Tool calls that have received results. + Used to detect duplicate results. + all_tool_call_ids: All tool_call_ids seen so far (for detecting + results referencing unknown calls). + """ + + pending_tool_call_ids: dict[ToolCallID, EventID] = field(default_factory=dict) + completed_tool_call_ids: set[ToolCallID] = field(default_factory=set) + all_tool_call_ids: set[ToolCallID] = field(default_factory=set) + + +class APICompliancePropertyBase(ABC): + """Base class for API compliance properties. + + Each property represents a single compliance rule that LLM APIs expect. + Properties check incoming events for violations and update shared state. + + The design allows for future per-property reconciliation strategies + while currently defaulting to logging violations. + """ + + @property + @abstractmethod + def name(self) -> str: + """Unique identifier for this property. + + Used in violation reports and logging. + """ + + @abstractmethod + def check( + self, + event: Event, + state: ComplianceState, + ) -> ComplianceViolation | None: + """Check if an event violates this property. + + Args: + event: The event about to be added to the conversation. + state: Current compliance state for context. + + Returns: + A ComplianceViolation if the event violates this property, + None otherwise. + """ + + def update_state(self, event: Event, state: ComplianceState) -> None: + """Update compliance state after an event is processed. + + Override this method if the property needs to track state. + Default implementation does nothing. + + Args: + event: The event that was just processed. + state: The compliance state to update. + """ diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py new file mode 100644 index 0000000000..ce6e8ca929 --- /dev/null +++ b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py @@ -0,0 +1,114 @@ +"""API Compliance Monitor that checks events before adding to conversation.""" + +from openhands.sdk.conversation.compliance.base import ( + APICompliancePropertyBase, + ComplianceState, + ComplianceViolation, +) +from openhands.sdk.conversation.compliance.properties import ALL_COMPLIANCE_PROPERTIES +from openhands.sdk.event import Event +from openhands.sdk.logger import get_logger + + +logger = get_logger(__name__) + + +class APIComplianceMonitor: + """Monitors events for API compliance violations. + + This monitor checks incoming events against a set of compliance properties + and logs any violations. Currently operates in observation mode (violations + are logged but events are still processed). Future versions may support + per-property reconciliation strategies. + + Attributes: + state: Shared compliance state tracking tool calls. + properties: List of compliance properties to check. + """ + + def __init__( + self, + properties: list[APICompliancePropertyBase] | None = None, + ) -> None: + """Initialize the compliance monitor. + + Args: + properties: Compliance properties to check. If None, uses all + default properties. + """ + self.state = ComplianceState() + self.properties = ( + properties if properties is not None else list(ALL_COMPLIANCE_PROPERTIES) + ) + self._violations: list[ComplianceViolation] = [] + + def check_event(self, event: Event) -> list[ComplianceViolation]: + """Check an event for compliance violations. + + This method checks the event against all properties and returns any + violations found. It does NOT update state - call process_event() + after adding the event to update tracking state. + + Args: + event: The event to check. + + Returns: + List of violations detected (empty if compliant). + """ + violations: list[ComplianceViolation] = [] + + for prop in self.properties: + violation = prop.check(event, self.state) + if violation is not None: + violations.append(violation) + logger.warning( + "API compliance violation detected: %s - %s (event_id=%s)", + violation.property_name, + violation.description, + violation.event_id, + ) + + return violations + + def update_state(self, event: Event) -> None: + """Update compliance state after an event is processed. + + Call this after the event has been added to the event log. + + Args: + event: The event that was just processed. + """ + for prop in self.properties: + prop.update_state(event, self.state) + + def process_event(self, event: Event) -> list[ComplianceViolation]: + """Check an event and update state. + + This is a convenience method that combines check_event() and + update_state(). Use this when you want to check and track in + one call. + + Args: + event: The event to process. + + Returns: + List of violations detected (empty if compliant). + """ + violations = self.check_event(event) + self._violations.extend(violations) + self.update_state(event) + return violations + + @property + def violations(self) -> list[ComplianceViolation]: + """Get all violations detected so far.""" + return list(self._violations) + + @property + def violation_count(self) -> int: + """Get the total number of violations detected.""" + return len(self._violations) + + def clear_violations(self) -> None: + """Clear the violations history.""" + self._violations.clear() diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties.py new file mode 100644 index 0000000000..abce522005 --- /dev/null +++ b/openhands-sdk/openhands/sdk/conversation/compliance/properties.py @@ -0,0 +1,188 @@ +"""Concrete implementations of API compliance properties. + +Each property corresponds to one or more API compliance patterns: +- InterleavedMessageProperty: a01, a03, a04, a07 +- UnmatchedToolResultProperty: a02, a06 +- DuplicateToolResultProperty: a05 +- ToolResultOrderProperty: a08 +""" + +from openhands.sdk.conversation.compliance.base import ( + APICompliancePropertyBase, + ComplianceState, + ComplianceViolation, +) +from openhands.sdk.event import ActionEvent, Event, MessageEvent, ObservationBaseEvent + + +class InterleavedMessageProperty(APICompliancePropertyBase): + """Detects messages interleaved between tool_use and tool_result. + + Violations: + - User or assistant message arrives while tool calls are pending. + + Corresponds to patterns: a01 (unmatched tool_use), a03 (interleaved user), + a04 (interleaved assistant), a07 (parallel missing result). + """ + + @property + def name(self) -> str: + return "interleaved_message" + + def check( + self, + event: Event, + state: ComplianceState, + ) -> ComplianceViolation | None: + # Only check MessageEvent + if not isinstance(event, MessageEvent): + return None + + # Violation if there are pending tool calls + if state.pending_tool_call_ids: + pending_ids = list(state.pending_tool_call_ids.keys()) + return ComplianceViolation( + property_name=self.name, + event_id=event.id, + description=( + f"Message event interleaved with {len(pending_ids)} pending " + f"tool call(s)" + ), + context={"pending_tool_call_ids": pending_ids}, + ) + + return None + + def update_state(self, event: Event, state: ComplianceState) -> None: + """Track pending and completed tool calls.""" + if isinstance(event, ActionEvent): + state.pending_tool_call_ids[event.tool_call_id] = event.id + state.all_tool_call_ids.add(event.tool_call_id) + elif isinstance(event, ObservationBaseEvent): + # Move from pending to completed + state.pending_tool_call_ids.pop(event.tool_call_id, None) + state.completed_tool_call_ids.add(event.tool_call_id) + + +class UnmatchedToolResultProperty(APICompliancePropertyBase): + """Detects tool results that reference unknown tool_call_ids. + + Violations: + - Tool result references a tool_call_id that was never seen. + + Corresponds to patterns: a02 (unmatched tool_result), a06 (wrong tool_call_id). + """ + + @property + def name(self) -> str: + return "unmatched_tool_result" + + def check( + self, + event: Event, + state: ComplianceState, + ) -> ComplianceViolation | None: + if not isinstance(event, ObservationBaseEvent): + return None + + # Check if tool_call_id was ever seen + if event.tool_call_id not in state.all_tool_call_ids: + return ComplianceViolation( + property_name=self.name, + event_id=event.id, + description=( + f"Tool result references unknown tool_call_id: {event.tool_call_id}" + ), + context={"tool_call_id": event.tool_call_id}, + ) + + return None + + +class DuplicateToolResultProperty(APICompliancePropertyBase): + """Detects duplicate tool results for the same tool_call_id. + + Violations: + - Tool result arrives for a tool_call_id that already has a result. + + Corresponds to pattern: a05 (duplicate tool_call_id). + """ + + @property + def name(self) -> str: + return "duplicate_tool_result" + + def check( + self, + event: Event, + state: ComplianceState, + ) -> ComplianceViolation | None: + if not isinstance(event, ObservationBaseEvent): + return None + + # Check if this tool_call_id already has a result + if event.tool_call_id in state.completed_tool_call_ids: + return ComplianceViolation( + property_name=self.name, + event_id=event.id, + description=( + f"Duplicate tool result for tool_call_id: {event.tool_call_id}" + ), + context={"tool_call_id": event.tool_call_id}, + ) + + return None + + +class ToolResultOrderProperty(APICompliancePropertyBase): + """Detects tool results that arrive before their corresponding actions. + + Violations: + - Tool result arrives before any action with that tool_call_id. + + Corresponds to pattern: a08 (parallel wrong order). + + Note: This is similar to UnmatchedToolResultProperty but semantically + different - here the action may arrive later, whereas unmatched means + the action never existed. + """ + + @property + def name(self) -> str: + return "tool_result_order" + + def check( + self, + event: Event, + state: ComplianceState, + ) -> ComplianceViolation | None: + if not isinstance(event, ObservationBaseEvent): + return None + + # Check if we've seen an action with this tool_call_id + # (pending or completed - either means we saw it) + seen_in_pending = event.tool_call_id in state.pending_tool_call_ids + seen_in_completed = event.tool_call_id in state.completed_tool_call_ids + seen_in_all = event.tool_call_id in state.all_tool_call_ids + + if not (seen_in_pending or seen_in_completed or seen_in_all): + return ComplianceViolation( + property_name=self.name, + event_id=event.id, + description=( + f"Tool result arrived before action for tool_call_id: " + f"{event.tool_call_id}" + ), + context={"tool_call_id": event.tool_call_id}, + ) + + return None + + +# All properties in recommended check order +ALL_COMPLIANCE_PROPERTIES: list[APICompliancePropertyBase] = [ + ToolResultOrderProperty(), + UnmatchedToolResultProperty(), + DuplicateToolResultProperty(), + InterleavedMessageProperty(), +] diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py index 885ec5abdf..926c5f1081 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py @@ -179,12 +179,15 @@ def __init__( cipher=cipher, ) - # Default callback: persist every event to state + # Default callback: persist every event to state with compliance checking def _default_callback(e): # This callback runs while holding the conversation state's lock # (see BaseConversation.compose_callbacks usage inside `with self._state:` # regions), so updating state here is thread-safe. - self._state.events.append(e) + # + # Use add_event() to check API compliance before appending. + # Violations are logged but events are still processed. + self._state.add_event(e) # Track user MessageEvent IDs here so hook callbacks (which may # synthesize or alter user messages) are captured in one place. if isinstance(e, MessageEvent) and e.source == "user": diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index 8cef2c04b2..23c216aa2e 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -1,9 +1,11 @@ # state.py +from __future__ import annotations + import json from collections.abc import Sequence from enum import Enum from pathlib import Path -from typing import Any, Self +from typing import TYPE_CHECKING, Any, Self from pydantic import Field, PrivateAttr, model_validator @@ -30,6 +32,10 @@ from openhands.sdk.workspace.base import BaseWorkspace +if TYPE_CHECKING: + from openhands.sdk.conversation.compliance import APIComplianceMonitor + + logger = get_logger(__name__) @@ -177,6 +183,9 @@ class ConversationState(OpenHandsModel): _lock: FIFOLock = PrivateAttr( default_factory=FIFOLock ) # FIFO lock for thread safety + _compliance_monitor: APIComplianceMonitor | None = PrivateAttr( + default=None + ) # API compliance monitor (lazy-initialized) @model_validator(mode="before") @classmethod @@ -244,7 +253,7 @@ def _save_base_state(self, fs: FileStore) -> None: # ===== Factory: open-or-create (no load/save methods needed) ===== @classmethod def create( - cls: type["ConversationState"], + cls: type[ConversationState], id: ConversationID, agent: AgentBase, workspace: BaseWorkspace, @@ -252,7 +261,7 @@ def create( max_iterations: int = 500, stuck_detection: bool = True, cipher: Cipher | None = None, - ) -> "ConversationState": + ) -> ConversationState: """Create a new conversation state or resume from persistence. This factory method handles both new conversation creation and resumption @@ -506,3 +515,37 @@ def owned(self) -> bool: Return True if the lock is currently held by the calling thread. """ return self._lock.owned() + + # ===== API Compliance Monitoring ===== + + @property + def compliance_monitor(self) -> APIComplianceMonitor: + """Get or create the API compliance monitor. + + The monitor is lazily initialized on first access. + """ + if self._compliance_monitor is None: + from openhands.sdk.conversation.compliance import APIComplianceMonitor + + self._compliance_monitor = APIComplianceMonitor() + return self._compliance_monitor + + def add_event(self, event: Event) -> None: + """Add an event to the conversation, checking for API compliance. + + This method should be used instead of directly appending to events. + It checks the event against API compliance properties and logs any + violations before adding the event to the event log. + + Currently operates in observation mode: violations are logged but + events are still processed. Future versions may support per-property + reconciliation strategies. + + Args: + event: The event to add to the conversation. + """ + # Check for compliance violations (logs warnings if any) + self.compliance_monitor.process_event(event) + + # Add to event log regardless of violations (observation mode) + self._events.append(event) diff --git a/tests/sdk/conversation/compliance/__init__.py b/tests/sdk/conversation/compliance/__init__.py new file mode 100644 index 0000000000..4c378bbe9f --- /dev/null +++ b/tests/sdk/conversation/compliance/__init__.py @@ -0,0 +1 @@ +"""Tests for API compliance monitoring.""" diff --git a/tests/sdk/conversation/compliance/conftest.py b/tests/sdk/conversation/compliance/conftest.py new file mode 100644 index 0000000000..28ab335ca2 --- /dev/null +++ b/tests/sdk/conversation/compliance/conftest.py @@ -0,0 +1,116 @@ +"""Fixtures for API compliance tests.""" + +import uuid + +import pytest + +from openhands.sdk.event import ActionEvent, MessageEvent, ObservationEvent +from openhands.sdk.event.types import ToolCallID +from openhands.sdk.llm import Message, MessageToolCall, TextContent +from openhands.sdk.tool import Observation + + +class SimpleObservation(Observation): + """Simple observation for testing.""" + + result: str + + @property + def to_llm_content(self) -> list[TextContent]: + return [TextContent(text=self.result)] + + +def make_action_event( + tool_call_id: ToolCallID | None = None, + tool_name: str = "terminal", +) -> ActionEvent: + """Create an ActionEvent for testing.""" + call_id = tool_call_id or f"call_{uuid.uuid4().hex[:8]}" + event_id = str(uuid.uuid4()) + return ActionEvent( + id=event_id, + source="agent", + thought=[TextContent(text="Let me do this")], + tool_name=tool_name, + tool_call_id=call_id, + tool_call=MessageToolCall( + id=call_id, + name=tool_name, + arguments='{"command": "ls"}', + origin="completion", + ), + llm_response_id=str(uuid.uuid4()), + ) + + +def make_observation_event( + action_event: ActionEvent, + result: str = "output", +) -> ObservationEvent: + """Create an ObservationEvent matching an ActionEvent.""" + return ObservationEvent( + id=str(uuid.uuid4()), + source="environment", + tool_name=action_event.tool_name, + tool_call_id=action_event.tool_call_id, + action_id=action_event.id, + observation=SimpleObservation(result=result), + ) + + +def make_orphan_observation_event( + tool_call_id: ToolCallID, + tool_name: str = "terminal", + action_id: str | None = None, +) -> ObservationEvent: + """Create an ObservationEvent with no matching action.""" + return ObservationEvent( + id=str(uuid.uuid4()), + source="environment", + tool_name=tool_name, + tool_call_id=tool_call_id, + action_id=action_id or str(uuid.uuid4()), + observation=SimpleObservation(result="orphan result"), + ) + + +def make_user_message_event(text: str = "Hello") -> MessageEvent: + """Create a user MessageEvent.""" + return MessageEvent( + id=str(uuid.uuid4()), + source="user", + llm_message=Message( + role="user", + content=[TextContent(text=text)], + ), + ) + + +def make_assistant_message_event(text: str = "I'll help") -> MessageEvent: + """Create an assistant MessageEvent.""" + return MessageEvent( + id=str(uuid.uuid4()), + source="agent", + llm_message=Message( + role="assistant", + content=[TextContent(text=text)], + ), + ) + + +@pytest.fixture +def action_event(): + """A single action event.""" + return make_action_event() + + +@pytest.fixture +def user_message_event(): + """A user message event.""" + return make_user_message_event() + + +@pytest.fixture +def assistant_message_event(): + """An assistant message event.""" + return make_assistant_message_event() diff --git a/tests/sdk/conversation/compliance/test_monitor.py b/tests/sdk/conversation/compliance/test_monitor.py new file mode 100644 index 0000000000..7cdf55b82e --- /dev/null +++ b/tests/sdk/conversation/compliance/test_monitor.py @@ -0,0 +1,157 @@ +"""Tests for the APIComplianceMonitor.""" + +from openhands.sdk.conversation.compliance import ( + APIComplianceMonitor, + InterleavedMessageProperty, +) +from tests.sdk.conversation.compliance.conftest import ( + make_action_event, + make_observation_event, + make_orphan_observation_event, + make_user_message_event, +) + + +def test_monitor_no_violations_normal_flow(): + """Normal conversation flow should have no violations.""" + monitor = APIComplianceMonitor() + + # Normal flow: action -> observation -> message + action = make_action_event(tool_call_id="call_1") + violations = monitor.process_event(action) + assert len(violations) == 0 + + obs = make_observation_event(action) + violations = monitor.process_event(obs) + assert len(violations) == 0 + + user_msg = make_user_message_event() + violations = monitor.process_event(user_msg) + assert len(violations) == 0 + + assert monitor.violation_count == 0 + + +def test_monitor_detects_interleaved_message(): + """Monitor should detect interleaved message violation.""" + monitor = APIComplianceMonitor() + + action = make_action_event(tool_call_id="call_1") + monitor.process_event(action) + + # User message before observation - violation + user_msg = make_user_message_event() + violations = monitor.process_event(user_msg) + + assert len(violations) == 1 + assert violations[0].property_name == "interleaved_message" + assert monitor.violation_count == 1 + + +def test_monitor_detects_multiple_violations(): + """Monitor should detect multiple violations.""" + monitor = APIComplianceMonitor() + + # Orphan observation (unmatched + order violation) + orphan = make_orphan_observation_event(tool_call_id="call_unknown") + violations = monitor.process_event(orphan) + + # Should detect both tool_result_order and unmatched_tool_result + assert len(violations) >= 2 + property_names = {v.property_name for v in violations} + assert "tool_result_order" in property_names + assert "unmatched_tool_result" in property_names + + +def test_monitor_tracks_violations_history(): + """Monitor should accumulate violations over time.""" + monitor = APIComplianceMonitor() + + # First violation + action = make_action_event(tool_call_id="call_1") + monitor.process_event(action) + monitor.process_event(make_user_message_event()) # interleaved + + initial_count = monitor.violation_count + assert initial_count > 0 + + # Second violation + monitor.process_event(make_orphan_observation_event(tool_call_id="unknown")) + + assert monitor.violation_count > initial_count + assert len(monitor.violations) == monitor.violation_count + + +def test_monitor_clear_violations(): + """Monitor should allow clearing violation history.""" + monitor = APIComplianceMonitor() + + action = make_action_event(tool_call_id="call_1") + monitor.process_event(action) + monitor.process_event(make_user_message_event()) + + assert monitor.violation_count > 0 + + monitor.clear_violations() + assert monitor.violation_count == 0 + assert len(monitor.violations) == 0 + + +def test_monitor_custom_properties(): + """Monitor can be initialized with custom properties.""" + # Only check interleaved messages + monitor = APIComplianceMonitor(properties=[InterleavedMessageProperty()]) + + # This would normally trigger tool_result_order violation + orphan = make_orphan_observation_event(tool_call_id="call_unknown") + violations = monitor.process_event(orphan) + + # But we only have InterleavedMessageProperty, so no violations + assert len(violations) == 0 + + +def test_monitor_state_persists_across_events(): + """Monitor state should persist correctly across events.""" + monitor = APIComplianceMonitor() + + # Add action + action1 = make_action_event(tool_call_id="call_1") + monitor.process_event(action1) + + assert "call_1" in monitor.state.pending_tool_call_ids + assert "call_1" in monitor.state.all_tool_call_ids + + # Add observation + obs1 = make_observation_event(action1) + monitor.process_event(obs1) + + assert "call_1" not in monitor.state.pending_tool_call_ids + assert "call_1" in monitor.state.completed_tool_call_ids + assert "call_1" in monitor.state.all_tool_call_ids + + +def test_monitor_parallel_tool_calls(): + """Monitor should handle parallel tool calls correctly.""" + monitor = APIComplianceMonitor() + + # Three parallel actions + action1 = make_action_event(tool_call_id="call_sf") + action2 = make_action_event(tool_call_id="call_tokyo") + action3 = make_action_event(tool_call_id="call_paris") + + for action in [action1, action2, action3]: + monitor.process_event(action) + + assert len(monitor.state.pending_tool_call_ids) == 3 + + # Two results arrive + monitor.process_event(make_observation_event(action1)) + monitor.process_event(make_observation_event(action2)) + + assert len(monitor.state.pending_tool_call_ids) == 1 + assert "call_paris" in monitor.state.pending_tool_call_ids + + # User message with one pending - violation + violations = monitor.process_event(make_user_message_event()) + assert len(violations) == 1 + assert "call_paris" in str(violations[0].context) diff --git a/tests/sdk/conversation/compliance/test_properties.py b/tests/sdk/conversation/compliance/test_properties.py new file mode 100644 index 0000000000..7d8477f27a --- /dev/null +++ b/tests/sdk/conversation/compliance/test_properties.py @@ -0,0 +1,321 @@ +"""Tests for individual API compliance properties. + +These tests correspond to the API compliance patterns in tests/integration/tests/a*.py: +- a01: Unmatched tool_use -> InterleavedMessageProperty +- a02: Unmatched tool_result -> UnmatchedToolResultProperty +- a03: Interleaved user message -> InterleavedMessageProperty +- a04: Interleaved assistant message -> InterleavedMessageProperty +- a05: Duplicate tool_call_id -> DuplicateToolResultProperty +- a06: Wrong tool_call_id -> UnmatchedToolResultProperty +- a07: Parallel missing result -> InterleavedMessageProperty +- a08: Parallel wrong order -> ToolResultOrderProperty +""" + +from openhands.sdk.conversation.compliance import ( + ComplianceState, + DuplicateToolResultProperty, + InterleavedMessageProperty, + ToolResultOrderProperty, + UnmatchedToolResultProperty, +) +from tests.sdk.conversation.compliance.conftest import ( + make_action_event, + make_assistant_message_event, + make_observation_event, + make_orphan_observation_event, + make_user_message_event, +) + + +# ============================================================================= +# InterleavedMessageProperty Tests +# Detects: a01, a03, a04, a07 +# ============================================================================= + + +def test_interleaved_no_violation_when_no_pending_actions(): + """User message is fine when no tool calls are pending.""" + prop = InterleavedMessageProperty() + state = ComplianceState() + + user_msg = make_user_message_event() + violation = prop.check(user_msg, state) + + assert violation is None + + +def test_interleaved_violation_user_message_with_pending_action(): + """User message while action is pending violates the property (a01/a03).""" + prop = InterleavedMessageProperty() + state = ComplianceState() + + # Simulate an action that's pending + action = make_action_event(tool_call_id="call_123") + state.pending_tool_call_ids["call_123"] = action.id + + user_msg = make_user_message_event() + violation = prop.check(user_msg, state) + + assert violation is not None + assert violation.property_name == "interleaved_message" + assert "pending" in violation.description.lower() + assert "call_123" in str(violation.context) + + +def test_interleaved_violation_assistant_message_with_pending_action(): + """Assistant message while action is pending violates the property (a04).""" + prop = InterleavedMessageProperty() + state = ComplianceState() + + action = make_action_event(tool_call_id="call_456") + state.pending_tool_call_ids["call_456"] = action.id + + assistant_msg = make_assistant_message_event() + violation = prop.check(assistant_msg, state) + + assert violation is not None + assert violation.property_name == "interleaved_message" + + +def test_interleaved_violation_parallel_missing_result(): + """User message with partial parallel results violates property (a07).""" + prop = InterleavedMessageProperty() + state = ComplianceState() + + # Simulate 3 parallel actions, only 2 have results + state.pending_tool_call_ids["call_sf"] = "action_sf" + state.pending_tool_call_ids["call_tokyo"] = "action_tokyo" + state.pending_tool_call_ids["call_paris"] = "action_paris" + + # Two results arrive + state.pending_tool_call_ids.pop("call_sf") + state.completed_tool_call_ids.add("call_sf") + state.pending_tool_call_ids.pop("call_tokyo") + state.completed_tool_call_ids.add("call_tokyo") + # call_paris is still pending + + user_msg = make_user_message_event("What about Paris?") + violation = prop.check(user_msg, state) + + assert violation is not None + assert "call_paris" in str(violation.context) + + +def test_interleaved_no_violation_for_action_events(): + """ActionEvent itself doesn't trigger interleaved violation.""" + prop = InterleavedMessageProperty() + state = ComplianceState() + + # Even with pending actions, a new action is fine + state.pending_tool_call_ids["call_existing"] = "action_existing" + action = make_action_event() + + violation = prop.check(action, state) + assert violation is None + + +def test_interleaved_no_violation_for_observation_events(): + """ObservationEvent doesn't trigger interleaved violation.""" + prop = InterleavedMessageProperty() + state = ComplianceState() + + action = make_action_event(tool_call_id="call_789") + state.pending_tool_call_ids["call_789"] = action.id + + obs = make_observation_event(action) + violation = prop.check(obs, state) + + assert violation is None + + +# ============================================================================= +# UnmatchedToolResultProperty Tests +# Detects: a02, a06 +# ============================================================================= + + +def test_unmatched_result_no_violation_when_action_exists(): + """Tool result is fine when its tool_call_id exists.""" + prop = UnmatchedToolResultProperty() + state = ComplianceState() + + action = make_action_event(tool_call_id="call_valid") + state.all_tool_call_ids.add("call_valid") + state.pending_tool_call_ids["call_valid"] = action.id + + obs = make_observation_event(action) + violation = prop.check(obs, state) + + assert violation is None + + +def test_unmatched_result_violation_unknown_tool_call_id(): + """Tool result with unknown tool_call_id violates property (a02/a06).""" + prop = UnmatchedToolResultProperty() + state = ComplianceState() + + # No actions have been seen + orphan_obs = make_orphan_observation_event(tool_call_id="call_unknown") + violation = prop.check(orphan_obs, state) + + assert violation is not None + assert violation.property_name == "unmatched_tool_result" + assert "call_unknown" in violation.description + + +def test_unmatched_result_violation_wrong_tool_call_id(): + """Tool result referencing wrong tool_call_id violates property (a06).""" + prop = UnmatchedToolResultProperty() + state = ComplianceState() + + # We have action with call_correct, but result references call_wrong + state.all_tool_call_ids.add("call_correct") + state.pending_tool_call_ids["call_correct"] = "action_correct" + + orphan_obs = make_orphan_observation_event(tool_call_id="call_wrong") + violation = prop.check(orphan_obs, state) + + assert violation is not None + assert "call_wrong" in violation.description + + +def test_unmatched_result_no_violation_for_non_observation(): + """Non-observation events don't trigger this property.""" + prop = UnmatchedToolResultProperty() + state = ComplianceState() + + user_msg = make_user_message_event() + violation = prop.check(user_msg, state) + + assert violation is None + + +# ============================================================================= +# DuplicateToolResultProperty Tests +# Detects: a05 +# ============================================================================= + + +def test_duplicate_result_no_violation_first_result(): + """First tool result for a tool_call_id is fine.""" + prop = DuplicateToolResultProperty() + state = ComplianceState() + + action = make_action_event(tool_call_id="call_first") + state.all_tool_call_ids.add("call_first") + state.pending_tool_call_ids["call_first"] = action.id + + obs = make_observation_event(action) + violation = prop.check(obs, state) + + assert violation is None + + +def test_duplicate_result_violation_second_result(): + """Second tool result for same tool_call_id violates property (a05).""" + prop = DuplicateToolResultProperty() + state = ComplianceState() + + # First result already processed + state.completed_tool_call_ids.add("call_duplicate") + state.all_tool_call_ids.add("call_duplicate") + + # Second result arrives + duplicate_obs = make_orphan_observation_event(tool_call_id="call_duplicate") + violation = prop.check(duplicate_obs, state) + + assert violation is not None + assert violation.property_name == "duplicate_tool_result" + assert "call_duplicate" in violation.description + + +def test_duplicate_result_no_violation_for_non_observation(): + """Non-observation events don't trigger this property.""" + prop = DuplicateToolResultProperty() + state = ComplianceState() + + action = make_action_event() + violation = prop.check(action, state) + + assert violation is None + + +# ============================================================================= +# ToolResultOrderProperty Tests +# Detects: a08 +# ============================================================================= + + +def test_result_order_no_violation_normal_order(): + """Tool result after action is fine.""" + prop = ToolResultOrderProperty() + state = ComplianceState() + + action = make_action_event(tool_call_id="call_ordered") + state.all_tool_call_ids.add("call_ordered") + state.pending_tool_call_ids["call_ordered"] = action.id + + obs = make_observation_event(action) + violation = prop.check(obs, state) + + assert violation is None + + +def test_result_order_violation_result_before_action(): + """Tool result before action violates property (a08).""" + prop = ToolResultOrderProperty() + state = ComplianceState() + + # Result arrives, but no action has been seen with this ID + # (different from unmatched - here the action may come later) + orphan_obs = make_orphan_observation_event(tool_call_id="call_early") + violation = prop.check(orphan_obs, state) + + assert violation is not None + assert violation.property_name == "tool_result_order" + assert "call_early" in violation.description + + +def test_result_order_no_violation_for_non_observation(): + """Non-observation events don't trigger this property.""" + prop = ToolResultOrderProperty() + state = ComplianceState() + + user_msg = make_user_message_event() + violation = prop.check(user_msg, state) + + assert violation is None + + +# ============================================================================= +# State Update Tests +# ============================================================================= + + +def test_state_update_action_adds_pending(): + """Adding an action should update pending_tool_call_ids.""" + prop = InterleavedMessageProperty() + state = ComplianceState() + + action = make_action_event(tool_call_id="call_new") + prop.update_state(action, state) + + assert "call_new" in state.pending_tool_call_ids + assert "call_new" in state.all_tool_call_ids + + +def test_state_update_observation_resolves_pending(): + """Adding an observation should move from pending to completed.""" + prop = InterleavedMessageProperty() + state = ComplianceState() + + # Setup: action is pending + action = make_action_event(tool_call_id="call_resolve") + state.pending_tool_call_ids["call_resolve"] = action.id + state.all_tool_call_ids.add("call_resolve") + + obs = make_observation_event(action) + prop.update_state(obs, state) + + assert "call_resolve" not in state.pending_tool_call_ids + assert "call_resolve" in state.completed_tool_call_ids diff --git a/tests/sdk/conversation/compliance/test_state_integration.py b/tests/sdk/conversation/compliance/test_state_integration.py new file mode 100644 index 0000000000..cade3f08fd --- /dev/null +++ b/tests/sdk/conversation/compliance/test_state_integration.py @@ -0,0 +1,136 @@ +"""Tests for ConversationState.add_event() integration with compliance monitoring.""" + +import uuid + +import pytest +from pydantic import SecretStr + +from openhands.sdk import LLM, Agent +from openhands.sdk.conversation.state import ConversationState +from openhands.sdk.workspace import LocalWorkspace +from tests.sdk.conversation.compliance.conftest import ( + make_action_event, + make_observation_event, + make_user_message_event, +) + + +@pytest.fixture +def temp_workspace(tmp_path): + """Create a temporary workspace for testing.""" + workspace_dir = tmp_path / "workspace" + workspace_dir.mkdir() + return LocalWorkspace(working_dir=workspace_dir) + + +@pytest.fixture +def mock_agent(): + """Create a mock agent for testing.""" + llm = LLM(model="mock-model", api_key=SecretStr("fake-key")) + return Agent(llm=llm) + + +@pytest.fixture +def conversation_state(temp_workspace, mock_agent): + """Create a ConversationState for testing.""" + return ConversationState.create( + id=uuid.uuid4(), + agent=mock_agent, + workspace=temp_workspace, + persistence_dir=None, # In-memory for testing + ) + + +def test_add_event_appends_to_event_log(conversation_state): + """add_event() should append events to the event log.""" + initial_count = len(conversation_state.events) + + user_msg = make_user_message_event("Hello") + conversation_state.add_event(user_msg) + + assert len(conversation_state.events) == initial_count + 1 + assert conversation_state.events[-1].id == user_msg.id + + +def test_add_event_lazy_creates_monitor(conversation_state): + """compliance_monitor should be lazily initialized.""" + # Initially None + assert conversation_state._compliance_monitor is None + + # Access via property triggers creation + monitor = conversation_state.compliance_monitor + + assert monitor is not None + assert conversation_state._compliance_monitor is monitor + + +def test_add_event_checks_compliance(conversation_state): + """add_event() should check compliance and detect violations.""" + # Add an action + action = make_action_event(tool_call_id="call_1") + conversation_state.add_event(action) + + # User message while action pending should create violation + user_msg = make_user_message_event() + conversation_state.add_event(user_msg) + + # Should have recorded violation + assert conversation_state.compliance_monitor.violation_count > 0 + violations = conversation_state.compliance_monitor.violations + assert any(v.property_name == "interleaved_message" for v in violations) + + +def test_add_event_normal_flow_no_violations(conversation_state): + """Normal conversation flow should have no violations.""" + # Normal flow: action -> observation -> user message + action = make_action_event(tool_call_id="call_1") + conversation_state.add_event(action) + + obs = make_observation_event(action) + conversation_state.add_event(obs) + + user_msg = make_user_message_event() + conversation_state.add_event(user_msg) + + # No violations + assert conversation_state.compliance_monitor.violation_count == 0 + + +def test_add_event_still_adds_on_violation(conversation_state): + """Events should still be added even when violations occur (observation mode).""" + action = make_action_event(tool_call_id="call_1") + conversation_state.add_event(action) + + # User message while action pending - violation + user_msg = make_user_message_event() + conversation_state.add_event(user_msg) + + # Event should still be in the log + assert conversation_state.events[-1].id == user_msg.id + + # Violation should be recorded + assert conversation_state.compliance_monitor.violation_count > 0 + + +def test_add_event_tracks_state_correctly(conversation_state): + """add_event() should correctly update compliance state.""" + action = make_action_event(tool_call_id="call_track") + conversation_state.add_event(action) + + monitor = conversation_state.compliance_monitor + assert "call_track" in monitor.state.pending_tool_call_ids + assert "call_track" in monitor.state.all_tool_call_ids + + obs = make_observation_event(action) + conversation_state.add_event(obs) + + assert "call_track" not in monitor.state.pending_tool_call_ids + assert "call_track" in monitor.state.completed_tool_call_ids + + +def test_compliance_monitor_property_returns_same_instance(conversation_state): + """compliance_monitor property should return the same instance each time.""" + monitor1 = conversation_state.compliance_monitor + monitor2 = conversation_state.compliance_monitor + + assert monitor1 is monitor2 From b9a791071063cf22dac0fe7f966bfba8d77622d4 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 25 Feb 2026 21:11:31 +0000 Subject: [PATCH 02/13] refactor: Address review feedback on API compliance monitor - Replace all_tool_call_ids field with computed property in ComplianceState to avoid maintaining redundant state (pending and completed already tracked) - Simplify ToolResultOrderProperty.check() to use single membership check instead of three redundant checks - Update InterleavedMessageProperty.update_state() to not maintain removed field - Clarify add_event() docstring that it is the only supported way to add events and direct mutation of events list bypasses compliance monitoring Co-authored-by: openhands --- .../openhands/sdk/conversation/compliance/base.py | 12 +++++++++--- .../sdk/conversation/compliance/properties.py | 11 +++-------- openhands-sdk/openhands/sdk/conversation/state.py | 5 ++++- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/base.py b/openhands-sdk/openhands/sdk/conversation/compliance/base.py index d303733e02..de8ca21c6f 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/base.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/base.py @@ -36,13 +36,19 @@ class ComplianceState: received results. Maps tool_call_id to the ActionEvent id. completed_tool_call_ids: Tool calls that have received results. Used to detect duplicate results. - all_tool_call_ids: All tool_call_ids seen so far (for detecting - results referencing unknown calls). """ pending_tool_call_ids: dict[ToolCallID, EventID] = field(default_factory=dict) completed_tool_call_ids: set[ToolCallID] = field(default_factory=set) - all_tool_call_ids: set[ToolCallID] = field(default_factory=set) + + @property + def all_tool_call_ids(self) -> set[ToolCallID]: + """All tool_call_ids seen so far (pending or completed). + + This is derived from pending_tool_call_ids (dict keys) and + completed_tool_call_ids to avoid maintaining redundant state. + """ + return set(self.pending_tool_call_ids) | self.completed_tool_call_ids class APICompliancePropertyBase(ABC): diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties.py index abce522005..e2ec0f200f 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/properties.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/properties.py @@ -57,7 +57,6 @@ def update_state(self, event: Event, state: ComplianceState) -> None: """Track pending and completed tool calls.""" if isinstance(event, ActionEvent): state.pending_tool_call_ids[event.tool_call_id] = event.id - state.all_tool_call_ids.add(event.tool_call_id) elif isinstance(event, ObservationBaseEvent): # Move from pending to completed state.pending_tool_call_ids.pop(event.tool_call_id, None) @@ -159,13 +158,9 @@ def check( if not isinstance(event, ObservationBaseEvent): return None - # Check if we've seen an action with this tool_call_id - # (pending or completed - either means we saw it) - seen_in_pending = event.tool_call_id in state.pending_tool_call_ids - seen_in_completed = event.tool_call_id in state.completed_tool_call_ids - seen_in_all = event.tool_call_id in state.all_tool_call_ids - - if not (seen_in_pending or seen_in_completed or seen_in_all): + # Check if we've seen an action with this tool_call_id. + # all_tool_call_ids combines pending and completed, so a single check suffices. + if event.tool_call_id not in state.all_tool_call_ids: return ComplianceViolation( property_name=self.name, event_id=event.id, diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index 23c216aa2e..123bd30d30 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -533,7 +533,10 @@ def compliance_monitor(self) -> APIComplianceMonitor: def add_event(self, event: Event) -> None: """Add an event to the conversation, checking for API compliance. - This method should be used instead of directly appending to events. + This is the only supported way to add events to the conversation. + Do not mutate the events list directly (e.g., via ``state.events.append()``), + as this bypasses compliance monitoring and may cause silent failures. + It checks the event against API compliance properties and logs any violations before adding the event to the event log. From 28d034db17e276e9e04ec3ae24a5e092df4b5fdb Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 25 Feb 2026 21:31:18 +0000 Subject: [PATCH 03/13] refactor: Address review feedback on API compliance monitor - Strengthen type from Event to LLMConvertibleEvent in compliance checks (base.py, monitor.py, properties) as we only check LLM-convertible events - Remove _violations field from APIComplianceMonitor since violations are returned and logged per call (callers can accumulate if needed) - Move compliance import to top of state.py instead of lazy import - Refactor properties.py into a module with separate files for each property: - properties/interleaved_message.py - properties/unmatched_tool_result.py - properties/duplicate_tool_result.py - properties/tool_result_order.py - Update add_event() to only check compliance for LLMConvertibleEvent instances - Update tests to match new design (use returned violations and log assertions) Co-authored-by: openhands --- .../sdk/conversation/compliance/__init__.py | 2 + .../sdk/conversation/compliance/base.py | 6 +- .../sdk/conversation/compliance/monitor.py | 24 +-- .../sdk/conversation/compliance/properties.py | 183 ------------------ .../compliance/properties/__init__.py | 39 ++++ .../properties/duplicate_tool_result.py | 43 ++++ .../properties/interleaved_message.py | 61 ++++++ .../properties/tool_result_order.py | 49 +++++ .../properties/unmatched_tool_result.py | 43 ++++ .../openhands/sdk/conversation/state.py | 26 +-- .../conversation/compliance/test_monitor.py | 42 ++-- .../compliance/test_state_integration.py | 41 ++-- 12 files changed, 301 insertions(+), 258 deletions(-) delete mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties.py create mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/__init__.py create mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/duplicate_tool_result.py create mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/interleaved_message.py create mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/tool_result_order.py create mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/unmatched_tool_result.py diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py index 11c398ef97..9950a71f38 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py @@ -12,6 +12,7 @@ ) from openhands.sdk.conversation.compliance.monitor import APIComplianceMonitor from openhands.sdk.conversation.compliance.properties import ( + ALL_COMPLIANCE_PROPERTIES, DuplicateToolResultProperty, InterleavedMessageProperty, ToolResultOrderProperty, @@ -20,6 +21,7 @@ __all__ = [ + "ALL_COMPLIANCE_PROPERTIES", "APICompliancePropertyBase", "APIComplianceMonitor", "ComplianceState", diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/base.py b/openhands-sdk/openhands/sdk/conversation/compliance/base.py index de8ca21c6f..32cfe5702d 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/base.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/base.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from openhands.sdk.event import Event +from openhands.sdk.event import LLMConvertibleEvent from openhands.sdk.event.types import EventID, ToolCallID @@ -72,7 +72,7 @@ def name(self) -> str: @abstractmethod def check( self, - event: Event, + event: LLMConvertibleEvent, state: ComplianceState, ) -> ComplianceViolation | None: """Check if an event violates this property. @@ -86,7 +86,7 @@ def check( None otherwise. """ - def update_state(self, event: Event, state: ComplianceState) -> None: + def update_state(self, event: LLMConvertibleEvent, state: ComplianceState) -> None: """Update compliance state after an event is processed. Override this method if the property needs to track state. diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py index ce6e8ca929..f84358c341 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py @@ -6,7 +6,7 @@ ComplianceViolation, ) from openhands.sdk.conversation.compliance.properties import ALL_COMPLIANCE_PROPERTIES -from openhands.sdk.event import Event +from openhands.sdk.event import LLMConvertibleEvent from openhands.sdk.logger import get_logger @@ -40,9 +40,8 @@ def __init__( self.properties = ( properties if properties is not None else list(ALL_COMPLIANCE_PROPERTIES) ) - self._violations: list[ComplianceViolation] = [] - def check_event(self, event: Event) -> list[ComplianceViolation]: + def check_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation]: """Check an event for compliance violations. This method checks the event against all properties and returns any @@ -70,7 +69,7 @@ def check_event(self, event: Event) -> list[ComplianceViolation]: return violations - def update_state(self, event: Event) -> None: + def update_state(self, event: LLMConvertibleEvent) -> None: """Update compliance state after an event is processed. Call this after the event has been added to the event log. @@ -81,7 +80,7 @@ def update_state(self, event: Event) -> None: for prop in self.properties: prop.update_state(event, self.state) - def process_event(self, event: Event) -> list[ComplianceViolation]: + def process_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation]: """Check an event and update state. This is a convenience method that combines check_event() and @@ -95,20 +94,5 @@ def process_event(self, event: Event) -> list[ComplianceViolation]: List of violations detected (empty if compliant). """ violations = self.check_event(event) - self._violations.extend(violations) self.update_state(event) return violations - - @property - def violations(self) -> list[ComplianceViolation]: - """Get all violations detected so far.""" - return list(self._violations) - - @property - def violation_count(self) -> int: - """Get the total number of violations detected.""" - return len(self._violations) - - def clear_violations(self) -> None: - """Clear the violations history.""" - self._violations.clear() diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties.py deleted file mode 100644 index e2ec0f200f..0000000000 --- a/openhands-sdk/openhands/sdk/conversation/compliance/properties.py +++ /dev/null @@ -1,183 +0,0 @@ -"""Concrete implementations of API compliance properties. - -Each property corresponds to one or more API compliance patterns: -- InterleavedMessageProperty: a01, a03, a04, a07 -- UnmatchedToolResultProperty: a02, a06 -- DuplicateToolResultProperty: a05 -- ToolResultOrderProperty: a08 -""" - -from openhands.sdk.conversation.compliance.base import ( - APICompliancePropertyBase, - ComplianceState, - ComplianceViolation, -) -from openhands.sdk.event import ActionEvent, Event, MessageEvent, ObservationBaseEvent - - -class InterleavedMessageProperty(APICompliancePropertyBase): - """Detects messages interleaved between tool_use and tool_result. - - Violations: - - User or assistant message arrives while tool calls are pending. - - Corresponds to patterns: a01 (unmatched tool_use), a03 (interleaved user), - a04 (interleaved assistant), a07 (parallel missing result). - """ - - @property - def name(self) -> str: - return "interleaved_message" - - def check( - self, - event: Event, - state: ComplianceState, - ) -> ComplianceViolation | None: - # Only check MessageEvent - if not isinstance(event, MessageEvent): - return None - - # Violation if there are pending tool calls - if state.pending_tool_call_ids: - pending_ids = list(state.pending_tool_call_ids.keys()) - return ComplianceViolation( - property_name=self.name, - event_id=event.id, - description=( - f"Message event interleaved with {len(pending_ids)} pending " - f"tool call(s)" - ), - context={"pending_tool_call_ids": pending_ids}, - ) - - return None - - def update_state(self, event: Event, state: ComplianceState) -> None: - """Track pending and completed tool calls.""" - if isinstance(event, ActionEvent): - state.pending_tool_call_ids[event.tool_call_id] = event.id - elif isinstance(event, ObservationBaseEvent): - # Move from pending to completed - state.pending_tool_call_ids.pop(event.tool_call_id, None) - state.completed_tool_call_ids.add(event.tool_call_id) - - -class UnmatchedToolResultProperty(APICompliancePropertyBase): - """Detects tool results that reference unknown tool_call_ids. - - Violations: - - Tool result references a tool_call_id that was never seen. - - Corresponds to patterns: a02 (unmatched tool_result), a06 (wrong tool_call_id). - """ - - @property - def name(self) -> str: - return "unmatched_tool_result" - - def check( - self, - event: Event, - state: ComplianceState, - ) -> ComplianceViolation | None: - if not isinstance(event, ObservationBaseEvent): - return None - - # Check if tool_call_id was ever seen - if event.tool_call_id not in state.all_tool_call_ids: - return ComplianceViolation( - property_name=self.name, - event_id=event.id, - description=( - f"Tool result references unknown tool_call_id: {event.tool_call_id}" - ), - context={"tool_call_id": event.tool_call_id}, - ) - - return None - - -class DuplicateToolResultProperty(APICompliancePropertyBase): - """Detects duplicate tool results for the same tool_call_id. - - Violations: - - Tool result arrives for a tool_call_id that already has a result. - - Corresponds to pattern: a05 (duplicate tool_call_id). - """ - - @property - def name(self) -> str: - return "duplicate_tool_result" - - def check( - self, - event: Event, - state: ComplianceState, - ) -> ComplianceViolation | None: - if not isinstance(event, ObservationBaseEvent): - return None - - # Check if this tool_call_id already has a result - if event.tool_call_id in state.completed_tool_call_ids: - return ComplianceViolation( - property_name=self.name, - event_id=event.id, - description=( - f"Duplicate tool result for tool_call_id: {event.tool_call_id}" - ), - context={"tool_call_id": event.tool_call_id}, - ) - - return None - - -class ToolResultOrderProperty(APICompliancePropertyBase): - """Detects tool results that arrive before their corresponding actions. - - Violations: - - Tool result arrives before any action with that tool_call_id. - - Corresponds to pattern: a08 (parallel wrong order). - - Note: This is similar to UnmatchedToolResultProperty but semantically - different - here the action may arrive later, whereas unmatched means - the action never existed. - """ - - @property - def name(self) -> str: - return "tool_result_order" - - def check( - self, - event: Event, - state: ComplianceState, - ) -> ComplianceViolation | None: - if not isinstance(event, ObservationBaseEvent): - return None - - # Check if we've seen an action with this tool_call_id. - # all_tool_call_ids combines pending and completed, so a single check suffices. - if event.tool_call_id not in state.all_tool_call_ids: - return ComplianceViolation( - property_name=self.name, - event_id=event.id, - description=( - f"Tool result arrived before action for tool_call_id: " - f"{event.tool_call_id}" - ), - context={"tool_call_id": event.tool_call_id}, - ) - - return None - - -# All properties in recommended check order -ALL_COMPLIANCE_PROPERTIES: list[APICompliancePropertyBase] = [ - ToolResultOrderProperty(), - UnmatchedToolResultProperty(), - DuplicateToolResultProperty(), - InterleavedMessageProperty(), -] diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/__init__.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/__init__.py new file mode 100644 index 0000000000..dda8670408 --- /dev/null +++ b/openhands-sdk/openhands/sdk/conversation/compliance/properties/__init__.py @@ -0,0 +1,39 @@ +"""Concrete implementations of API compliance properties. + +Each property corresponds to one or more API compliance patterns: +- InterleavedMessageProperty: a01, a03, a04, a07 +- UnmatchedToolResultProperty: a02, a06 +- DuplicateToolResultProperty: a05 +- ToolResultOrderProperty: a08 +""" + +from openhands.sdk.conversation.compliance.base import APICompliancePropertyBase +from openhands.sdk.conversation.compliance.properties.duplicate_tool_result import ( + DuplicateToolResultProperty, +) +from openhands.sdk.conversation.compliance.properties.interleaved_message import ( + InterleavedMessageProperty, +) +from openhands.sdk.conversation.compliance.properties.tool_result_order import ( + ToolResultOrderProperty, +) +from openhands.sdk.conversation.compliance.properties.unmatched_tool_result import ( + UnmatchedToolResultProperty, +) + + +# All properties in recommended check order +ALL_COMPLIANCE_PROPERTIES: list[APICompliancePropertyBase] = [ + ToolResultOrderProperty(), + UnmatchedToolResultProperty(), + DuplicateToolResultProperty(), + InterleavedMessageProperty(), +] + +__all__ = [ + "ALL_COMPLIANCE_PROPERTIES", + "DuplicateToolResultProperty", + "InterleavedMessageProperty", + "ToolResultOrderProperty", + "UnmatchedToolResultProperty", +] diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/duplicate_tool_result.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/duplicate_tool_result.py new file mode 100644 index 0000000000..dda47f2463 --- /dev/null +++ b/openhands-sdk/openhands/sdk/conversation/compliance/properties/duplicate_tool_result.py @@ -0,0 +1,43 @@ +"""Duplicate tool result property for API compliance monitoring.""" + +from openhands.sdk.conversation.compliance.base import ( + APICompliancePropertyBase, + ComplianceState, + ComplianceViolation, +) +from openhands.sdk.event import LLMConvertibleEvent, ObservationBaseEvent + + +class DuplicateToolResultProperty(APICompliancePropertyBase): + """Detects duplicate tool results for the same tool_call_id. + + Violations: + - Tool result arrives for a tool_call_id that already has a result. + + Corresponds to pattern: a05 (duplicate tool_call_id). + """ + + @property + def name(self) -> str: + return "duplicate_tool_result" + + def check( + self, + event: LLMConvertibleEvent, + state: ComplianceState, + ) -> ComplianceViolation | None: + if not isinstance(event, ObservationBaseEvent): + return None + + # Check if this tool_call_id already has a result + if event.tool_call_id in state.completed_tool_call_ids: + return ComplianceViolation( + property_name=self.name, + event_id=event.id, + description=( + f"Duplicate tool result for tool_call_id: {event.tool_call_id}" + ), + context={"tool_call_id": event.tool_call_id}, + ) + + return None diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/interleaved_message.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/interleaved_message.py new file mode 100644 index 0000000000..b5c17fdb2e --- /dev/null +++ b/openhands-sdk/openhands/sdk/conversation/compliance/properties/interleaved_message.py @@ -0,0 +1,61 @@ +"""Interleaved message property for API compliance monitoring.""" + +from openhands.sdk.conversation.compliance.base import ( + APICompliancePropertyBase, + ComplianceState, + ComplianceViolation, +) +from openhands.sdk.event import ( + ActionEvent, + LLMConvertibleEvent, + MessageEvent, + ObservationBaseEvent, +) + + +class InterleavedMessageProperty(APICompliancePropertyBase): + """Detects messages interleaved between tool_use and tool_result. + + Violations: + - User or assistant message arrives while tool calls are pending. + + Corresponds to patterns: a01 (unmatched tool_use), a03 (interleaved user), + a04 (interleaved assistant), a07 (parallel missing result). + """ + + @property + def name(self) -> str: + return "interleaved_message" + + def check( + self, + event: LLMConvertibleEvent, + state: ComplianceState, + ) -> ComplianceViolation | None: + # Only check MessageEvent + if not isinstance(event, MessageEvent): + return None + + # Violation if there are pending tool calls + if state.pending_tool_call_ids: + pending_ids = list(state.pending_tool_call_ids.keys()) + return ComplianceViolation( + property_name=self.name, + event_id=event.id, + description=( + f"Message event interleaved with {len(pending_ids)} pending " + f"tool call(s)" + ), + context={"pending_tool_call_ids": pending_ids}, + ) + + return None + + def update_state(self, event: LLMConvertibleEvent, state: ComplianceState) -> None: + """Track pending and completed tool calls.""" + if isinstance(event, ActionEvent): + state.pending_tool_call_ids[event.tool_call_id] = event.id + elif isinstance(event, ObservationBaseEvent): + # Move from pending to completed + state.pending_tool_call_ids.pop(event.tool_call_id, None) + state.completed_tool_call_ids.add(event.tool_call_id) diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/tool_result_order.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/tool_result_order.py new file mode 100644 index 0000000000..e42bf231db --- /dev/null +++ b/openhands-sdk/openhands/sdk/conversation/compliance/properties/tool_result_order.py @@ -0,0 +1,49 @@ +"""Tool result order property for API compliance monitoring.""" + +from openhands.sdk.conversation.compliance.base import ( + APICompliancePropertyBase, + ComplianceState, + ComplianceViolation, +) +from openhands.sdk.event import LLMConvertibleEvent, ObservationBaseEvent + + +class ToolResultOrderProperty(APICompliancePropertyBase): + """Detects tool results that arrive before their corresponding actions. + + Violations: + - Tool result arrives before any action with that tool_call_id. + + Corresponds to pattern: a08 (parallel wrong order). + + Note: This is similar to UnmatchedToolResultProperty but semantically + different - here the action may arrive later, whereas unmatched means + the action never existed. + """ + + @property + def name(self) -> str: + return "tool_result_order" + + def check( + self, + event: LLMConvertibleEvent, + state: ComplianceState, + ) -> ComplianceViolation | None: + if not isinstance(event, ObservationBaseEvent): + return None + + # Check if we've seen an action with this tool_call_id. + # all_tool_call_ids combines pending and completed, so a single check suffices. + if event.tool_call_id not in state.all_tool_call_ids: + return ComplianceViolation( + property_name=self.name, + event_id=event.id, + description=( + f"Tool result arrived before action for tool_call_id: " + f"{event.tool_call_id}" + ), + context={"tool_call_id": event.tool_call_id}, + ) + + return None diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/unmatched_tool_result.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/unmatched_tool_result.py new file mode 100644 index 0000000000..d4f9356a9d --- /dev/null +++ b/openhands-sdk/openhands/sdk/conversation/compliance/properties/unmatched_tool_result.py @@ -0,0 +1,43 @@ +"""Unmatched tool result property for API compliance monitoring.""" + +from openhands.sdk.conversation.compliance.base import ( + APICompliancePropertyBase, + ComplianceState, + ComplianceViolation, +) +from openhands.sdk.event import LLMConvertibleEvent, ObservationBaseEvent + + +class UnmatchedToolResultProperty(APICompliancePropertyBase): + """Detects tool results that reference unknown tool_call_ids. + + Violations: + - Tool result references a tool_call_id that was never seen. + + Corresponds to patterns: a02 (unmatched tool_result), a06 (wrong tool_call_id). + """ + + @property + def name(self) -> str: + return "unmatched_tool_result" + + def check( + self, + event: LLMConvertibleEvent, + state: ComplianceState, + ) -> ComplianceViolation | None: + if not isinstance(event, ObservationBaseEvent): + return None + + # Check if tool_call_id was ever seen + if event.tool_call_id not in state.all_tool_call_ids: + return ComplianceViolation( + property_name=self.name, + event_id=event.id, + description=( + f"Tool result references unknown tool_call_id: {event.tool_call_id}" + ), + context={"tool_call_id": event.tool_call_id}, + ) + + return None diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index 123bd30d30..e37ccf5335 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -5,18 +5,24 @@ from collections.abc import Sequence from enum import Enum from pathlib import Path -from typing import TYPE_CHECKING, Any, Self +from typing import Any, Self from pydantic import Field, PrivateAttr, model_validator from openhands.sdk.agent.base import AgentBase +from openhands.sdk.conversation.compliance import APIComplianceMonitor from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.conversation.event_store import EventLog from openhands.sdk.conversation.fifo_lock import FIFOLock from openhands.sdk.conversation.persistence_const import BASE_STATE, EVENTS_DIR from openhands.sdk.conversation.secret_registry import SecretRegistry from openhands.sdk.conversation.types import ConversationCallbackType, ConversationID -from openhands.sdk.event import ActionEvent, ObservationEvent, UserRejectObservation +from openhands.sdk.event import ( + ActionEvent, + LLMConvertibleEvent, + ObservationEvent, + UserRejectObservation, +) from openhands.sdk.event.base import Event from openhands.sdk.event.types import EventID from openhands.sdk.io import FileStore, InMemoryFileStore, LocalFileStore @@ -32,10 +38,6 @@ from openhands.sdk.workspace.base import BaseWorkspace -if TYPE_CHECKING: - from openhands.sdk.conversation.compliance import APIComplianceMonitor - - logger = get_logger(__name__) @@ -525,8 +527,6 @@ def compliance_monitor(self) -> APIComplianceMonitor: The monitor is lazily initialized on first access. """ if self._compliance_monitor is None: - from openhands.sdk.conversation.compliance import APIComplianceMonitor - self._compliance_monitor = APIComplianceMonitor() return self._compliance_monitor @@ -537,8 +537,9 @@ def add_event(self, event: Event) -> None: Do not mutate the events list directly (e.g., via ``state.events.append()``), as this bypasses compliance monitoring and may cause silent failures. - It checks the event against API compliance properties and logs any - violations before adding the event to the event log. + For LLMConvertibleEvent instances, the event is checked against API + compliance properties and any violations are logged before adding to + the event log. Currently operates in observation mode: violations are logged but events are still processed. Future versions may support per-property @@ -547,8 +548,9 @@ def add_event(self, event: Event) -> None: Args: event: The event to add to the conversation. """ - # Check for compliance violations (logs warnings if any) - self.compliance_monitor.process_event(event) + # Check for compliance violations only for LLM-convertible events + if isinstance(event, LLMConvertibleEvent): + self.compliance_monitor.process_event(event) # Add to event log regardless of violations (observation mode) self._events.append(event) diff --git a/tests/sdk/conversation/compliance/test_monitor.py b/tests/sdk/conversation/compliance/test_monitor.py index 7cdf55b82e..fd5749cf72 100644 --- a/tests/sdk/conversation/compliance/test_monitor.py +++ b/tests/sdk/conversation/compliance/test_monitor.py @@ -15,21 +15,25 @@ def test_monitor_no_violations_normal_flow(): """Normal conversation flow should have no violations.""" monitor = APIComplianceMonitor() + all_violations: list = [] # Normal flow: action -> observation -> message action = make_action_event(tool_call_id="call_1") violations = monitor.process_event(action) + all_violations.extend(violations) assert len(violations) == 0 obs = make_observation_event(action) violations = monitor.process_event(obs) + all_violations.extend(violations) assert len(violations) == 0 user_msg = make_user_message_event() violations = monitor.process_event(user_msg) + all_violations.extend(violations) assert len(violations) == 0 - assert monitor.violation_count == 0 + assert len(all_violations) == 0 def test_monitor_detects_interleaved_message(): @@ -45,7 +49,6 @@ def test_monitor_detects_interleaved_message(): assert len(violations) == 1 assert violations[0].property_name == "interleaved_message" - assert monitor.violation_count == 1 def test_monitor_detects_multiple_violations(): @@ -63,38 +66,27 @@ def test_monitor_detects_multiple_violations(): assert "unmatched_tool_result" in property_names -def test_monitor_tracks_violations_history(): - """Monitor should accumulate violations over time.""" +def test_monitor_returns_violations_per_call(): + """Monitor returns violations for each call, caller can accumulate.""" monitor = APIComplianceMonitor() + all_violations: list = [] # First violation action = make_action_event(tool_call_id="call_1") - monitor.process_event(action) - monitor.process_event(make_user_message_event()) # interleaved + all_violations.extend(monitor.process_event(action)) + violations = monitor.process_event(make_user_message_event()) # interleaved + all_violations.extend(violations) - initial_count = monitor.violation_count + initial_count = len(all_violations) assert initial_count > 0 # Second violation - monitor.process_event(make_orphan_observation_event(tool_call_id="unknown")) - - assert monitor.violation_count > initial_count - assert len(monitor.violations) == monitor.violation_count - - -def test_monitor_clear_violations(): - """Monitor should allow clearing violation history.""" - monitor = APIComplianceMonitor() - - action = make_action_event(tool_call_id="call_1") - monitor.process_event(action) - monitor.process_event(make_user_message_event()) - - assert monitor.violation_count > 0 + violations = monitor.process_event( + make_orphan_observation_event(tool_call_id="unknown") + ) + all_violations.extend(violations) - monitor.clear_violations() - assert monitor.violation_count == 0 - assert len(monitor.violations) == 0 + assert len(all_violations) > initial_count def test_monitor_custom_properties(): diff --git a/tests/sdk/conversation/compliance/test_state_integration.py b/tests/sdk/conversation/compliance/test_state_integration.py index cade3f08fd..9af6200218 100644 --- a/tests/sdk/conversation/compliance/test_state_integration.py +++ b/tests/sdk/conversation/compliance/test_state_integration.py @@ -64,24 +64,29 @@ def test_add_event_lazy_creates_monitor(conversation_state): assert conversation_state._compliance_monitor is monitor -def test_add_event_checks_compliance(conversation_state): - """add_event() should check compliance and detect violations.""" +def test_add_event_checks_compliance(conversation_state, caplog): + """add_event() should check compliance and log violations.""" + import logging + # Add an action action = make_action_event(tool_call_id="call_1") conversation_state.add_event(action) # User message while action pending should create violation user_msg = make_user_message_event() - conversation_state.add_event(user_msg) - # Should have recorded violation - assert conversation_state.compliance_monitor.violation_count > 0 - violations = conversation_state.compliance_monitor.violations - assert any(v.property_name == "interleaved_message" for v in violations) + with caplog.at_level(logging.WARNING): + conversation_state.add_event(user_msg) + + # Should have logged violation + assert "interleaved_message" in caplog.text + assert "API compliance violation detected" in caplog.text -def test_add_event_normal_flow_no_violations(conversation_state): +def test_add_event_normal_flow_no_violations(conversation_state, caplog): """Normal conversation flow should have no violations.""" + import logging + # Normal flow: action -> observation -> user message action = make_action_event(tool_call_id="call_1") conversation_state.add_event(action) @@ -90,26 +95,32 @@ def test_add_event_normal_flow_no_violations(conversation_state): conversation_state.add_event(obs) user_msg = make_user_message_event() - conversation_state.add_event(user_msg) - # No violations - assert conversation_state.compliance_monitor.violation_count == 0 + with caplog.at_level(logging.WARNING): + conversation_state.add_event(user_msg) + # No violations logged + assert "API compliance violation detected" not in caplog.text -def test_add_event_still_adds_on_violation(conversation_state): + +def test_add_event_still_adds_on_violation(conversation_state, caplog): """Events should still be added even when violations occur (observation mode).""" + import logging + action = make_action_event(tool_call_id="call_1") conversation_state.add_event(action) # User message while action pending - violation user_msg = make_user_message_event() - conversation_state.add_event(user_msg) + + with caplog.at_level(logging.WARNING): + conversation_state.add_event(user_msg) # Event should still be in the log assert conversation_state.events[-1].id == user_msg.id - # Violation should be recorded - assert conversation_state.compliance_monitor.violation_count > 0 + # Violation should be logged + assert "API compliance violation detected" in caplog.text def test_add_event_tracks_state_correctly(conversation_state): From ce3213b9f16b0dfb65af3e44335dac71b1345dfc Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 27 Feb 2026 20:29:09 +0000 Subject: [PATCH 04/13] Add error handling for observation mode robustness - Wrap process_event() call in state.py with try-except to ensure events are always added regardless of compliance check failures - Add try-except in monitor.py check_event() and update_state() to handle exceptions from individual properties gracefully - Add test_monitor_handles_buggy_property_gracefully test to verify buggy properties don't crash the monitor This ensures observation mode is robust: events are always processed even if compliance checking fails. Co-authored-by: openhands --- .../sdk/conversation/compliance/monitor.py | 34 ++++++++++++----- .../openhands/sdk/conversation/state.py | 7 +++- .../conversation/compliance/test_monitor.py | 37 +++++++++++++++++++ 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py index f84358c341..e6f4d4f150 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py @@ -57,14 +57,22 @@ def check_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation]: violations: list[ComplianceViolation] = [] for prop in self.properties: - violation = prop.check(event, self.state) - if violation is not None: - violations.append(violation) - logger.warning( - "API compliance violation detected: %s - %s (event_id=%s)", - violation.property_name, - violation.description, - violation.event_id, + try: + violation = prop.check(event, self.state) + if violation is not None: + violations.append(violation) + logger.warning( + "API compliance violation detected: %s - %s (event_id=%s)", + violation.property_name, + violation.description, + violation.event_id, + ) + except Exception as e: + logger.exception( + "Error checking compliance property %s for event %s: %s", + prop.name, + event.id, + e, ) return violations @@ -78,7 +86,15 @@ def update_state(self, event: LLMConvertibleEvent) -> None: event: The event that was just processed. """ for prop in self.properties: - prop.update_state(event, self.state) + try: + prop.update_state(event, self.state) + except Exception as e: + logger.exception( + "Error updating state for property %s on event %s: %s", + prop.name, + event.id, + e, + ) def process_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation]: """Check an event and update state. diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index e37ccf5335..89e612e165 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -550,7 +550,12 @@ def add_event(self, event: Event) -> None: """ # Check for compliance violations only for LLM-convertible events if isinstance(event, LLMConvertibleEvent): - self.compliance_monitor.process_event(event) + try: + self.compliance_monitor.process_event(event) + except Exception as e: + logger.exception( + "Error checking compliance for event %s: %s", event.id, e + ) # Add to event log regardless of violations (observation mode) self._events.append(event) diff --git a/tests/sdk/conversation/compliance/test_monitor.py b/tests/sdk/conversation/compliance/test_monitor.py index fd5749cf72..55e5a34cd9 100644 --- a/tests/sdk/conversation/compliance/test_monitor.py +++ b/tests/sdk/conversation/compliance/test_monitor.py @@ -4,6 +4,12 @@ APIComplianceMonitor, InterleavedMessageProperty, ) +from openhands.sdk.conversation.compliance.base import ( + APICompliancePropertyBase, + ComplianceState, + ComplianceViolation, +) +from openhands.sdk.event import LLMConvertibleEvent from tests.sdk.conversation.compliance.conftest import ( make_action_event, make_observation_event, @@ -147,3 +153,34 @@ def test_monitor_parallel_tool_calls(): violations = monitor.process_event(make_user_message_event()) assert len(violations) == 1 assert "call_paris" in str(violations[0].context) + + +def test_monitor_handles_buggy_property_gracefully(): + """Monitor should handle exceptions in property checks gracefully. + + A buggy property that raises an exception should not crash the monitor. + Instead, the exception is logged and processing continues. This ensures + observation mode is robust against buggy properties. + """ + + class BuggyProperty(APICompliancePropertyBase): + @property + def name(self) -> str: + return "buggy" + + def check( + self, + event: LLMConvertibleEvent, + state: ComplianceState, + ) -> ComplianceViolation | None: + raise ValueError("Oops!") + + monitor = APIComplianceMonitor(properties=[BuggyProperty()]) + + # Should not raise - the exception should be caught and logged + # We expect an empty list because the buggy property doesn't return a violation + # (it raises instead) + violations = monitor.process_event(make_user_message_event()) + + # The monitor should continue working despite the error + assert violations == [] From c087f2efa909d482d7017bcbdd25c3405a7dcc7f Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 27 Feb 2026 20:48:13 +0000 Subject: [PATCH 05/13] Collapse compliance properties into single monitor method Refactor the compliance monitoring system to use a simpler, more elegant design based on the insight that all checks can be framed as 'what events are allowed right now': - If we have pending tool calls, only matching observations are allowed - Messages cannot interleave with pending tool calls - Tool results must reference known tool_call_ids Changes: - Remove separate property classes (InterleavedMessageProperty, etc.) - Remove APICompliancePropertyBase abstract class - Remove properties/ module entirely - Add _check_tool_call_sequence method to APIComplianceMonitor - Simplify ComplianceState (remove all_tool_call_ids - derived when needed) This eliminates the duplicate check issue where ToolResultOrderProperty and UnmatchedToolResultProperty performed identical checks with different labels. One method now handles all 8 API compliance patterns (a01-a08). Co-authored-by: openhands --- .../sdk/conversation/compliance/__init__.py | 25 +- .../sdk/conversation/compliance/base.py | 65 +--- .../sdk/conversation/compliance/monitor.py | 191 +++++++---- .../compliance/properties/__init__.py | 39 --- .../properties/duplicate_tool_result.py | 43 --- .../properties/interleaved_message.py | 61 ---- .../properties/tool_result_order.py | 49 --- .../properties/unmatched_tool_result.py | 43 --- .../conversation/compliance/test_monitor.py | 89 ++--- .../compliance/test_properties.py | 313 +++++++----------- .../compliance/test_state_integration.py | 1 - 11 files changed, 281 insertions(+), 638 deletions(-) delete mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/__init__.py delete mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/duplicate_tool_result.py delete mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/interleaved_message.py delete mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/tool_result_order.py delete mode 100644 openhands-sdk/openhands/sdk/conversation/compliance/properties/unmatched_tool_result.py diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py index 9950a71f38..823a1ce5b8 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py @@ -1,33 +1,24 @@ """API Compliance monitoring for conversation events. -This module provides monitors that detect violations of LLM API requirements -in the event stream. Violations are currently logged for data gathering; -future versions may support per-property reconciliation strategies. +This module provides an APIComplianceMonitor that detects violations of LLM API +requirements in the event stream. Violations are currently logged for data +gathering; future versions may support reconciliation strategies. + +The monitor enforces valid tool-call sequences: +- When tool calls are pending, only matching observations are allowed +- Messages cannot interleave with pending tool calls +- Tool results must reference known tool_call_ids """ from openhands.sdk.conversation.compliance.base import ( - APICompliancePropertyBase, ComplianceState, ComplianceViolation, ) from openhands.sdk.conversation.compliance.monitor import APIComplianceMonitor -from openhands.sdk.conversation.compliance.properties import ( - ALL_COMPLIANCE_PROPERTIES, - DuplicateToolResultProperty, - InterleavedMessageProperty, - ToolResultOrderProperty, - UnmatchedToolResultProperty, -) __all__ = [ - "ALL_COMPLIANCE_PROPERTIES", - "APICompliancePropertyBase", "APIComplianceMonitor", "ComplianceState", "ComplianceViolation", - "DuplicateToolResultProperty", - "InterleavedMessageProperty", - "ToolResultOrderProperty", - "UnmatchedToolResultProperty", ] diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/base.py b/openhands-sdk/openhands/sdk/conversation/compliance/base.py index 32cfe5702d..57350f5614 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/base.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/base.py @@ -1,9 +1,7 @@ """Base classes for API compliance monitoring.""" -from abc import ABC, abstractmethod from dataclasses import dataclass, field -from openhands.sdk.event import LLMConvertibleEvent from openhands.sdk.event.types import EventID, ToolCallID @@ -26,10 +24,11 @@ class ComplianceViolation: @dataclass class ComplianceState: - """Shared state for tracking API compliance across properties. + """Shared state for tracking API compliance. - This state is updated as events are processed and provides the context - needed by individual properties to detect violations. + Tracks the tool call lifecycle to detect violations: + - pending_tool_call_ids: Actions awaiting results + - completed_tool_call_ids: Actions that have received results Attributes: pending_tool_call_ids: Tool calls that have been made but not yet @@ -40,59 +39,3 @@ class ComplianceState: pending_tool_call_ids: dict[ToolCallID, EventID] = field(default_factory=dict) completed_tool_call_ids: set[ToolCallID] = field(default_factory=set) - - @property - def all_tool_call_ids(self) -> set[ToolCallID]: - """All tool_call_ids seen so far (pending or completed). - - This is derived from pending_tool_call_ids (dict keys) and - completed_tool_call_ids to avoid maintaining redundant state. - """ - return set(self.pending_tool_call_ids) | self.completed_tool_call_ids - - -class APICompliancePropertyBase(ABC): - """Base class for API compliance properties. - - Each property represents a single compliance rule that LLM APIs expect. - Properties check incoming events for violations and update shared state. - - The design allows for future per-property reconciliation strategies - while currently defaulting to logging violations. - """ - - @property - @abstractmethod - def name(self) -> str: - """Unique identifier for this property. - - Used in violation reports and logging. - """ - - @abstractmethod - def check( - self, - event: LLMConvertibleEvent, - state: ComplianceState, - ) -> ComplianceViolation | None: - """Check if an event violates this property. - - Args: - event: The event about to be added to the conversation. - state: Current compliance state for context. - - Returns: - A ComplianceViolation if the event violates this property, - None otherwise. - """ - - def update_state(self, event: LLMConvertibleEvent, state: ComplianceState) -> None: - """Update compliance state after an event is processed. - - Override this method if the property needs to track state. - Default implementation does nothing. - - Args: - event: The event that was just processed. - state: The compliance state to update. - """ diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py index e6f4d4f150..241b7b5bee 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py @@ -1,12 +1,15 @@ """API Compliance Monitor that checks events before adding to conversation.""" from openhands.sdk.conversation.compliance.base import ( - APICompliancePropertyBase, ComplianceState, ComplianceViolation, ) -from openhands.sdk.conversation.compliance.properties import ALL_COMPLIANCE_PROPERTIES -from openhands.sdk.event import LLMConvertibleEvent +from openhands.sdk.event import ( + ActionEvent, + LLMConvertibleEvent, + MessageEvent, + ObservationBaseEvent, +) from openhands.sdk.logger import get_logger @@ -16,92 +19,114 @@ class APIComplianceMonitor: """Monitors events for API compliance violations. - This monitor checks incoming events against a set of compliance properties - and logs any violations. Currently operates in observation mode (violations - are logged but events are still processed). Future versions may support - per-property reconciliation strategies. + Enforces valid tool-call sequences by checking what events are allowed + given current state. The key invariant: when tool calls are pending, + only matching observations are allowed. + + State machine: + - IDLE (no pending calls): Messages and new actions allowed + - TOOL_CALLING (pending calls): Only matching observations allowed + + Currently operates in observation mode (violations are logged but events + are still processed). Attributes: - state: Shared compliance state tracking tool calls. - properties: List of compliance properties to check. + state: Compliance state tracking pending/completed tool calls. """ - def __init__( - self, - properties: list[APICompliancePropertyBase] | None = None, - ) -> None: - """Initialize the compliance monitor. - - Args: - properties: Compliance properties to check. If None, uses all - default properties. - """ + def __init__(self) -> None: + """Initialize the compliance monitor.""" self.state = ComplianceState() - self.properties = ( - properties if properties is not None else list(ALL_COMPLIANCE_PROPERTIES) - ) - def check_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation]: - """Check an event for compliance violations. + def _check_tool_call_sequence( + self, event: LLMConvertibleEvent + ) -> ComplianceViolation | None: + """Check if an event violates the tool-call sequence property. + + The rule is simple: if we have pending tool calls, only matching + observations are allowed. This covers all 8 API compliance patterns: - This method checks the event against all properties and returns any - violations found. It does NOT update state - call process_event() - after adding the event to update tracking state. + - a01 (unmatched_tool_use): Message while calls pending + - a02 (unmatched_tool_result): Result with unknown ID + - a03 (interleaved_user_msg): User message while calls pending + - a04 (interleaved_asst_msg): Assistant message while calls pending + - a05 (duplicate_tool_call_id): Result for already-completed ID + - a06 (wrong_tool_call_id): Result with wrong/unknown ID + - a07 (parallel_missing_result): Message before all parallel results + - a08 (parallel_wrong_order): Result before action (unknown ID) Args: event: The event to check. Returns: - List of violations detected (empty if compliant). + A ComplianceViolation if the event violates the property, None otherwise. """ - violations: list[ComplianceViolation] = [] - - for prop in self.properties: - try: - violation = prop.check(event, self.state) - if violation is not None: - violations.append(violation) - logger.warning( - "API compliance violation detected: %s - %s (event_id=%s)", - violation.property_name, - violation.description, - violation.event_id, - ) - except Exception as e: - logger.exception( - "Error checking compliance property %s for event %s: %s", - prop.name, - event.id, - e, + # Actions are always allowed - they start or continue a tool-call batch + if isinstance(event, ActionEvent): + return None + + # Messages require no pending tool calls + if isinstance(event, MessageEvent): + if self.state.pending_tool_call_ids: + pending_ids = list(self.state.pending_tool_call_ids.keys()) + return ComplianceViolation( + property_name="interleaved_message", + event_id=event.id, + description=( + f"Message interleaved with {len(pending_ids)} pending " + f"tool call(s)" + ), + context={"pending_tool_call_ids": pending_ids}, + ) + return None + + # Observations must match a known tool_call_id + if isinstance(event, ObservationBaseEvent): + tool_call_id = event.tool_call_id + + # Check for valid match (pending) + if tool_call_id in self.state.pending_tool_call_ids: + return None # Valid - completes a pending call + + # Check for duplicate (already completed) + if tool_call_id in self.state.completed_tool_call_ids: + return ComplianceViolation( + property_name="duplicate_tool_result", + event_id=event.id, + description=( + f"Duplicate tool result for tool_call_id: {tool_call_id}" + ), + context={"tool_call_id": tool_call_id}, ) - return violations + # Unknown ID - orphan result (covers a02, a06, a08) + return ComplianceViolation( + property_name="unmatched_tool_result", + event_id=event.id, + description=( + f"Tool result references unknown tool_call_id: {tool_call_id}" + ), + context={"tool_call_id": tool_call_id}, + ) - def update_state(self, event: LLMConvertibleEvent) -> None: - """Update compliance state after an event is processed. + return None - Call this after the event has been added to the event log. + def _update_state(self, event: LLMConvertibleEvent) -> None: + """Update compliance state after processing an event. - Args: - event: The event that was just processed. + Tracks the tool-call lifecycle: + - ActionEvent: Add to pending + - ObservationBaseEvent: Move from pending to completed """ - for prop in self.properties: - try: - prop.update_state(event, self.state) - except Exception as e: - logger.exception( - "Error updating state for property %s on event %s: %s", - prop.name, - event.id, - e, - ) + if isinstance(event, ActionEvent): + self.state.pending_tool_call_ids[event.tool_call_id] = event.id + elif isinstance(event, ObservationBaseEvent): + # Move from pending to completed (if it was pending) + self.state.pending_tool_call_ids.pop(event.tool_call_id, None) + self.state.completed_tool_call_ids.add(event.tool_call_id) def process_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation]: - """Check an event and update state. - - This is a convenience method that combines check_event() and - update_state(). Use this when you want to check and track in - one call. + """Check an event for violations and update state. Args: event: The event to process. @@ -109,6 +134,32 @@ def process_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation] Returns: List of violations detected (empty if compliant). """ - violations = self.check_event(event) - self.update_state(event) + violations: list[ComplianceViolation] = [] + + try: + violation = self._check_tool_call_sequence(event) + if violation is not None: + violations.append(violation) + logger.warning( + "API compliance violation detected: %s - %s (event_id=%s)", + violation.property_name, + violation.description, + violation.event_id, + ) + except Exception as e: + logger.exception( + "Error checking compliance for event %s: %s", + event.id, + e, + ) + + try: + self._update_state(event) + except Exception as e: + logger.exception( + "Error updating compliance state for event %s: %s", + event.id, + e, + ) + return violations diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/__init__.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/__init__.py deleted file mode 100644 index dda8670408..0000000000 --- a/openhands-sdk/openhands/sdk/conversation/compliance/properties/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -"""Concrete implementations of API compliance properties. - -Each property corresponds to one or more API compliance patterns: -- InterleavedMessageProperty: a01, a03, a04, a07 -- UnmatchedToolResultProperty: a02, a06 -- DuplicateToolResultProperty: a05 -- ToolResultOrderProperty: a08 -""" - -from openhands.sdk.conversation.compliance.base import APICompliancePropertyBase -from openhands.sdk.conversation.compliance.properties.duplicate_tool_result import ( - DuplicateToolResultProperty, -) -from openhands.sdk.conversation.compliance.properties.interleaved_message import ( - InterleavedMessageProperty, -) -from openhands.sdk.conversation.compliance.properties.tool_result_order import ( - ToolResultOrderProperty, -) -from openhands.sdk.conversation.compliance.properties.unmatched_tool_result import ( - UnmatchedToolResultProperty, -) - - -# All properties in recommended check order -ALL_COMPLIANCE_PROPERTIES: list[APICompliancePropertyBase] = [ - ToolResultOrderProperty(), - UnmatchedToolResultProperty(), - DuplicateToolResultProperty(), - InterleavedMessageProperty(), -] - -__all__ = [ - "ALL_COMPLIANCE_PROPERTIES", - "DuplicateToolResultProperty", - "InterleavedMessageProperty", - "ToolResultOrderProperty", - "UnmatchedToolResultProperty", -] diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/duplicate_tool_result.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/duplicate_tool_result.py deleted file mode 100644 index dda47f2463..0000000000 --- a/openhands-sdk/openhands/sdk/conversation/compliance/properties/duplicate_tool_result.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Duplicate tool result property for API compliance monitoring.""" - -from openhands.sdk.conversation.compliance.base import ( - APICompliancePropertyBase, - ComplianceState, - ComplianceViolation, -) -from openhands.sdk.event import LLMConvertibleEvent, ObservationBaseEvent - - -class DuplicateToolResultProperty(APICompliancePropertyBase): - """Detects duplicate tool results for the same tool_call_id. - - Violations: - - Tool result arrives for a tool_call_id that already has a result. - - Corresponds to pattern: a05 (duplicate tool_call_id). - """ - - @property - def name(self) -> str: - return "duplicate_tool_result" - - def check( - self, - event: LLMConvertibleEvent, - state: ComplianceState, - ) -> ComplianceViolation | None: - if not isinstance(event, ObservationBaseEvent): - return None - - # Check if this tool_call_id already has a result - if event.tool_call_id in state.completed_tool_call_ids: - return ComplianceViolation( - property_name=self.name, - event_id=event.id, - description=( - f"Duplicate tool result for tool_call_id: {event.tool_call_id}" - ), - context={"tool_call_id": event.tool_call_id}, - ) - - return None diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/interleaved_message.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/interleaved_message.py deleted file mode 100644 index b5c17fdb2e..0000000000 --- a/openhands-sdk/openhands/sdk/conversation/compliance/properties/interleaved_message.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Interleaved message property for API compliance monitoring.""" - -from openhands.sdk.conversation.compliance.base import ( - APICompliancePropertyBase, - ComplianceState, - ComplianceViolation, -) -from openhands.sdk.event import ( - ActionEvent, - LLMConvertibleEvent, - MessageEvent, - ObservationBaseEvent, -) - - -class InterleavedMessageProperty(APICompliancePropertyBase): - """Detects messages interleaved between tool_use and tool_result. - - Violations: - - User or assistant message arrives while tool calls are pending. - - Corresponds to patterns: a01 (unmatched tool_use), a03 (interleaved user), - a04 (interleaved assistant), a07 (parallel missing result). - """ - - @property - def name(self) -> str: - return "interleaved_message" - - def check( - self, - event: LLMConvertibleEvent, - state: ComplianceState, - ) -> ComplianceViolation | None: - # Only check MessageEvent - if not isinstance(event, MessageEvent): - return None - - # Violation if there are pending tool calls - if state.pending_tool_call_ids: - pending_ids = list(state.pending_tool_call_ids.keys()) - return ComplianceViolation( - property_name=self.name, - event_id=event.id, - description=( - f"Message event interleaved with {len(pending_ids)} pending " - f"tool call(s)" - ), - context={"pending_tool_call_ids": pending_ids}, - ) - - return None - - def update_state(self, event: LLMConvertibleEvent, state: ComplianceState) -> None: - """Track pending and completed tool calls.""" - if isinstance(event, ActionEvent): - state.pending_tool_call_ids[event.tool_call_id] = event.id - elif isinstance(event, ObservationBaseEvent): - # Move from pending to completed - state.pending_tool_call_ids.pop(event.tool_call_id, None) - state.completed_tool_call_ids.add(event.tool_call_id) diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/tool_result_order.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/tool_result_order.py deleted file mode 100644 index e42bf231db..0000000000 --- a/openhands-sdk/openhands/sdk/conversation/compliance/properties/tool_result_order.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Tool result order property for API compliance monitoring.""" - -from openhands.sdk.conversation.compliance.base import ( - APICompliancePropertyBase, - ComplianceState, - ComplianceViolation, -) -from openhands.sdk.event import LLMConvertibleEvent, ObservationBaseEvent - - -class ToolResultOrderProperty(APICompliancePropertyBase): - """Detects tool results that arrive before their corresponding actions. - - Violations: - - Tool result arrives before any action with that tool_call_id. - - Corresponds to pattern: a08 (parallel wrong order). - - Note: This is similar to UnmatchedToolResultProperty but semantically - different - here the action may arrive later, whereas unmatched means - the action never existed. - """ - - @property - def name(self) -> str: - return "tool_result_order" - - def check( - self, - event: LLMConvertibleEvent, - state: ComplianceState, - ) -> ComplianceViolation | None: - if not isinstance(event, ObservationBaseEvent): - return None - - # Check if we've seen an action with this tool_call_id. - # all_tool_call_ids combines pending and completed, so a single check suffices. - if event.tool_call_id not in state.all_tool_call_ids: - return ComplianceViolation( - property_name=self.name, - event_id=event.id, - description=( - f"Tool result arrived before action for tool_call_id: " - f"{event.tool_call_id}" - ), - context={"tool_call_id": event.tool_call_id}, - ) - - return None diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/properties/unmatched_tool_result.py b/openhands-sdk/openhands/sdk/conversation/compliance/properties/unmatched_tool_result.py deleted file mode 100644 index d4f9356a9d..0000000000 --- a/openhands-sdk/openhands/sdk/conversation/compliance/properties/unmatched_tool_result.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Unmatched tool result property for API compliance monitoring.""" - -from openhands.sdk.conversation.compliance.base import ( - APICompliancePropertyBase, - ComplianceState, - ComplianceViolation, -) -from openhands.sdk.event import LLMConvertibleEvent, ObservationBaseEvent - - -class UnmatchedToolResultProperty(APICompliancePropertyBase): - """Detects tool results that reference unknown tool_call_ids. - - Violations: - - Tool result references a tool_call_id that was never seen. - - Corresponds to patterns: a02 (unmatched tool_result), a06 (wrong tool_call_id). - """ - - @property - def name(self) -> str: - return "unmatched_tool_result" - - def check( - self, - event: LLMConvertibleEvent, - state: ComplianceState, - ) -> ComplianceViolation | None: - if not isinstance(event, ObservationBaseEvent): - return None - - # Check if tool_call_id was ever seen - if event.tool_call_id not in state.all_tool_call_ids: - return ComplianceViolation( - property_name=self.name, - event_id=event.id, - description=( - f"Tool result references unknown tool_call_id: {event.tool_call_id}" - ), - context={"tool_call_id": event.tool_call_id}, - ) - - return None diff --git a/tests/sdk/conversation/compliance/test_monitor.py b/tests/sdk/conversation/compliance/test_monitor.py index 55e5a34cd9..5f2cf31346 100644 --- a/tests/sdk/conversation/compliance/test_monitor.py +++ b/tests/sdk/conversation/compliance/test_monitor.py @@ -1,15 +1,8 @@ """Tests for the APIComplianceMonitor.""" -from openhands.sdk.conversation.compliance import ( - APIComplianceMonitor, - InterleavedMessageProperty, -) -from openhands.sdk.conversation.compliance.base import ( - APICompliancePropertyBase, - ComplianceState, - ComplianceViolation, -) -from openhands.sdk.event import LLMConvertibleEvent +from unittest.mock import patch + +from openhands.sdk.conversation.compliance import APIComplianceMonitor from tests.sdk.conversation.compliance.conftest import ( make_action_event, make_observation_event, @@ -57,19 +50,17 @@ def test_monitor_detects_interleaved_message(): assert violations[0].property_name == "interleaved_message" -def test_monitor_detects_multiple_violations(): - """Monitor should detect multiple violations.""" +def test_monitor_detects_orphan_observation(): + """Monitor should detect orphan observation as single violation.""" monitor = APIComplianceMonitor() - # Orphan observation (unmatched + order violation) + # Orphan observation (unknown tool_call_id) orphan = make_orphan_observation_event(tool_call_id="call_unknown") violations = monitor.process_event(orphan) - # Should detect both tool_result_order and unmatched_tool_result - assert len(violations) >= 2 - property_names = {v.property_name for v in violations} - assert "tool_result_order" in property_names - assert "unmatched_tool_result" in property_names + # Should detect exactly one violation (unmatched_tool_result) + assert len(violations) == 1 + assert violations[0].property_name == "unmatched_tool_result" def test_monitor_returns_violations_per_call(): @@ -95,19 +86,6 @@ def test_monitor_returns_violations_per_call(): assert len(all_violations) > initial_count -def test_monitor_custom_properties(): - """Monitor can be initialized with custom properties.""" - # Only check interleaved messages - monitor = APIComplianceMonitor(properties=[InterleavedMessageProperty()]) - - # This would normally trigger tool_result_order violation - orphan = make_orphan_observation_event(tool_call_id="call_unknown") - violations = monitor.process_event(orphan) - - # But we only have InterleavedMessageProperty, so no violations - assert len(violations) == 0 - - def test_monitor_state_persists_across_events(): """Monitor state should persist correctly across events.""" monitor = APIComplianceMonitor() @@ -117,7 +95,6 @@ def test_monitor_state_persists_across_events(): monitor.process_event(action1) assert "call_1" in monitor.state.pending_tool_call_ids - assert "call_1" in monitor.state.all_tool_call_ids # Add observation obs1 = make_observation_event(action1) @@ -125,7 +102,6 @@ def test_monitor_state_persists_across_events(): assert "call_1" not in monitor.state.pending_tool_call_ids assert "call_1" in monitor.state.completed_tool_call_ids - assert "call_1" in monitor.state.all_tool_call_ids def test_monitor_parallel_tool_calls(): @@ -155,32 +131,35 @@ def test_monitor_parallel_tool_calls(): assert "call_paris" in str(violations[0].context) -def test_monitor_handles_buggy_property_gracefully(): - """Monitor should handle exceptions in property checks gracefully. +def test_monitor_handles_check_exception_gracefully(): + """Monitor should handle exceptions in check gracefully. - A buggy property that raises an exception should not crash the monitor. - Instead, the exception is logged and processing continues. This ensures - observation mode is robust against buggy properties. + If _check_tool_call_sequence raises an exception, it should be caught + and logged, not crash the monitor. This ensures observation mode is robust. """ + monitor = APIComplianceMonitor() - class BuggyProperty(APICompliancePropertyBase): - @property - def name(self) -> str: - return "buggy" + with patch.object( + monitor, "_check_tool_call_sequence", side_effect=ValueError("Oops!") + ): + # Should not raise - the exception should be caught and logged + violations = monitor.process_event(make_user_message_event()) - def check( - self, - event: LLMConvertibleEvent, - state: ComplianceState, - ) -> ComplianceViolation | None: - raise ValueError("Oops!") + # The monitor should continue working despite the error + assert violations == [] - monitor = APIComplianceMonitor(properties=[BuggyProperty()]) - # Should not raise - the exception should be caught and logged - # We expect an empty list because the buggy property doesn't return a violation - # (it raises instead) - violations = monitor.process_event(make_user_message_event()) +def test_monitor_handles_update_exception_gracefully(): + """Monitor should handle exceptions in state update gracefully. + + If _update_state raises an exception, it should be caught and logged, + not crash the monitor. + """ + monitor = APIComplianceMonitor() + + with patch.object(monitor, "_update_state", side_effect=ValueError("Oops!")): + # Should not raise - the exception should be caught and logged + violations = monitor.process_event(make_action_event()) - # The monitor should continue working despite the error - assert violations == [] + # The monitor should continue working + assert violations == [] diff --git a/tests/sdk/conversation/compliance/test_properties.py b/tests/sdk/conversation/compliance/test_properties.py index 7d8477f27a..15513f1a65 100644 --- a/tests/sdk/conversation/compliance/test_properties.py +++ b/tests/sdk/conversation/compliance/test_properties.py @@ -1,23 +1,19 @@ -"""Tests for individual API compliance properties. - -These tests correspond to the API compliance patterns in tests/integration/tests/a*.py: -- a01: Unmatched tool_use -> InterleavedMessageProperty -- a02: Unmatched tool_result -> UnmatchedToolResultProperty -- a03: Interleaved user message -> InterleavedMessageProperty -- a04: Interleaved assistant message -> InterleavedMessageProperty -- a05: Duplicate tool_call_id -> DuplicateToolResultProperty -- a06: Wrong tool_call_id -> UnmatchedToolResultProperty -- a07: Parallel missing result -> InterleavedMessageProperty -- a08: Parallel wrong order -> ToolResultOrderProperty +"""Tests for API compliance monitoring. + +These tests verify the monitor detects all 8 API compliance patterns from +tests/integration/tests/a*.py: + +- a01: Unmatched tool_use (message while calls pending) +- a02: Unmatched tool_result (result with unknown ID) +- a03: Interleaved user message (user message while calls pending) +- a04: Interleaved assistant message (assistant message while calls pending) +- a05: Duplicate tool_call_id (result for already-completed ID) +- a06: Wrong tool_call_id (result with wrong/unknown ID) +- a07: Parallel missing result (message before all parallel results) +- a08: Parallel wrong order (result before action) """ -from openhands.sdk.conversation.compliance import ( - ComplianceState, - DuplicateToolResultProperty, - InterleavedMessageProperty, - ToolResultOrderProperty, - UnmatchedToolResultProperty, -) +from openhands.sdk.conversation.compliance import APIComplianceMonitor from tests.sdk.conversation.compliance.conftest import ( make_action_event, make_assistant_message_event, @@ -28,263 +24,186 @@ # ============================================================================= -# InterleavedMessageProperty Tests -# Detects: a01, a03, a04, a07 +# Interleaved Message Violations (a01, a03, a04, a07) # ============================================================================= -def test_interleaved_no_violation_when_no_pending_actions(): +def test_no_violation_message_when_no_pending_actions(): """User message is fine when no tool calls are pending.""" - prop = InterleavedMessageProperty() - state = ComplianceState() + monitor = APIComplianceMonitor() user_msg = make_user_message_event() - violation = prop.check(user_msg, state) + violations = monitor.process_event(user_msg) - assert violation is None + assert len(violations) == 0 -def test_interleaved_violation_user_message_with_pending_action(): +def test_violation_user_message_with_pending_action(): """User message while action is pending violates the property (a01/a03).""" - prop = InterleavedMessageProperty() - state = ComplianceState() + monitor = APIComplianceMonitor() - # Simulate an action that's pending + # Add an action (now pending) action = make_action_event(tool_call_id="call_123") - state.pending_tool_call_ids["call_123"] = action.id + monitor.process_event(action) + # User message before observation - violation user_msg = make_user_message_event() - violation = prop.check(user_msg, state) + violations = monitor.process_event(user_msg) - assert violation is not None - assert violation.property_name == "interleaved_message" - assert "pending" in violation.description.lower() - assert "call_123" in str(violation.context) + assert len(violations) == 1 + assert violations[0].property_name == "interleaved_message" + assert "pending" in violations[0].description.lower() + assert "call_123" in str(violations[0].context) -def test_interleaved_violation_assistant_message_with_pending_action(): +def test_violation_assistant_message_with_pending_action(): """Assistant message while action is pending violates the property (a04).""" - prop = InterleavedMessageProperty() - state = ComplianceState() + monitor = APIComplianceMonitor() action = make_action_event(tool_call_id="call_456") - state.pending_tool_call_ids["call_456"] = action.id + monitor.process_event(action) assistant_msg = make_assistant_message_event() - violation = prop.check(assistant_msg, state) + violations = monitor.process_event(assistant_msg) - assert violation is not None - assert violation.property_name == "interleaved_message" + assert len(violations) == 1 + assert violations[0].property_name == "interleaved_message" -def test_interleaved_violation_parallel_missing_result(): +def test_violation_parallel_missing_result(): """User message with partial parallel results violates property (a07).""" - prop = InterleavedMessageProperty() - state = ComplianceState() + monitor = APIComplianceMonitor() - # Simulate 3 parallel actions, only 2 have results - state.pending_tool_call_ids["call_sf"] = "action_sf" - state.pending_tool_call_ids["call_tokyo"] = "action_tokyo" - state.pending_tool_call_ids["call_paris"] = "action_paris" + # Three parallel actions + action1 = make_action_event(tool_call_id="call_sf") + action2 = make_action_event(tool_call_id="call_tokyo") + action3 = make_action_event(tool_call_id="call_paris") + for action in [action1, action2, action3]: + monitor.process_event(action) # Two results arrive - state.pending_tool_call_ids.pop("call_sf") - state.completed_tool_call_ids.add("call_sf") - state.pending_tool_call_ids.pop("call_tokyo") - state.completed_tool_call_ids.add("call_tokyo") + monitor.process_event(make_observation_event(action1)) + monitor.process_event(make_observation_event(action2)) # call_paris is still pending user_msg = make_user_message_event("What about Paris?") - violation = prop.check(user_msg, state) + violations = monitor.process_event(user_msg) - assert violation is not None - assert "call_paris" in str(violation.context) + assert len(violations) == 1 + assert "call_paris" in str(violations[0].context) -def test_interleaved_no_violation_for_action_events(): - """ActionEvent itself doesn't trigger interleaved violation.""" - prop = InterleavedMessageProperty() - state = ComplianceState() +def test_no_violation_for_action_events(): + """ActionEvent itself doesn't trigger violation (always allowed).""" + monitor = APIComplianceMonitor() # Even with pending actions, a new action is fine - state.pending_tool_call_ids["call_existing"] = "action_existing" - action = make_action_event() + action1 = make_action_event(tool_call_id="call_existing") + monitor.process_event(action1) + + action2 = make_action_event(tool_call_id="call_new") + violations = monitor.process_event(action2) - violation = prop.check(action, state) - assert violation is None + assert len(violations) == 0 -def test_interleaved_no_violation_for_observation_events(): - """ObservationEvent doesn't trigger interleaved violation.""" - prop = InterleavedMessageProperty() - state = ComplianceState() +def test_no_violation_for_matching_observation(): + """ObservationEvent matching a pending action is allowed.""" + monitor = APIComplianceMonitor() action = make_action_event(tool_call_id="call_789") - state.pending_tool_call_ids["call_789"] = action.id + monitor.process_event(action) obs = make_observation_event(action) - violation = prop.check(obs, state) + violations = monitor.process_event(obs) - assert violation is None + assert len(violations) == 0 # ============================================================================= -# UnmatchedToolResultProperty Tests -# Detects: a02, a06 +# Unmatched Tool Result Violations (a02, a06, a08) # ============================================================================= -def test_unmatched_result_no_violation_when_action_exists(): - """Tool result is fine when its tool_call_id exists.""" - prop = UnmatchedToolResultProperty() - state = ComplianceState() +def test_no_violation_when_action_exists(): + """Tool result is fine when its tool_call_id is pending.""" + monitor = APIComplianceMonitor() action = make_action_event(tool_call_id="call_valid") - state.all_tool_call_ids.add("call_valid") - state.pending_tool_call_ids["call_valid"] = action.id + monitor.process_event(action) obs = make_observation_event(action) - violation = prop.check(obs, state) + violations = monitor.process_event(obs) - assert violation is None + assert len(violations) == 0 -def test_unmatched_result_violation_unknown_tool_call_id(): - """Tool result with unknown tool_call_id violates property (a02/a06).""" - prop = UnmatchedToolResultProperty() - state = ComplianceState() +def test_violation_unknown_tool_call_id(): + """Tool result with unknown tool_call_id violates property (a02/a06/a08).""" + monitor = APIComplianceMonitor() # No actions have been seen orphan_obs = make_orphan_observation_event(tool_call_id="call_unknown") - violation = prop.check(orphan_obs, state) + violations = monitor.process_event(orphan_obs) - assert violation is not None - assert violation.property_name == "unmatched_tool_result" - assert "call_unknown" in violation.description + assert len(violations) == 1 + assert violations[0].property_name == "unmatched_tool_result" + assert "call_unknown" in violations[0].description -def test_unmatched_result_violation_wrong_tool_call_id(): +def test_violation_wrong_tool_call_id(): """Tool result referencing wrong tool_call_id violates property (a06).""" - prop = UnmatchedToolResultProperty() - state = ComplianceState() + monitor = APIComplianceMonitor() # We have action with call_correct, but result references call_wrong - state.all_tool_call_ids.add("call_correct") - state.pending_tool_call_ids["call_correct"] = "action_correct" + action = make_action_event(tool_call_id="call_correct") + monitor.process_event(action) orphan_obs = make_orphan_observation_event(tool_call_id="call_wrong") - violation = prop.check(orphan_obs, state) - - assert violation is not None - assert "call_wrong" in violation.description - + violations = monitor.process_event(orphan_obs) -def test_unmatched_result_no_violation_for_non_observation(): - """Non-observation events don't trigger this property.""" - prop = UnmatchedToolResultProperty() - state = ComplianceState() - - user_msg = make_user_message_event() - violation = prop.check(user_msg, state) - - assert violation is None + assert len(violations) == 1 + assert violations[0].property_name == "unmatched_tool_result" + assert "call_wrong" in violations[0].description # ============================================================================= -# DuplicateToolResultProperty Tests -# Detects: a05 +# Duplicate Tool Result Violations (a05) # ============================================================================= -def test_duplicate_result_no_violation_first_result(): +def test_no_violation_first_result(): """First tool result for a tool_call_id is fine.""" - prop = DuplicateToolResultProperty() - state = ComplianceState() + monitor = APIComplianceMonitor() action = make_action_event(tool_call_id="call_first") - state.all_tool_call_ids.add("call_first") - state.pending_tool_call_ids["call_first"] = action.id + monitor.process_event(action) obs = make_observation_event(action) - violation = prop.check(obs, state) + violations = monitor.process_event(obs) - assert violation is None + assert len(violations) == 0 -def test_duplicate_result_violation_second_result(): +def test_violation_duplicate_result(): """Second tool result for same tool_call_id violates property (a05).""" - prop = DuplicateToolResultProperty() - state = ComplianceState() - - # First result already processed - state.completed_tool_call_ids.add("call_duplicate") - state.all_tool_call_ids.add("call_duplicate") + monitor = APIComplianceMonitor() - # Second result arrives - duplicate_obs = make_orphan_observation_event(tool_call_id="call_duplicate") - violation = prop.check(duplicate_obs, state) + action = make_action_event(tool_call_id="call_duplicate") + monitor.process_event(action) - assert violation is not None - assert violation.property_name == "duplicate_tool_result" - assert "call_duplicate" in violation.description + # First result - fine + obs1 = make_observation_event(action) + violations = monitor.process_event(obs1) + assert len(violations) == 0 + # Second result for same ID - violation + obs2 = make_orphan_observation_event(tool_call_id="call_duplicate") + violations = monitor.process_event(obs2) -def test_duplicate_result_no_violation_for_non_observation(): - """Non-observation events don't trigger this property.""" - prop = DuplicateToolResultProperty() - state = ComplianceState() - - action = make_action_event() - violation = prop.check(action, state) - - assert violation is None - - -# ============================================================================= -# ToolResultOrderProperty Tests -# Detects: a08 -# ============================================================================= - - -def test_result_order_no_violation_normal_order(): - """Tool result after action is fine.""" - prop = ToolResultOrderProperty() - state = ComplianceState() - - action = make_action_event(tool_call_id="call_ordered") - state.all_tool_call_ids.add("call_ordered") - state.pending_tool_call_ids["call_ordered"] = action.id - - obs = make_observation_event(action) - violation = prop.check(obs, state) - - assert violation is None - - -def test_result_order_violation_result_before_action(): - """Tool result before action violates property (a08).""" - prop = ToolResultOrderProperty() - state = ComplianceState() - - # Result arrives, but no action has been seen with this ID - # (different from unmatched - here the action may come later) - orphan_obs = make_orphan_observation_event(tool_call_id="call_early") - violation = prop.check(orphan_obs, state) - - assert violation is not None - assert violation.property_name == "tool_result_order" - assert "call_early" in violation.description - - -def test_result_order_no_violation_for_non_observation(): - """Non-observation events don't trigger this property.""" - prop = ToolResultOrderProperty() - state = ComplianceState() - - user_msg = make_user_message_event() - violation = prop.check(user_msg, state) - - assert violation is None + assert len(violations) == 1 + assert violations[0].property_name == "duplicate_tool_result" + assert "call_duplicate" in violations[0].description # ============================================================================= @@ -294,28 +213,24 @@ def test_result_order_no_violation_for_non_observation(): def test_state_update_action_adds_pending(): """Adding an action should update pending_tool_call_ids.""" - prop = InterleavedMessageProperty() - state = ComplianceState() + monitor = APIComplianceMonitor() action = make_action_event(tool_call_id="call_new") - prop.update_state(action, state) + monitor.process_event(action) - assert "call_new" in state.pending_tool_call_ids - assert "call_new" in state.all_tool_call_ids + assert "call_new" in monitor.state.pending_tool_call_ids def test_state_update_observation_resolves_pending(): """Adding an observation should move from pending to completed.""" - prop = InterleavedMessageProperty() - state = ComplianceState() + monitor = APIComplianceMonitor() - # Setup: action is pending action = make_action_event(tool_call_id="call_resolve") - state.pending_tool_call_ids["call_resolve"] = action.id - state.all_tool_call_ids.add("call_resolve") + monitor.process_event(action) + assert "call_resolve" in monitor.state.pending_tool_call_ids obs = make_observation_event(action) - prop.update_state(obs, state) + monitor.process_event(obs) - assert "call_resolve" not in state.pending_tool_call_ids - assert "call_resolve" in state.completed_tool_call_ids + assert "call_resolve" not in monitor.state.pending_tool_call_ids + assert "call_resolve" in monitor.state.completed_tool_call_ids diff --git a/tests/sdk/conversation/compliance/test_state_integration.py b/tests/sdk/conversation/compliance/test_state_integration.py index 9af6200218..2c7c7c41f1 100644 --- a/tests/sdk/conversation/compliance/test_state_integration.py +++ b/tests/sdk/conversation/compliance/test_state_integration.py @@ -130,7 +130,6 @@ def test_add_event_tracks_state_correctly(conversation_state): monitor = conversation_state.compliance_monitor assert "call_track" in monitor.state.pending_tool_call_ids - assert "call_track" in monitor.state.all_tool_call_ids obs = make_observation_event(action) conversation_state.add_event(obs) From 770723aca691649aa7ea2e7aa5e9f8917b1c25e1 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 27 Feb 2026 22:04:12 +0000 Subject: [PATCH 06/13] Extract compliance patterns to shared module with unit tests - Create tests/integration/api_compliance/patterns.py with all 8 compliance patterns (a01-a08) as CompliancePattern dataclass instances - Add tests/sdk/conversation/compliance/test_pattern_detection.py with 17 tests that verify the APIComplianceMonitor catches all patterns - Refactor a01-a08 integration test files to use the shared patterns module This enables: 1. Fast unit tests that verify monitor catches violations (no LLM calls) 2. Separate integration tests that verify LLM API responses 3. Single source of truth for pattern definitions Co-authored-by: openhands --- tests/integration/api_compliance/patterns.py | 462 ++++++++++++++++++ .../tests/a01_unmatched_tool_use.py | 49 +- .../tests/a02_unmatched_tool_result.py | 44 +- .../tests/a03_interleaved_user_msg.py | 56 +-- .../tests/a04_interleaved_asst_msg.py | 55 +-- .../tests/a05_duplicate_tool_call_id.py | 75 +-- .../tests/a06_wrong_tool_call_id.py | 76 +-- .../tests/a07_parallel_missing_result.py | 81 +-- .../tests/a08_parallel_wrong_order.py | 62 +-- .../compliance/test_pattern_detection.py | 326 ++++++++++++ 10 files changed, 831 insertions(+), 455 deletions(-) create mode 100644 tests/integration/api_compliance/patterns.py create mode 100644 tests/sdk/conversation/compliance/test_pattern_detection.py diff --git a/tests/integration/api_compliance/patterns.py b/tests/integration/api_compliance/patterns.py new file mode 100644 index 0000000000..d60688b095 --- /dev/null +++ b/tests/integration/api_compliance/patterns.py @@ -0,0 +1,462 @@ +"""API Compliance Pattern Definitions. + +This module defines the 8 malformed message patterns (a01-a08) that test +API compliance. Each pattern is a list of Message objects representing +a malformed conversation sequence. + +These patterns are used by: +1. Integration tests that verify how LLM APIs respond to malformed input +2. Unit tests that verify the APIComplianceMonitor catches these violations +""" + +from dataclasses import dataclass + +from openhands.sdk.llm import Message, MessageToolCall, TextContent + + +@dataclass +class CompliancePattern: + """A compliance test pattern with metadata.""" + + name: str + description: str + messages: list[Message] + expected_violation: str # The violation property_name we expect + + +# ============================================================================= +# Pattern a01: Unmatched tool_use +# ============================================================================= +A01_UNMATCHED_TOOL_USE = CompliancePattern( + name="unmatched_tool_use", + description=( + "Conversation where an assistant message contains a tool_use " + "(tool_calls), but no tool_result follows before the next user message." + ), + messages=[ + Message( + role="system", + content=[TextContent(text="You are a helpful assistant.")], + ), + Message( + role="user", + content=[TextContent(text="List the files in the current directory.")], + ), + # Assistant message with tool_use + Message( + role="assistant", + content=[TextContent(text="I'll list the files for you.")], + tool_calls=[ + MessageToolCall( + id="call_abc123", + name="terminal", + arguments='{"command": "ls -la"}', + origin="completion", + ) + ], + ), + # NOTE: No tool_result follows! Directly another user message. + Message( + role="user", + content=[TextContent(text="What was the result?")], + ), + ], + expected_violation="interleaved_message", +) + + +# ============================================================================= +# Pattern a02: Unmatched tool_result +# ============================================================================= +A02_UNMATCHED_TOOL_RESULT = CompliancePattern( + name="unmatched_tool_result", + description=( + "Conversation where a tool_result message references a tool_call_id " + "that doesn't exist in any prior assistant message's tool_calls." + ), + messages=[ + Message( + role="system", + content=[TextContent(text="You are a helpful assistant.")], + ), + Message( + role="user", + content=[TextContent(text="List the files in the current directory.")], + ), + # Assistant message WITHOUT tool_use + Message( + role="assistant", + content=[TextContent(text="I can help you list files. What directory?")], + ), + # Tool result that references a non-existent tool_call_id + Message( + role="tool", + content=[TextContent(text="file1.txt\nfile2.txt\nfile3.txt")], + tool_call_id="call_nonexistent_xyz", + name="terminal", + ), + ], + expected_violation="unmatched_tool_result", +) + + +# ============================================================================= +# Pattern a03: Interleaved user message +# ============================================================================= +A03_INTERLEAVED_USER_MSG = CompliancePattern( + name="interleaved_user_message", + description=( + "Conversation where a user message appears between a tool_use " + "(in assistant message) and its corresponding tool_result." + ), + messages=[ + Message( + role="system", + content=[TextContent(text="You are a helpful assistant.")], + ), + Message( + role="user", + content=[TextContent(text="List the files in the current directory.")], + ), + # Assistant message with tool_use + Message( + role="assistant", + content=[TextContent(text="I'll list the files for you.")], + tool_calls=[ + MessageToolCall( + id="call_abc123", + name="terminal", + arguments='{"command": "ls -la"}', + origin="completion", + ) + ], + ), + # INTERLEAVED: User message before tool_result + Message( + role="user", + content=[TextContent(text="Actually, can you also show hidden files?")], + ), + # Tool result comes AFTER the interleaved user message + Message( + role="tool", + content=[TextContent(text="file1.txt\nfile2.txt")], + tool_call_id="call_abc123", + name="terminal", + ), + ], + expected_violation="interleaved_message", +) + + +# ============================================================================= +# Pattern a04: Interleaved assistant message +# ============================================================================= +A04_INTERLEAVED_ASST_MSG = CompliancePattern( + name="interleaved_assistant_message", + description=( + "Conversation where an assistant message (without tool_calls) appears " + "between a tool_use and its corresponding tool_result." + ), + messages=[ + Message( + role="system", + content=[TextContent(text="You are a helpful assistant.")], + ), + Message( + role="user", + content=[TextContent(text="List the files in the current directory.")], + ), + # First assistant message with tool_use + Message( + role="assistant", + content=[TextContent(text="I'll list the files for you.")], + tool_calls=[ + MessageToolCall( + id="call_abc123", + name="terminal", + arguments='{"command": "ls -la"}', + origin="completion", + ) + ], + ), + # INTERLEAVED: Another assistant message without tool_calls + Message( + role="assistant", + content=[TextContent(text="The command is running...")], + ), + # Tool result comes AFTER the interleaved assistant message + Message( + role="tool", + content=[TextContent(text="file1.txt\nfile2.txt")], + tool_call_id="call_abc123", + name="terminal", + ), + ], + expected_violation="interleaved_message", +) + + +# ============================================================================= +# Pattern a05: Duplicate tool_call_id +# ============================================================================= +A05_DUPLICATE_TOOL_CALL_ID = CompliancePattern( + name="duplicate_tool_call_id", + description=( + "Conversation where two tool_result messages have the same tool_call_id, " + "meaning multiple results are provided for a single tool_use." + ), + messages=[ + Message( + role="system", + content=[TextContent(text="You are a helpful assistant.")], + ), + Message( + role="user", + content=[TextContent(text="List the files in the current directory.")], + ), + # Assistant message with tool_use + Message( + role="assistant", + content=[TextContent(text="I'll list the files for you.")], + tool_calls=[ + MessageToolCall( + id="call_abc123", + name="terminal", + arguments='{"command": "ls -la"}', + origin="completion", + ) + ], + ), + # First tool result (correct) + Message( + role="tool", + content=[TextContent(text="file1.txt\nfile2.txt")], + tool_call_id="call_abc123", + name="terminal", + ), + # Some intervening messages + Message( + role="user", + content=[TextContent(text="Thanks! Now what?")], + ), + Message( + role="assistant", + content=[ + TextContent(text="You're welcome! Let me know if you need anything.") + ], + ), + Message( + role="user", + content=[TextContent(text="Actually, show me the files again.")], + ), + # DUPLICATE: Second tool result with SAME tool_call_id + Message( + role="tool", + content=[TextContent(text="file1.txt\nfile2.txt\nfile3.txt")], + tool_call_id="call_abc123", # Same ID as before! + name="terminal", + ), + ], + expected_violation="duplicate_tool_result", +) + + +# ============================================================================= +# Pattern a06: Wrong tool_call_id +# ============================================================================= +A06_WRONG_TOOL_CALL_ID = CompliancePattern( + name="wrong_tool_call_id", + description=( + "Conversation where a tool_result references the wrong tool_call_id " + "(one that has already been completed)." + ), + messages=[ + Message( + role="system", + content=[TextContent(text="You are a helpful assistant.")], + ), + Message( + role="user", + content=[TextContent(text="Run two commands: ls and pwd")], + ), + # First assistant message with tool_use (id=A) + Message( + role="assistant", + content=[TextContent(text="I'll run ls first.")], + tool_calls=[ + MessageToolCall( + id="call_A_ls", + name="terminal", + arguments='{"command": "ls"}', + origin="completion", + ) + ], + ), + # First tool result - CORRECT + Message( + role="tool", + content=[TextContent(text="file1.txt\nfile2.txt")], + tool_call_id="call_A_ls", + name="terminal", + ), + # Second assistant message with tool_use (id=B) + Message( + role="assistant", + content=[TextContent(text="Now I'll run pwd.")], + tool_calls=[ + MessageToolCall( + id="call_B_pwd", + name="terminal", + arguments='{"command": "pwd"}', + origin="completion", + ) + ], + ), + # Second tool result - WRONG ID (references first tool_use which is done) + Message( + role="tool", + content=[TextContent(text="/home/user/project")], + tool_call_id="call_A_ls", # Wrong! Should be call_B_pwd + name="terminal", + ), + ], + expected_violation="duplicate_tool_result", # It's a duplicate of completed ID +) + + +# ============================================================================= +# Pattern a07: Parallel missing result +# ============================================================================= +A07_PARALLEL_MISSING_RESULT = CompliancePattern( + name="parallel_missing_result", + description=( + "Conversation where an assistant message contains multiple parallel " + "tool_calls, but only some of them have corresponding tool_results." + ), + messages=[ + Message( + role="system", + content=[TextContent(text="You are a helpful assistant.")], + ), + Message( + role="user", + content=[ + TextContent(text="Get the weather in San Francisco, Tokyo, and Paris.") + ], + ), + # Assistant message with THREE parallel tool_calls + Message( + role="assistant", + content=[TextContent(text="I'll check the weather in all three cities.")], + tool_calls=[ + MessageToolCall( + id="call_sf", + name="terminal", + arguments='{"command": "weather sf"}', + origin="completion", + ), + MessageToolCall( + id="call_tokyo", + name="terminal", + arguments='{"command": "weather tokyo"}', + origin="completion", + ), + MessageToolCall( + id="call_paris", + name="terminal", + arguments='{"command": "weather paris"}', + origin="completion", + ), + ], + ), + # Tool result for SF - provided + Message( + role="tool", + content=[TextContent(text="San Francisco: 65°F, Sunny")], + tool_call_id="call_sf", + name="terminal", + ), + # Tool result for Tokyo - provided + Message( + role="tool", + content=[TextContent(text="Tokyo: 72°F, Cloudy")], + tool_call_id="call_tokyo", + name="terminal", + ), + # NOTE: Tool result for Paris is MISSING! + # Next user message arrives before Paris result + Message( + role="user", + content=[TextContent(text="What about Paris?")], + ), + ], + expected_violation="interleaved_message", +) + + +# ============================================================================= +# Pattern a08: Parallel wrong order +# ============================================================================= +A08_PARALLEL_WRONG_ORDER = CompliancePattern( + name="parallel_wrong_order", + description=( + "Conversation where tool_results appear before the assistant message " + "that contains the corresponding tool_calls." + ), + messages=[ + Message( + role="system", + content=[TextContent(text="You are a helpful assistant.")], + ), + Message( + role="user", + content=[TextContent(text="Check the weather in SF and Tokyo.")], + ), + # Tool results appear FIRST (wrong!) + Message( + role="tool", + content=[TextContent(text="San Francisco: 65°F, Sunny")], + tool_call_id="call_sf", + name="terminal", + ), + Message( + role="tool", + content=[TextContent(text="Tokyo: 72°F, Cloudy")], + tool_call_id="call_tokyo", + name="terminal", + ), + # Assistant message with tool_calls comes AFTER tool_results + Message( + role="assistant", + content=[TextContent(text="I'll check both cities.")], + tool_calls=[ + MessageToolCall( + id="call_sf", + name="terminal", + arguments='{"command": "weather sf"}', + origin="completion", + ), + MessageToolCall( + id="call_tokyo", + name="terminal", + arguments='{"command": "weather tokyo"}', + origin="completion", + ), + ], + ), + ], + expected_violation="unmatched_tool_result", +) + + +# All patterns for iteration +ALL_COMPLIANCE_PATTERNS = [ + A01_UNMATCHED_TOOL_USE, + A02_UNMATCHED_TOOL_RESULT, + A03_INTERLEAVED_USER_MSG, + A04_INTERLEAVED_ASST_MSG, + A05_DUPLICATE_TOOL_CALL_ID, + A06_WRONG_TOOL_CALL_ID, + A07_PARALLEL_MISSING_RESULT, + A08_PARALLEL_WRONG_ORDER, +] diff --git a/tests/integration/tests/a01_unmatched_tool_use.py b/tests/integration/tests/a01_unmatched_tool_use.py index 490b6c20fa..5c7b521cf9 100644 --- a/tests/integration/tests/a01_unmatched_tool_use.py +++ b/tests/integration/tests/a01_unmatched_tool_use.py @@ -4,26 +4,14 @@ Tests how different LLM APIs respond when a tool_use message is sent without a corresponding tool_result. - Pattern: [system] → [user] → [assistant with tool_use] → [user message] → API CALL ↑ No tool_result! """ -from openhands.sdk.llm import Message, MessageToolCall, TextContent +from openhands.sdk.llm import Message from tests.integration.api_compliance.base import BaseAPIComplianceTest - - -PATTERN_NAME = "unmatched_tool_use" -DESCRIPTION = """ -Sends a conversation where an assistant message contains a tool_use (tool_calls), -but no tool_result (tool message) follows before the next user message. - -This pattern can occur when: -- ObservationEvent is delayed or lost -- User message arrives before observation is recorded -- Event sync issues during conversation resume -""" +from tests.integration.api_compliance.patterns import A01_UNMATCHED_TOOL_USE class UnmatchedToolUseTest(BaseAPIComplianceTest): @@ -31,39 +19,12 @@ class UnmatchedToolUseTest(BaseAPIComplianceTest): @property def pattern_name(self) -> str: - return PATTERN_NAME + return A01_UNMATCHED_TOOL_USE.name @property def pattern_description(self) -> str: - return DESCRIPTION + return A01_UNMATCHED_TOOL_USE.description def build_malformed_messages(self) -> list[Message]: """Build message sequence with unmatched tool_use.""" - return [ - Message( - role="system", - content=[TextContent(text="You are a helpful assistant.")], - ), - Message( - role="user", - content=[TextContent(text="List the files in the current directory.")], - ), - # Assistant message with tool_use - Message( - role="assistant", - content=[TextContent(text="I'll list the files for you.")], - tool_calls=[ - MessageToolCall( - id="call_abc123", - name="terminal", - arguments='{"command": "ls -la"}', - origin="completion", - ) - ], - ), - # NOTE: No tool_result follows! Directly another user message. - Message( - role="user", - content=[TextContent(text="What was the result?")], - ), - ] + return A01_UNMATCHED_TOOL_USE.messages diff --git a/tests/integration/tests/a02_unmatched_tool_result.py b/tests/integration/tests/a02_unmatched_tool_result.py index 4a54c9587e..feb656198d 100644 --- a/tests/integration/tests/a02_unmatched_tool_result.py +++ b/tests/integration/tests/a02_unmatched_tool_result.py @@ -9,20 +9,9 @@ ↑ References non-existent ID! """ -from openhands.sdk.llm import Message, TextContent +from openhands.sdk.llm import Message from tests.integration.api_compliance.base import BaseAPIComplianceTest - - -PATTERN_NAME = "unmatched_tool_result" -DESCRIPTION = """ -Sends a conversation where a tool_result message references a tool_call_id -that doesn't exist in any prior assistant message's tool_calls. - -This pattern can occur when: -- tool_call_id is corrupted during serialization -- Tool results are sent for the wrong conversation -- Event ordering issues cause mismatched IDs -""" +from tests.integration.api_compliance.patterns import A02_UNMATCHED_TOOL_RESULT class UnmatchedToolResultTest(BaseAPIComplianceTest): @@ -30,35 +19,12 @@ class UnmatchedToolResultTest(BaseAPIComplianceTest): @property def pattern_name(self) -> str: - return PATTERN_NAME + return A02_UNMATCHED_TOOL_RESULT.name @property def pattern_description(self) -> str: - return DESCRIPTION + return A02_UNMATCHED_TOOL_RESULT.description def build_malformed_messages(self) -> list[Message]: """Build message sequence with unmatched tool_result.""" - return [ - Message( - role="system", - content=[TextContent(text="You are a helpful assistant.")], - ), - Message( - role="user", - content=[TextContent(text="List the files in the current directory.")], - ), - # Assistant message WITHOUT tool_use - Message( - role="assistant", - content=[ - TextContent(text="I can help you list files. What directory?") - ], - ), - # Tool result that references a non-existent tool_call_id - Message( - role="tool", - content=[TextContent(text="file1.txt\nfile2.txt\nfile3.txt")], - tool_call_id="call_nonexistent_xyz", - name="terminal", - ), - ] + return A02_UNMATCHED_TOOL_RESULT.messages diff --git a/tests/integration/tests/a03_interleaved_user_msg.py b/tests/integration/tests/a03_interleaved_user_msg.py index ed75a914cd..50ad9e87df 100644 --- a/tests/integration/tests/a03_interleaved_user_msg.py +++ b/tests/integration/tests/a03_interleaved_user_msg.py @@ -4,26 +4,14 @@ Tests how different LLM APIs respond when a user message appears between tool_use and tool_result. - Pattern: [assistant with tool_use] → [user message] → [tool_result] ↑ Inserted between tool_use and tool_result! """ -from openhands.sdk.llm import Message, MessageToolCall, TextContent +from openhands.sdk.llm import Message from tests.integration.api_compliance.base import BaseAPIComplianceTest - - -PATTERN_NAME = "interleaved_user_message" -DESCRIPTION = """ -Sends a conversation where a user message appears between a tool_use -(in assistant message) and its corresponding tool_result (tool message). - -This pattern can occur when: -- User sends message via send_message() during pending tool execution -- Events are appended to the event list in incorrect order -- Async message delivery causes race conditions -""" +from tests.integration.api_compliance.patterns import A03_INTERLEAVED_USER_MSG class InterleavedUserMessageTest(BaseAPIComplianceTest): @@ -31,46 +19,12 @@ class InterleavedUserMessageTest(BaseAPIComplianceTest): @property def pattern_name(self) -> str: - return PATTERN_NAME + return A03_INTERLEAVED_USER_MSG.name @property def pattern_description(self) -> str: - return DESCRIPTION + return A03_INTERLEAVED_USER_MSG.description def build_malformed_messages(self) -> list[Message]: """Build message sequence with interleaved user message.""" - return [ - Message( - role="system", - content=[TextContent(text="You are a helpful assistant.")], - ), - Message( - role="user", - content=[TextContent(text="List the files in the current directory.")], - ), - # Assistant message with tool_use - Message( - role="assistant", - content=[TextContent(text="I'll list the files for you.")], - tool_calls=[ - MessageToolCall( - id="call_abc123", - name="terminal", - arguments='{"command": "ls -la"}', - origin="completion", - ) - ], - ), - # INTERLEAVED: User message before tool_result - Message( - role="user", - content=[TextContent(text="Actually, can you also show hidden files?")], - ), - # Tool result comes AFTER the interleaved user message - Message( - role="tool", - content=[TextContent(text="file1.txt\nfile2.txt")], - tool_call_id="call_abc123", - name="terminal", - ), - ] + return A03_INTERLEAVED_USER_MSG.messages diff --git a/tests/integration/tests/a04_interleaved_asst_msg.py b/tests/integration/tests/a04_interleaved_asst_msg.py index b59ce40302..49eb248437 100644 --- a/tests/integration/tests/a04_interleaved_asst_msg.py +++ b/tests/integration/tests/a04_interleaved_asst_msg.py @@ -9,20 +9,9 @@ ↑ Another assistant turn before tool_result! """ -from openhands.sdk.llm import Message, MessageToolCall, TextContent +from openhands.sdk.llm import Message from tests.integration.api_compliance.base import BaseAPIComplianceTest - - -PATTERN_NAME = "interleaved_assistant_message" -DESCRIPTION = """ -Sends a conversation where an assistant message (without tool_calls) appears -between a tool_use and its corresponding tool_result. - -This pattern might occur in edge cases with: -- Malformed condensation that inserts summary messages incorrectly -- Manual event manipulation -- Corrupted conversation history -""" +from tests.integration.api_compliance.patterns import A04_INTERLEAVED_ASST_MSG class InterleavedAssistantMessageTest(BaseAPIComplianceTest): @@ -30,46 +19,12 @@ class InterleavedAssistantMessageTest(BaseAPIComplianceTest): @property def pattern_name(self) -> str: - return PATTERN_NAME + return A04_INTERLEAVED_ASST_MSG.name @property def pattern_description(self) -> str: - return DESCRIPTION + return A04_INTERLEAVED_ASST_MSG.description def build_malformed_messages(self) -> list[Message]: """Build message sequence with interleaved assistant message.""" - return [ - Message( - role="system", - content=[TextContent(text="You are a helpful assistant.")], - ), - Message( - role="user", - content=[TextContent(text="List the files in the current directory.")], - ), - # First assistant message with tool_use - Message( - role="assistant", - content=[TextContent(text="I'll list the files for you.")], - tool_calls=[ - MessageToolCall( - id="call_abc123", - name="terminal", - arguments='{"command": "ls -la"}', - origin="completion", - ) - ], - ), - # INTERLEAVED: Another assistant message without tool_calls - Message( - role="assistant", - content=[TextContent(text="The command is running...")], - ), - # Tool result comes AFTER the interleaved assistant message - Message( - role="tool", - content=[TextContent(text="file1.txt\nfile2.txt")], - tool_call_id="call_abc123", - name="terminal", - ), - ] + return A04_INTERLEAVED_ASST_MSG.messages diff --git a/tests/integration/tests/a05_duplicate_tool_call_id.py b/tests/integration/tests/a05_duplicate_tool_call_id.py index 7e8db00410..49ab29589a 100644 --- a/tests/integration/tests/a05_duplicate_tool_call_id.py +++ b/tests/integration/tests/a05_duplicate_tool_call_id.py @@ -4,26 +4,14 @@ Tests how different LLM APIs respond when multiple tool_result messages have the same tool_call_id. - Pattern: [assistant with tool_use id=X] → [tool_result id=X] → ... → [tool_result id=X] ↑ Duplicate! """ -from openhands.sdk.llm import Message, MessageToolCall, TextContent +from openhands.sdk.llm import Message from tests.integration.api_compliance.base import BaseAPIComplianceTest - - -PATTERN_NAME = "duplicate_tool_call_id" -DESCRIPTION = """ -Sends a conversation where two tool_result messages have the same tool_call_id, -meaning multiple results are provided for a single tool_use. - -This pattern can occur when: -- Conversation is resumed and duplicate ObservationEvent is created -- Event sync issues during conversation restore -- get_unmatched_actions() incorrectly identifies action as unmatched -""" +from tests.integration.api_compliance.patterns import A05_DUPLICATE_TOOL_CALL_ID class DuplicateToolCallIdTest(BaseAPIComplianceTest): @@ -31,65 +19,12 @@ class DuplicateToolCallIdTest(BaseAPIComplianceTest): @property def pattern_name(self) -> str: - return PATTERN_NAME + return A05_DUPLICATE_TOOL_CALL_ID.name @property def pattern_description(self) -> str: - return DESCRIPTION + return A05_DUPLICATE_TOOL_CALL_ID.description def build_malformed_messages(self) -> list[Message]: """Build message sequence with duplicate tool_call_id.""" - return [ - Message( - role="system", - content=[TextContent(text="You are a helpful assistant.")], - ), - Message( - role="user", - content=[TextContent(text="List the files in the current directory.")], - ), - # Assistant message with tool_use - Message( - role="assistant", - content=[TextContent(text="I'll list the files for you.")], - tool_calls=[ - MessageToolCall( - id="call_abc123", - name="terminal", - arguments='{"command": "ls -la"}', - origin="completion", - ) - ], - ), - # First tool result (correct) - Message( - role="tool", - content=[TextContent(text="file1.txt\nfile2.txt")], - tool_call_id="call_abc123", - name="terminal", - ), - # Some intervening messages (simulating conversation continuation) - Message( - role="user", - content=[TextContent(text="Thanks! Now what?")], - ), - Message( - role="assistant", - content=[ - TextContent( - text="You're welcome! Let me know if you need anything else." - ) - ], - ), - Message( - role="user", - content=[TextContent(text="Actually, show me the files again.")], - ), - # DUPLICATE: Second tool result with SAME tool_call_id - Message( - role="tool", - content=[TextContent(text="file1.txt\nfile2.txt\nfile3.txt")], - tool_call_id="call_abc123", # Same ID as before! - name="terminal", - ), - ] + return A05_DUPLICATE_TOOL_CALL_ID.messages diff --git a/tests/integration/tests/a06_wrong_tool_call_id.py b/tests/integration/tests/a06_wrong_tool_call_id.py index 68263a2c9b..db6cdd3849 100644 --- a/tests/integration/tests/a06_wrong_tool_call_id.py +++ b/tests/integration/tests/a06_wrong_tool_call_id.py @@ -2,27 +2,16 @@ API Compliance Test: Wrong tool_call_id Tests how different LLM APIs respond when a tool_result references the wrong -tool_call_id (swapped with another tool_use's ID). +tool_call_id (one that has already been completed). Pattern: - [assistant with tool_use id=A] → [assistant with tool_use id=B] → - [tool_result id=B] → [tool_result id=A] ← IDs swapped! + [assistant with tool_use id=A] → [tool_result id=A] → + [assistant with tool_use id=B] → [tool_result id=A] ← References completed ID! """ -from openhands.sdk.llm import Message, MessageToolCall, TextContent +from openhands.sdk.llm import Message from tests.integration.api_compliance.base import BaseAPIComplianceTest - - -PATTERN_NAME = "wrong_tool_call_id" -DESCRIPTION = """ -Sends a conversation where tool_results are provided but with swapped IDs, -so each tool_result references the wrong tool_use. - -This pattern might occur with: -- ID corruption during serialization -- Race conditions in parallel tool execution -- Manual event manipulation errors -""" +from tests.integration.api_compliance.patterns import A06_WRONG_TOOL_CALL_ID class WrongToolCallIdTest(BaseAPIComplianceTest): @@ -30,61 +19,12 @@ class WrongToolCallIdTest(BaseAPIComplianceTest): @property def pattern_name(self) -> str: - return PATTERN_NAME + return A06_WRONG_TOOL_CALL_ID.name @property def pattern_description(self) -> str: - return DESCRIPTION + return A06_WRONG_TOOL_CALL_ID.description def build_malformed_messages(self) -> list[Message]: """Build message sequence with swapped tool_call_ids.""" - return [ - Message( - role="system", - content=[TextContent(text="You are a helpful assistant.")], - ), - Message( - role="user", - content=[TextContent(text="Run two commands: ls and pwd")], - ), - # First assistant message with tool_use (id=A) - Message( - role="assistant", - content=[TextContent(text="I'll run ls first.")], - tool_calls=[ - MessageToolCall( - id="call_A_ls", - name="terminal", - arguments='{"command": "ls"}', - origin="completion", - ) - ], - ), - # First tool result - CORRECT - Message( - role="tool", - content=[TextContent(text="file1.txt\nfile2.txt")], - tool_call_id="call_A_ls", - name="terminal", - ), - # Second assistant message with tool_use (id=B) - Message( - role="assistant", - content=[TextContent(text="Now I'll run pwd.")], - tool_calls=[ - MessageToolCall( - id="call_B_pwd", - name="terminal", - arguments='{"command": "pwd"}', - origin="completion", - ) - ], - ), - # Second tool result - WRONG ID (references first tool_use) - Message( - role="tool", - content=[TextContent(text="/home/user/project")], - tool_call_id="call_A_ls", # Wrong! Should be call_B_pwd - name="terminal", - ), - ] + return A06_WRONG_TOOL_CALL_ID.messages diff --git a/tests/integration/tests/a07_parallel_missing_result.py b/tests/integration/tests/a07_parallel_missing_result.py index 1e5e4ef72a..23b40fb356 100644 --- a/tests/integration/tests/a07_parallel_missing_result.py +++ b/tests/integration/tests/a07_parallel_missing_result.py @@ -9,20 +9,9 @@ ↑ Missing result for C! """ -from openhands.sdk.llm import Message, MessageToolCall, TextContent +from openhands.sdk.llm import Message from tests.integration.api_compliance.base import BaseAPIComplianceTest - - -PATTERN_NAME = "parallel_missing_result" -DESCRIPTION = """ -Sends a conversation where an assistant message contains multiple parallel -tool_calls, but only some of them have corresponding tool_results. - -This pattern can occur when: -- Partial tool execution failure -- Event loss for some observations -- Timeout causes some results to be missing -""" +from tests.integration.api_compliance.patterns import A07_PARALLEL_MISSING_RESULT class ParallelMissingResultTest(BaseAPIComplianceTest): @@ -30,72 +19,12 @@ class ParallelMissingResultTest(BaseAPIComplianceTest): @property def pattern_name(self) -> str: - return PATTERN_NAME + return A07_PARALLEL_MISSING_RESULT.name @property def pattern_description(self) -> str: - return DESCRIPTION + return A07_PARALLEL_MISSING_RESULT.description def build_malformed_messages(self) -> list[Message]: """Build message sequence with parallel tool calls missing a result.""" - return [ - Message( - role="system", - content=[TextContent(text="You are a helpful assistant.")], - ), - Message( - role="user", - content=[ - TextContent( - text="Get the weather in San Francisco, Tokyo, and Paris." - ) - ], - ), - # Assistant message with THREE parallel tool_calls - Message( - role="assistant", - content=[ - TextContent(text="I'll check the weather in all three cities.") - ], - tool_calls=[ - MessageToolCall( - id="call_sf", - name="terminal", - arguments='{"command": "weather sf"}', - origin="completion", - ), - MessageToolCall( - id="call_tokyo", - name="terminal", - arguments='{"command": "weather tokyo"}', - origin="completion", - ), - MessageToolCall( - id="call_paris", - name="terminal", - arguments='{"command": "weather paris"}', - origin="completion", - ), - ], - ), - # Tool result for SF - provided - Message( - role="tool", - content=[TextContent(text="San Francisco: 65°F, Sunny")], - tool_call_id="call_sf", - name="terminal", - ), - # Tool result for Tokyo - provided - Message( - role="tool", - content=[TextContent(text="Tokyo: 72°F, Cloudy")], - tool_call_id="call_tokyo", - name="terminal", - ), - # NOTE: Tool result for Paris is MISSING! - # Next user message arrives before Paris result - Message( - role="user", - content=[TextContent(text="What about Paris?")], - ), - ] + return A07_PARALLEL_MISSING_RESULT.messages diff --git a/tests/integration/tests/a08_parallel_wrong_order.py b/tests/integration/tests/a08_parallel_wrong_order.py index f5257618e4..45ba982e0a 100644 --- a/tests/integration/tests/a08_parallel_wrong_order.py +++ b/tests/integration/tests/a08_parallel_wrong_order.py @@ -9,20 +9,9 @@ ↑ Results before the tool_calls! """ -from openhands.sdk.llm import Message, MessageToolCall, TextContent +from openhands.sdk.llm import Message from tests.integration.api_compliance.base import BaseAPIComplianceTest - - -PATTERN_NAME = "parallel_wrong_order" -DESCRIPTION = """ -Sends a conversation where tool_results appear before the assistant message -that contains the corresponding tool_calls. This is a severe ordering violation. - -This pattern might occur with: -- Severe event ordering bugs -- Manual conversation manipulation -- Corrupted event stream -""" +from tests.integration.api_compliance.patterns import A08_PARALLEL_WRONG_ORDER class ParallelWrongOrderTest(BaseAPIComplianceTest): @@ -30,53 +19,12 @@ class ParallelWrongOrderTest(BaseAPIComplianceTest): @property def pattern_name(self) -> str: - return PATTERN_NAME + return A08_PARALLEL_WRONG_ORDER.name @property def pattern_description(self) -> str: - return DESCRIPTION + return A08_PARALLEL_WRONG_ORDER.description def build_malformed_messages(self) -> list[Message]: """Build message sequence with tool results before tool calls.""" - return [ - Message( - role="system", - content=[TextContent(text="You are a helpful assistant.")], - ), - Message( - role="user", - content=[TextContent(text="Check the weather in SF and Tokyo.")], - ), - # Tool results appear FIRST (wrong!) - Message( - role="tool", - content=[TextContent(text="San Francisco: 65°F, Sunny")], - tool_call_id="call_sf", - name="terminal", - ), - Message( - role="tool", - content=[TextContent(text="Tokyo: 72°F, Cloudy")], - tool_call_id="call_tokyo", - name="terminal", - ), - # Assistant message with tool_calls comes AFTER tool_results - Message( - role="assistant", - content=[TextContent(text="I'll check both cities.")], - tool_calls=[ - MessageToolCall( - id="call_sf", - name="terminal", - arguments='{"command": "weather sf"}', - origin="completion", - ), - MessageToolCall( - id="call_tokyo", - name="terminal", - arguments='{"command": "weather tokyo"}', - origin="completion", - ), - ], - ), - ] + return A08_PARALLEL_WRONG_ORDER.messages diff --git a/tests/sdk/conversation/compliance/test_pattern_detection.py b/tests/sdk/conversation/compliance/test_pattern_detection.py new file mode 100644 index 0000000000..260dcb06c3 --- /dev/null +++ b/tests/sdk/conversation/compliance/test_pattern_detection.py @@ -0,0 +1,326 @@ +"""Tests that verify APIComplianceMonitor catches all 8 API compliance patterns. + +These tests convert the Message-based patterns from the API compliance tests +into Event sequences and verify the monitor detects the expected violations. + +This provides fast unit test coverage without requiring LLM API calls. +""" + +import uuid + +import pytest + +from openhands.sdk.conversation.compliance import APIComplianceMonitor +from openhands.sdk.event import ActionEvent, MessageEvent, ObservationEvent +from openhands.sdk.llm import Message, MessageToolCall, TextContent +from openhands.sdk.tool import Observation +from tests.integration.api_compliance.patterns import ( + A01_UNMATCHED_TOOL_USE, + A02_UNMATCHED_TOOL_RESULT, + A03_INTERLEAVED_USER_MSG, + A04_INTERLEAVED_ASST_MSG, + A05_DUPLICATE_TOOL_CALL_ID, + A06_WRONG_TOOL_CALL_ID, + A07_PARALLEL_MISSING_RESULT, + A08_PARALLEL_WRONG_ORDER, + ALL_COMPLIANCE_PATTERNS, + CompliancePattern, +) + + +class SimpleObservation(Observation): + """Simple observation for testing.""" + + result: str + + @property + def to_llm_content(self) -> list[TextContent]: + return [TextContent(text=self.result)] + + +def message_to_event( + msg: Message, + pending_tool_calls: dict[str, MessageToolCall], +) -> MessageEvent | ActionEvent | ObservationEvent | None: + """Convert a Message to the appropriate Event type. + + Args: + msg: The Message to convert. + pending_tool_calls: Dict tracking tool_call_id -> MessageToolCall for + pending actions. Updated in-place when we see tool_calls. + + Returns: + The corresponding Event, or None for system messages. + """ + if msg.role == "system": + # Skip system messages - they don't become events + return None + + if msg.role == "user": + return MessageEvent( + id=str(uuid.uuid4()), + source="user", + llm_message=msg, + ) + + if msg.role == "assistant": + # Check if this assistant message has tool_calls + if msg.tool_calls: + # Convert to ActionEvent(s) - we'll just return the first one + # and track all tool_calls + for tc in msg.tool_calls: + pending_tool_calls[tc.id] = tc + + # Return an ActionEvent for the first tool_call + first_tc = msg.tool_calls[0] + thought_text = "" + if msg.content and isinstance(msg.content[0], TextContent): + thought_text = msg.content[0].text + return ActionEvent( + id=str(uuid.uuid4()), + source="agent", + thought=[TextContent(text=thought_text)], + tool_name=first_tc.name, + tool_call_id=first_tc.id, + tool_call=first_tc, + llm_response_id=str(uuid.uuid4()), + ) + else: + # Regular assistant message + return MessageEvent( + id=str(uuid.uuid4()), + source="agent", + llm_message=msg, + ) + + if msg.role == "tool": + # Tool result -> ObservationEvent + tool_call_id = msg.tool_call_id + assert tool_call_id is not None, "Tool message must have tool_call_id" + + # Get result text + result_text = "" + if msg.content: + for content in msg.content: + if isinstance(content, TextContent): + result_text = content.text + break + + return ObservationEvent( + id=str(uuid.uuid4()), + source="environment", + tool_name=msg.name or "terminal", + tool_call_id=tool_call_id, + action_id=str(uuid.uuid4()), # We don't track this precisely + observation=SimpleObservation(result=result_text), + ) + + return None + + +def convert_pattern_to_events( + pattern: CompliancePattern, +) -> list[MessageEvent | ActionEvent | ObservationEvent]: + """Convert a compliance pattern's messages to events. + + For patterns with multiple parallel tool_calls, we generate one ActionEvent + per tool_call (not just one per assistant message). + + Returns: + List of events representing the pattern. + """ + events: list[MessageEvent | ActionEvent | ObservationEvent] = [] + pending_tool_calls: dict[str, MessageToolCall] = {} + + for msg in pattern.messages: + if msg.role == "assistant" and msg.tool_calls and len(msg.tool_calls) > 1: + # Handle parallel tool calls - create ActionEvent for each + thought_text = "" + if msg.content and isinstance(msg.content[0], TextContent): + thought_text = msg.content[0].text + for tc in msg.tool_calls: + pending_tool_calls[tc.id] = tc + event = ActionEvent( + id=str(uuid.uuid4()), + source="agent", + thought=[TextContent(text=thought_text)], + tool_name=tc.name, + tool_call_id=tc.id, + tool_call=tc, + llm_response_id=str(uuid.uuid4()), + ) + events.append(event) + else: + event = message_to_event(msg, pending_tool_calls) + if event is not None: + events.append(event) + + return events + + +def run_pattern_through_monitor( + pattern: CompliancePattern, +) -> tuple[list[str], int]: + """Run a pattern through the monitor and collect violations. + + Returns: + Tuple of (list of violation property_names, index of first violation) + """ + monitor = APIComplianceMonitor() + events = convert_pattern_to_events(pattern) + + all_violations: list[str] = [] + first_violation_idx = -1 + + for i, event in enumerate(events): + violations = monitor.process_event(event) + for v in violations: + if first_violation_idx == -1: + first_violation_idx = i + all_violations.append(v.property_name) + + return all_violations, first_violation_idx + + +# ============================================================================= +# Parametrized test for all patterns +# ============================================================================= + + +@pytest.mark.parametrize( + "pattern", + ALL_COMPLIANCE_PATTERNS, + ids=[p.name for p in ALL_COMPLIANCE_PATTERNS], +) +def test_monitor_detects_pattern(pattern: CompliancePattern): + """Verify the monitor detects the expected violation for each pattern.""" + violations, _ = run_pattern_through_monitor(pattern) + + assert len(violations) > 0, f"Pattern '{pattern.name}' should trigger a violation" + assert pattern.expected_violation in violations, ( + f"Pattern '{pattern.name}' should trigger '{pattern.expected_violation}', " + f"but got: {violations}" + ) + + +# ============================================================================= +# Individual pattern tests with detailed assertions +# ============================================================================= + + +def test_a01_unmatched_tool_use(): + """Pattern a01: User message while tool call is pending.""" + violations, idx = run_pattern_through_monitor(A01_UNMATCHED_TOOL_USE) + + assert "interleaved_message" in violations + # Violation should occur when user message arrives (4th event: after action) + assert idx == 2 # 0=user, 1=action, 2=user (violation) + + +def test_a02_unmatched_tool_result(): + """Pattern a02: Tool result with unknown tool_call_id.""" + violations, idx = run_pattern_through_monitor(A02_UNMATCHED_TOOL_RESULT) + + assert "unmatched_tool_result" in violations + # Violation should occur when orphan tool result arrives + assert idx == 2 # 0=user, 1=assistant, 2=tool (violation) + + +def test_a03_interleaved_user_msg(): + """Pattern a03: User message between tool_use and tool_result.""" + violations, idx = run_pattern_through_monitor(A03_INTERLEAVED_USER_MSG) + + assert "interleaved_message" in violations + # Violation on interleaved user message + assert idx == 2 # 0=user, 1=action, 2=user (violation), 3=tool + + +def test_a04_interleaved_asst_msg(): + """Pattern a04: Assistant message between tool_use and tool_result.""" + violations, idx = run_pattern_through_monitor(A04_INTERLEAVED_ASST_MSG) + + assert "interleaved_message" in violations + # Violation on interleaved assistant message + assert idx == 2 # 0=user, 1=action, 2=assistant (violation), 3=tool + + +def test_a05_duplicate_tool_call_id(): + """Pattern a05: Second tool_result with same tool_call_id.""" + violations, idx = run_pattern_through_monitor(A05_DUPLICATE_TOOL_CALL_ID) + + assert "duplicate_tool_result" in violations + + +def test_a06_wrong_tool_call_id(): + """Pattern a06: Tool result references wrong (already completed) ID.""" + violations, idx = run_pattern_through_monitor(A06_WRONG_TOOL_CALL_ID) + + # This results in a duplicate because call_A_ls was already completed + assert "duplicate_tool_result" in violations + + +def test_a07_parallel_missing_result(): + """Pattern a07: User message while parallel tool calls are pending.""" + violations, idx = run_pattern_through_monitor(A07_PARALLEL_MISSING_RESULT) + + assert "interleaved_message" in violations + + +def test_a08_parallel_wrong_order(): + """Pattern a08: Tool results arrive before tool_calls.""" + violations, idx = run_pattern_through_monitor(A08_PARALLEL_WRONG_ORDER) + + assert "unmatched_tool_result" in violations + # First violation should be on first tool result (unknown ID at that point) + assert idx == 1 # 0=user, 1=tool (violation) + + +# ============================================================================= +# Sanity check: valid sequences should have no violations +# ============================================================================= + + +def test_valid_sequence_no_violations(): + """A valid tool-call sequence should have no violations.""" + monitor = APIComplianceMonitor() + + # Create a valid sequence: user -> action -> observation -> user + user1 = MessageEvent( + id=str(uuid.uuid4()), + source="user", + llm_message=Message(role="user", content=[TextContent(text="List files")]), + ) + tool_call = MessageToolCall( + id="call_valid", + name="terminal", + arguments='{"command": "ls"}', + origin="completion", + ) + action = ActionEvent( + id=str(uuid.uuid4()), + source="agent", + thought=[TextContent(text="I'll list files")], + tool_name="terminal", + tool_call_id="call_valid", + tool_call=tool_call, + llm_response_id=str(uuid.uuid4()), + ) + observation = ObservationEvent( + id=str(uuid.uuid4()), + source="environment", + tool_name="terminal", + tool_call_id="call_valid", + action_id=action.id, + observation=SimpleObservation(result="file1.txt\nfile2.txt"), + ) + user2 = MessageEvent( + id=str(uuid.uuid4()), + source="user", + llm_message=Message(role="user", content=[TextContent(text="Thanks!")]), + ) + + all_violations = [] + for event in [user1, action, observation, user2]: + all_violations.extend(monitor.process_event(event)) + + assert len(all_violations) == 0, f"Valid sequence had violations: {all_violations}" From 1c3bef3459b2618ed12a7f695171f8819592ffdf Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 3 Mar 2026 16:10:03 +0000 Subject: [PATCH 07/13] fix: reject events with API compliance violations Previously the API compliance monitor operated in observation mode, logging violations but still adding events to the event log. Now events with compliance violations are rejected (not added to the event log). The warning is still logged, but the violating event is not added. Co-authored-by: openhands --- openhands-sdk/openhands/sdk/conversation/state.py | 14 ++++++-------- .../compliance/test_state_integration.py | 11 +++++++---- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index 89e612e165..da2249623b 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -538,12 +538,8 @@ def add_event(self, event: Event) -> None: as this bypasses compliance monitoring and may cause silent failures. For LLMConvertibleEvent instances, the event is checked against API - compliance properties and any violations are logged before adding to - the event log. - - Currently operates in observation mode: violations are logged but - events are still processed. Future versions may support per-property - reconciliation strategies. + compliance properties. Events with violations are logged and rejected + (not added to the event log). Args: event: The event to add to the conversation. @@ -551,11 +547,13 @@ def add_event(self, event: Event) -> None: # Check for compliance violations only for LLM-convertible events if isinstance(event, LLMConvertibleEvent): try: - self.compliance_monitor.process_event(event) + violations = self.compliance_monitor.process_event(event) + if violations: + # Reject events with violations + return except Exception as e: logger.exception( "Error checking compliance for event %s: %s", event.id, e ) - # Add to event log regardless of violations (observation mode) self._events.append(event) diff --git a/tests/sdk/conversation/compliance/test_state_integration.py b/tests/sdk/conversation/compliance/test_state_integration.py index 2c7c7c41f1..dee726359b 100644 --- a/tests/sdk/conversation/compliance/test_state_integration.py +++ b/tests/sdk/conversation/compliance/test_state_integration.py @@ -103,21 +103,24 @@ def test_add_event_normal_flow_no_violations(conversation_state, caplog): assert "API compliance violation detected" not in caplog.text -def test_add_event_still_adds_on_violation(conversation_state, caplog): - """Events should still be added even when violations occur (observation mode).""" +def test_add_event_rejects_on_violation(conversation_state, caplog): + """Events with violations should be rejected (not added to event log).""" import logging action = make_action_event(tool_call_id="call_1") conversation_state.add_event(action) + initial_count = len(conversation_state.events) + # User message while action pending - violation user_msg = make_user_message_event() with caplog.at_level(logging.WARNING): conversation_state.add_event(user_msg) - # Event should still be in the log - assert conversation_state.events[-1].id == user_msg.id + # Event should NOT be in the log + assert len(conversation_state.events) == initial_count + assert conversation_state.events[-1].id == action.id # Violation should be logged assert "API compliance violation detected" in caplog.text From 2eada09cbe0578d4a61e6a23ca72df77d1c98a55 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 3 Mar 2026 09:22:00 -0700 Subject: [PATCH 08/13] Update openhands-sdk/openhands/sdk/conversation/compliance/monitor.py Co-authored-by: OpenHands Bot --- .../openhands/sdk/conversation/compliance/monitor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py index 241b7b5bee..73e90c4158 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py @@ -26,9 +26,8 @@ class APIComplianceMonitor: State machine: - IDLE (no pending calls): Messages and new actions allowed - TOOL_CALLING (pending calls): Only matching observations allowed - - Currently operates in observation mode (violations are logged but events - are still processed). + Currently operates in rejection mode: violating events are logged and rejected + (not added to the conversation). Attributes: state: Compliance state tracking pending/completed tool calls. From aa415477accc17b7cca5ceedb17c07d006c65970 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 3 Mar 2026 09:24:53 -0700 Subject: [PATCH 09/13] Update openhands-sdk/openhands/sdk/conversation/compliance/__init__.py Co-authored-by: OpenHands Bot --- .../openhands/sdk/conversation/compliance/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py index 823a1ce5b8..36cf4235a6 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py @@ -2,7 +2,9 @@ This module provides an APIComplianceMonitor that detects violations of LLM API requirements in the event stream. Violations are currently logged for data -gathering; future versions may support reconciliation strategies. +This module provides an APIComplianceMonitor that detects and rejects violations of LLM API +requirements in the event stream. Violating events are logged and rejected (not added to +conversation). Future versions may support reconciliation strategies. The monitor enforces valid tool-call sequences: - When tool calls are pending, only matching observations are allowed From d816d44088305fea5ed65f2e4c0799b79b3efbac Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 3 Mar 2026 16:30:44 +0000 Subject: [PATCH 10/13] Address review comments: fix docs and events property type - Fix garbled/duplicate text in compliance/__init__.py docstring - Change events property to return Sequence[Event] instead of EventLog to discourage direct mutation and enforce use of add_event() - Add docstring to events property explaining the read-only intent Co-authored-by: openhands --- .../openhands/sdk/conversation/compliance/__init__.py | 9 ++++----- openhands-sdk/openhands/sdk/conversation/state.py | 7 ++++++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py index 36cf4235a6..d27fbcdc23 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/__init__.py @@ -1,10 +1,9 @@ """API Compliance monitoring for conversation events. -This module provides an APIComplianceMonitor that detects violations of LLM API -requirements in the event stream. Violations are currently logged for data -This module provides an APIComplianceMonitor that detects and rejects violations of LLM API -requirements in the event stream. Violating events are logged and rejected (not added to -conversation). Future versions may support reconciliation strategies. +This module provides an APIComplianceMonitor that detects and rejects violations +of LLM API requirements in the event stream. Violating events are logged and +rejected (not added to conversation). Future versions may support reconciliation +strategies. The monitor enforces valid tool-call sequences: - When tool calls are pending, only matching observations are allowed diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index da2249623b..e9b93c5528 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -213,7 +213,12 @@ def _handle_legacy_fields(cls, data: Any) -> Any: return data @property - def events(self) -> EventLog: + def events(self) -> Sequence[Event]: + """Read-only view of conversation events. + + Returns events as a Sequence to discourage direct mutation. + Use add_event() to add new events with compliance monitoring. + """ return self._events @property From a0a3470084fcefa280a86b7b64f4e9cb2c95d0c0 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 3 Mar 2026 09:40:41 -0700 Subject: [PATCH 11/13] Update openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py Co-authored-by: OpenHands Bot --- .../openhands/sdk/conversation/impl/local_conversation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py index 9d6a685483..95caf4b0fd 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py @@ -185,6 +185,7 @@ def _default_callback(e): # regions), so updating state here is thread-safe. # # Use add_event() to check API compliance before appending. + # Events with violations are logged and rejected (not added to event log). # Violations are logged but events are still processed. self._state.add_event(e) # Track user MessageEvent IDs here so hook callbacks (which may From 89ae8b314d7c621eefa3ab008f91ceb7a149eeb6 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 3 Mar 2026 16:46:32 +0000 Subject: [PATCH 12/13] fix: Improve compliance monitor exception handling and add_event return value - Monitor now uses fail-closed semantics: if checking crashes, state is not updated (event is treated as violating) - add_event() now returns bool indicating if event was actually added - Added fail-closed handling in state.py: if compliance check crashes, reject event - Updated tests to verify fail-closed behavior and return values Co-authored-by: openhands --- .../sdk/conversation/compliance/monitor.py | 26 ++++++++++++------- .../openhands/sdk/conversation/state.py | 10 +++++-- .../conversation/compliance/test_monitor.py | 15 ++++++++--- .../compliance/test_state_integration.py | 15 +++++++---- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py index 73e90c4158..aafa91ecdb 100644 --- a/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py +++ b/openhands-sdk/openhands/sdk/conversation/compliance/monitor.py @@ -127,6 +127,10 @@ def _update_state(self, event: LLMConvertibleEvent) -> None: def process_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation]: """Check an event for violations and update state. + Fail-closed semantics: if checking crashes, the event is treated as + violating (state is not updated). Only events that pass checking + without violations have their state updated. + Args: event: The event to process. @@ -134,6 +138,7 @@ def process_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation] List of violations detected (empty if compliant). """ violations: list[ComplianceViolation] = [] + check_failed = False try: violation = self._check_tool_call_sequence(event) @@ -151,14 +156,17 @@ def process_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation] event.id, e, ) - - try: - self._update_state(event) - except Exception as e: - logger.exception( - "Error updating compliance state for event %s: %s", - event.id, - e, - ) + check_failed = True + + # Only update state if check succeeded and no violations + if not check_failed and not violations: + try: + self._update_state(event) + except Exception as e: + logger.exception( + "Error updating compliance state for event %s: %s", + event.id, + e, + ) return violations diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index e9b93c5528..cd9d4dc00b 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -535,7 +535,7 @@ def compliance_monitor(self) -> APIComplianceMonitor: self._compliance_monitor = APIComplianceMonitor() return self._compliance_monitor - def add_event(self, event: Event) -> None: + def add_event(self, event: Event) -> bool: """Add an event to the conversation, checking for API compliance. This is the only supported way to add events to the conversation. @@ -548,6 +548,9 @@ def add_event(self, event: Event) -> None: Args: event: The event to add to the conversation. + + Returns: + True if the event was added, False if rejected due to violations. """ # Check for compliance violations only for LLM-convertible events if isinstance(event, LLMConvertibleEvent): @@ -555,10 +558,13 @@ def add_event(self, event: Event) -> None: violations = self.compliance_monitor.process_event(event) if violations: # Reject events with violations - return + return False except Exception as e: logger.exception( "Error checking compliance for event %s: %s", event.id, e ) + # Fail-closed: reject event if compliance check crashes + return False self._events.append(event) + return True diff --git a/tests/sdk/conversation/compliance/test_monitor.py b/tests/sdk/conversation/compliance/test_monitor.py index 5f2cf31346..fe4ba1b483 100644 --- a/tests/sdk/conversation/compliance/test_monitor.py +++ b/tests/sdk/conversation/compliance/test_monitor.py @@ -132,22 +132,31 @@ def test_monitor_parallel_tool_calls(): def test_monitor_handles_check_exception_gracefully(): - """Monitor should handle exceptions in check gracefully. + """Monitor should handle exceptions in check gracefully (fail-closed). If _check_tool_call_sequence raises an exception, it should be caught - and logged, not crash the monitor. This ensures observation mode is robust. + and logged, not crash the monitor. State should NOT be updated (fail-closed). """ monitor = APIComplianceMonitor() + action = make_action_event(tool_call_id="call_check_err") + monitor.process_event(action) + initial_pending = len(monitor.state.pending_tool_call_ids) + + # Simulate another action where check fails with patch.object( monitor, "_check_tool_call_sequence", side_effect=ValueError("Oops!") ): # Should not raise - the exception should be caught and logged - violations = monitor.process_event(make_user_message_event()) + violations = monitor.process_event(make_action_event(tool_call_id="call_2")) # The monitor should continue working despite the error assert violations == [] + # Fail-closed: state should NOT be updated when check fails + assert len(monitor.state.pending_tool_call_ids) == initial_pending + assert "call_2" not in monitor.state.pending_tool_call_ids + def test_monitor_handles_update_exception_gracefully(): """Monitor should handle exceptions in state update gracefully. diff --git a/tests/sdk/conversation/compliance/test_state_integration.py b/tests/sdk/conversation/compliance/test_state_integration.py index dee726359b..d373d47794 100644 --- a/tests/sdk/conversation/compliance/test_state_integration.py +++ b/tests/sdk/conversation/compliance/test_state_integration.py @@ -42,12 +42,13 @@ def conversation_state(temp_workspace, mock_agent): def test_add_event_appends_to_event_log(conversation_state): - """add_event() should append events to the event log.""" + """add_event() should append events to the event log and return True.""" initial_count = len(conversation_state.events) user_msg = make_user_message_event("Hello") - conversation_state.add_event(user_msg) + result = conversation_state.add_event(user_msg) + assert result is True assert len(conversation_state.events) == initial_count + 1 assert conversation_state.events[-1].id == user_msg.id @@ -104,11 +105,12 @@ def test_add_event_normal_flow_no_violations(conversation_state, caplog): def test_add_event_rejects_on_violation(conversation_state, caplog): - """Events with violations should be rejected (not added to event log).""" + """Events with violations should be rejected (not added) and return False.""" import logging action = make_action_event(tool_call_id="call_1") - conversation_state.add_event(action) + result = conversation_state.add_event(action) + assert result is True initial_count = len(conversation_state.events) @@ -116,7 +118,10 @@ def test_add_event_rejects_on_violation(conversation_state, caplog): user_msg = make_user_message_event() with caplog.at_level(logging.WARNING): - conversation_state.add_event(user_msg) + result = conversation_state.add_event(user_msg) + + # Should return False for rejected event + assert result is False # Event should NOT be in the log assert len(conversation_state.events) == initial_count From d03e682e80274d531930dbb607d152bf23a3f41b Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 3 Mar 2026 16:55:29 +0000 Subject: [PATCH 13/13] fix: Correct events property return type and add get_index to EventsListBase - Change ConversationState.events return type from Sequence[Event] to EventsListBase - Add get_index abstract method to EventsListBase for event lookup by ID - Implement get_index in RemoteEventsList class This fixes pyright type errors where code was calling .append() and .get_index() on state.events, which are not available on Sequence[Event]. Co-authored-by: openhands --- .../openhands/sdk/conversation/events_list_base.py | 6 ++++++ .../sdk/conversation/impl/remote_conversation.py | 8 ++++++++ openhands-sdk/openhands/sdk/conversation/state.py | 5 +++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/openhands-sdk/openhands/sdk/conversation/events_list_base.py b/openhands-sdk/openhands/sdk/conversation/events_list_base.py index 0f5f77239c..784fe7f086 100644 --- a/openhands-sdk/openhands/sdk/conversation/events_list_base.py +++ b/openhands-sdk/openhands/sdk/conversation/events_list_base.py @@ -2,6 +2,7 @@ from collections.abc import Sequence from openhands.sdk.event import Event +from openhands.sdk.event.types import EventID class EventsListBase(Sequence[Event], ABC): @@ -15,3 +16,8 @@ class EventsListBase(Sequence[Event], ABC): def append(self, event: Event) -> None: """Add a new event to the list.""" ... + + @abstractmethod + def get_index(self, event_id: EventID) -> int: + """Return the integer index for a given event_id.""" + ... diff --git a/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py index 32762bb2e4..895a03dfe7 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py @@ -352,6 +352,14 @@ def append(self, event: Event) -> None: """Add a new event to the list (for compatibility with EventLog interface).""" self.add_event(event) + def get_index(self, event_id: str) -> int: + """Return the integer index for a given event_id.""" + with self._lock: + for idx, event in enumerate(self._cached_events): + if event.id == event_id: + return idx + raise KeyError(f"Unknown event_id: {event_id}") + def create_default_callback(self) -> ConversationCallbackType: """Create a default callback that adds events to this list.""" diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index cd9d4dc00b..62b93fcdcd 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -13,6 +13,7 @@ from openhands.sdk.conversation.compliance import APIComplianceMonitor from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.conversation.event_store import EventLog +from openhands.sdk.conversation.events_list_base import EventsListBase from openhands.sdk.conversation.fifo_lock import FIFOLock from openhands.sdk.conversation.persistence_const import BASE_STATE, EVENTS_DIR from openhands.sdk.conversation.secret_registry import SecretRegistry @@ -213,10 +214,10 @@ def _handle_legacy_fields(cls, data: Any) -> Any: return data @property - def events(self) -> Sequence[Event]: + def events(self) -> EventsListBase: """Read-only view of conversation events. - Returns events as a Sequence to discourage direct mutation. + Returns events as EventsListBase to discourage direct mutation. Use add_event() to add new events with compliance monitoring. """ return self._events