Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
96db244
feat: Add API compliance monitoring for conversation events
openhands-agent Feb 25, 2026
b9a7910
refactor: Address review feedback on API compliance monitor
openhands-agent Feb 25, 2026
28d034d
refactor: Address review feedback on API compliance monitor
openhands-agent Feb 25, 2026
a6f0f77
Merge branch 'main' into api-compliance-monitor
csmith49 Feb 25, 2026
9df4326
Merge branch 'main' into api-compliance-monitor
csmith49 Feb 26, 2026
4e28e19
Merge branch 'main' into api-compliance-monitor
csmith49 Feb 27, 2026
ce3213b
Add error handling for observation mode robustness
openhands-agent Feb 27, 2026
c087f2e
Collapse compliance properties into single monitor method
openhands-agent Feb 27, 2026
770723a
Extract compliance patterns to shared module with unit tests
openhands-agent Feb 27, 2026
e61d9c0
Merge branch 'main' into api-compliance-monitor
csmith49 Mar 2, 2026
1c3bef3
fix: reject events with API compliance violations
openhands-agent Mar 3, 2026
54a4e25
Merge branch 'main' into api-compliance-monitor
csmith49 Mar 3, 2026
2eada09
Update openhands-sdk/openhands/sdk/conversation/compliance/monitor.py
csmith49 Mar 3, 2026
aa41547
Update openhands-sdk/openhands/sdk/conversation/compliance/__init__.py
csmith49 Mar 3, 2026
d816d44
Address review comments: fix docs and events property type
openhands-agent Mar 3, 2026
a0a3470
Update openhands-sdk/openhands/sdk/conversation/impl/local_conversati…
csmith49 Mar 3, 2026
89ae8b3
fix: Improve compliance monitor exception handling and add_event retu…
openhands-agent Mar 3, 2026
d03e682
fix: Correct events property return type and add get_index to EventsL…
openhands-agent Mar 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions openhands-sdk/openhands/sdk/conversation/compliance/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""API Compliance monitoring for conversation events.

This module provides an APIComplianceMonitor that detects and rejects violations
of LLM API requirements in the event stream. Violating events are logged and
rejected (not added to conversation). Future versions may support reconciliation
strategies.

The monitor enforces valid tool-call sequences:
- When tool calls are pending, only matching observations are allowed
- Messages cannot interleave with pending tool calls
- Tool results must reference known tool_call_ids
"""

from openhands.sdk.conversation.compliance.base import (
ComplianceState,
ComplianceViolation,
)
from openhands.sdk.conversation.compliance.monitor import APIComplianceMonitor


__all__ = [
"APIComplianceMonitor",
"ComplianceState",
"ComplianceViolation",
]
41 changes: 41 additions & 0 deletions openhands-sdk/openhands/sdk/conversation/compliance/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Base classes for API compliance monitoring."""

from dataclasses import dataclass, field

from openhands.sdk.event.types import EventID, ToolCallID


@dataclass
class ComplianceViolation:
"""Represents an API compliance violation.

Attributes:
property_name: Name of the property that was violated.
event_id: ID of the event that caused the violation.
description: Human-readable description of the violation.
context: Optional additional context (e.g., related tool_call_ids).
"""

property_name: str
event_id: EventID
description: str
context: dict[str, object] | None = None


@dataclass
class ComplianceState:
"""Shared state for tracking API compliance.

Tracks the tool call lifecycle to detect violations:
- pending_tool_call_ids: Actions awaiting results
- completed_tool_call_ids: Actions that have received results

Attributes:
pending_tool_call_ids: Tool calls that have been made but not yet
received results. Maps tool_call_id to the ActionEvent id.
completed_tool_call_ids: Tool calls that have received results.
Used to detect duplicate results.
"""

pending_tool_call_ids: dict[ToolCallID, EventID] = field(default_factory=dict)
completed_tool_call_ids: set[ToolCallID] = field(default_factory=set)
172 changes: 172 additions & 0 deletions openhands-sdk/openhands/sdk/conversation/compliance/monitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
"""API Compliance Monitor that checks events before adding to conversation."""

from openhands.sdk.conversation.compliance.base import (
ComplianceState,
ComplianceViolation,
)
from openhands.sdk.event import (
ActionEvent,
LLMConvertibleEvent,
MessageEvent,
ObservationBaseEvent,
)
from openhands.sdk.logger import get_logger


logger = get_logger(__name__)


class APIComplianceMonitor:
"""Monitors events for API compliance violations.

Enforces valid tool-call sequences by checking what events are allowed
given current state. The key invariant: when tool calls are pending,
only matching observations are allowed.

State machine:
- IDLE (no pending calls): Messages and new actions allowed
- TOOL_CALLING (pending calls): Only matching observations allowed
Currently operates in rejection mode: violating events are logged and rejected
(not added to the conversation).

Attributes:
state: Compliance state tracking pending/completed tool calls.
"""

def __init__(self) -> None:
"""Initialize the compliance monitor."""
self.state = ComplianceState()

def _check_tool_call_sequence(
self, event: LLMConvertibleEvent
) -> ComplianceViolation | None:
"""Check if an event violates the tool-call sequence property.

The rule is simple: if we have pending tool calls, only matching
observations are allowed. This covers all 8 API compliance patterns:

- a01 (unmatched_tool_use): Message while calls pending
- a02 (unmatched_tool_result): Result with unknown ID
- a03 (interleaved_user_msg): User message while calls pending
- a04 (interleaved_asst_msg): Assistant message while calls pending
- a05 (duplicate_tool_call_id): Result for already-completed ID
- a06 (wrong_tool_call_id): Result with wrong/unknown ID
- a07 (parallel_missing_result): Message before all parallel results
- a08 (parallel_wrong_order): Result before action (unknown ID)

Args:
event: The event to check.

Returns:
A ComplianceViolation if the event violates the property, None otherwise.
"""
# Actions are always allowed - they start or continue a tool-call batch
if isinstance(event, ActionEvent):
return None

# Messages require no pending tool calls
if isinstance(event, MessageEvent):
if self.state.pending_tool_call_ids:
pending_ids = list(self.state.pending_tool_call_ids.keys())
return ComplianceViolation(
property_name="interleaved_message",
event_id=event.id,
description=(
f"Message interleaved with {len(pending_ids)} pending "
f"tool call(s)"
),
context={"pending_tool_call_ids": pending_ids},
)
return None

# Observations must match a known tool_call_id
if isinstance(event, ObservationBaseEvent):
tool_call_id = event.tool_call_id

# Check for valid match (pending)
if tool_call_id in self.state.pending_tool_call_ids:
return None # Valid - completes a pending call

# Check for duplicate (already completed)
if tool_call_id in self.state.completed_tool_call_ids:
return ComplianceViolation(
property_name="duplicate_tool_result",
event_id=event.id,
description=(
f"Duplicate tool result for tool_call_id: {tool_call_id}"
),
context={"tool_call_id": tool_call_id},
)

# Unknown ID - orphan result (covers a02, a06, a08)
return ComplianceViolation(
property_name="unmatched_tool_result",
event_id=event.id,
description=(
f"Tool result references unknown tool_call_id: {tool_call_id}"
),
context={"tool_call_id": tool_call_id},
)

return None

def _update_state(self, event: LLMConvertibleEvent) -> None:
"""Update compliance state after processing an event.

Tracks the tool-call lifecycle:
- ActionEvent: Add to pending
- ObservationBaseEvent: Move from pending to completed
"""
if isinstance(event, ActionEvent):
self.state.pending_tool_call_ids[event.tool_call_id] = event.id
elif isinstance(event, ObservationBaseEvent):
# Move from pending to completed (if it was pending)
self.state.pending_tool_call_ids.pop(event.tool_call_id, None)
self.state.completed_tool_call_ids.add(event.tool_call_id)

def process_event(self, event: LLMConvertibleEvent) -> list[ComplianceViolation]:
"""Check an event for violations and update state.

Fail-closed semantics: if checking crashes, the event is treated as
violating (state is not updated). Only events that pass checking
without violations have their state updated.

Args:
event: The event to process.

Returns:
List of violations detected (empty if compliant).
"""
violations: list[ComplianceViolation] = []
check_failed = False

try:
violation = self._check_tool_call_sequence(event)
if violation is not None:
violations.append(violation)
logger.warning(
"API compliance violation detected: %s - %s (event_id=%s)",
violation.property_name,
violation.description,
violation.event_id,
)
except Exception as e:
logger.exception(
"Error checking compliance for event %s: %s",
event.id,
e,
)
check_failed = True

# Only update state if check succeeded and no violations
if not check_failed and not violations:
try:
self._update_state(event)
except Exception as e:
logger.exception(
"Error updating compliance state for event %s: %s",
event.id,
e,
)

return violations
6 changes: 6 additions & 0 deletions openhands-sdk/openhands/sdk/conversation/events_list_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections.abc import Sequence

from openhands.sdk.event import Event
from openhands.sdk.event.types import EventID


class EventsListBase(Sequence[Event], ABC):
Expand All @@ -15,3 +16,8 @@ class EventsListBase(Sequence[Event], ABC):
def append(self, event: Event) -> None:
"""Add a new event to the list."""
...

@abstractmethod
def get_index(self, event_id: EventID) -> int:
"""Return the integer index for a given event_id."""
...
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,16 @@ def __init__(
cipher=cipher,
)

# Default callback: persist every event to state
# Default callback: persist every event to state with compliance checking
def _default_callback(e):
# This callback runs while holding the conversation state's lock
# (see BaseConversation.compose_callbacks usage inside `with self._state:`
# regions), so updating state here is thread-safe.
self._state.events.append(e)
#
# Use add_event() to check API compliance before appending.
# Events with violations are logged and rejected (not added to event log).
# Violations are logged but events are still processed.
self._state.add_event(e)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think when we modify self._state we use a lock... so I wonder, does it mean that the compliance code is under it? It must be, right 🤔

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ i think so, we hold the lock when default_callback is executed.

@OpenHands please review the code to confirm this. Do not change any code, just answer my question.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm on it! xingyaoww can track my progress at all-hands.dev

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Summary

I was asked to review the code to confirm whether the API compliance monitoring code runs under a lock when self._state is modified.

Answer: Yes, the compliance code does run under the lock.

Here's the evidence from the code:

  1. ConversationState implements a FIFOLock via __enter__/__exit__ context manager methods (lines 506-513 in state.py).

  2. agent.step() is called inside with self._state: (lines 561, 619-621 in local_conversation.py), which acquires the lock.

  3. Inside agent.step(), the on_event callback is invoked, which eventually calls _default_callback.

  4. _default_callback calls self._state.add_event(e) (line 190), which runs the compliance check via self.compliance_monitor.process_event(event).

  5. The code explicitly documents this in the comment on lines 183-185:

    # This callback runs while holding the conversation state's lock
    # (see BaseConversation.compose_callbacks usage inside `with self._state:`
    # regions), so updating state here is thread-safe.

Conclusion: The compliance monitoring state modifications are thread-safe because _default_callback (which calls add_event()compliance_monitor.process_event()) runs while the FIFOLock on ConversationState is held.

No code changes were made as per the user's instructions.

# Track user MessageEvent IDs here so hook callbacks (which may
# synthesize or alter user messages) are captured in one place.
if isinstance(e, MessageEvent) and e.source == "user":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,14 @@ def append(self, event: Event) -> None:
"""Add a new event to the list (for compatibility with EventLog interface)."""
self.add_event(event)

def get_index(self, event_id: str) -> int:
"""Return the integer index for a given event_id."""
with self._lock:
for idx, event in enumerate(self._cached_events):
if event.id == event_id:
return idx
raise KeyError(f"Unknown event_id: {event_id}")

def create_default_callback(self) -> ConversationCallbackType:
"""Create a default callback that adds events to this list."""

Expand Down
Loading
Loading