Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
272 changes: 272 additions & 0 deletions openhands-sdk/openhands/sdk/llm/capabilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
"""Model capability detection and validation for LLM instances.

This module extracts capability-related logic from the LLM class to improve
maintainability and testability. It handles:
- Model information lookup from litellm
- Context window validation
- Vision support detection
- Prompt caching support detection
- Responses API support detection
"""

from __future__ import annotations

import os
import warnings
from dataclasses import dataclass
from typing import Final

from litellm.types.utils import ModelInfo
from litellm.utils import supports_vision
from pydantic import SecretStr

from openhands.sdk.llm.exceptions import LLMContextWindowTooSmallError
from openhands.sdk.llm.utils.model_features import get_features
from openhands.sdk.llm.utils.model_info import get_litellm_model_info
from openhands.sdk.logger import get_logger


logger = get_logger(__name__)

__all__ = ["CapabilitiesConfig", "LLMCapabilities", "ModelInfo"]


@dataclass(frozen=True, slots=True)
class CapabilitiesConfig:
    """Configuration for LLMCapabilities initialization.

    Groups the parameters needed to detect model capabilities, keeping
    the LLMCapabilities constructor signature stable as new fields are added.
    """

    # Model identifier as configured by the user (may include a proxy prefix).
    model: str
    # Canonical model name used for capability lookups; falls back to `model`
    # when None (see LLMCapabilities.model_name_for_capabilities).
    model_canonical_name: str | None
    # Optional API base URL, forwarded to the litellm model-info lookup.
    base_url: str | None
    # API key for the provider; accepted as SecretStr or plain str.
    api_key: SecretStr | str | None
    # When True, vision is reported inactive even if the model supports it.
    disable_vision: bool
    # When True, prompt caching is enabled for models that support it.
    caching_prompt: bool


# Minimum context window size required for OpenHands to function properly.
# Based on typical usage: system prompt (~2k) + conversation history (~4k)
# + tool definitions (~2k) + working memory (~8k) = ~16k minimum.
MIN_CONTEXT_WINDOW_TOKENS: Final[int] = 16384

# Environment variable to override the minimum context window check.
ENV_ALLOW_SHORT_CONTEXT_WINDOWS: Final[str] = "ALLOW_SHORT_CONTEXT_WINDOWS"

# Default max output tokens when model info only provides 'max_tokens' (ambiguous).
# Some providers use 'max_tokens' for the total context window, not output limit.
# This cap prevents requesting output that exceeds the context window.
# 16384 is a safe default that works for most models (GPT-4o: 16k, Claude: 8k).
DEFAULT_MAX_OUTPUT_TOKENS_CAP: Final[int] = 16384

# Model-specific output token limits.
# These override litellm's model_info when a substring match is found.
# The limit is applied as an upper cap: if litellm reports a higher value,
# it's clamped down; if the model isn't in model_info at all, this value is used.
MODEL_OUTPUT_TOKEN_LIMITS: Final[dict[str, int]] = {
    "claude-3-7-sonnet": 64000,
    "claude-sonnet-4": 64000,
    "kimi-k2-thinking": 64000,
    "o3": 100000,
}


class LLMCapabilities:
    """Detects and caches model capabilities.

    This class encapsulates capability detection for LLM models, including:
    - Vision support
    - Prompt caching support
    - Responses API support
    - Context window validation
    - Auto-detection of token limits from model info

    It is initialized with model configuration and caches model info from litellm.
    Token limits are auto-detected and exposed as ``detected_max_input_tokens``
    and ``detected_max_output_tokens``. The caller (LLM) owns the resolution of
    user overrides vs detected values.

    Example:
        >>> config = CapabilitiesConfig(
        ...     model="claude-sonnet-4-20250514",
        ...     model_canonical_name=None,
        ...     base_url=None,
        ...     api_key=SecretStr("key"),
        ...     disable_vision=False,
        ...     caching_prompt=True,
        ... )
        >>> caps = LLMCapabilities(config)
        >>> caps.vision_is_active()
        True
        >>> caps.is_caching_prompt_active()
        True
    """

    def __init__(self, config: CapabilitiesConfig) -> None:
        """Initialize capabilities detection.

        Args:
            config: Configuration for capability detection.

        Raises:
            LLMContextWindowTooSmallError: If the detected context window is
                below the minimum required (see _validate_context_window_size).
        """
        self._config = config

        # Auto-detected token limits (never user overrides)
        self.detected_max_input_tokens: int | None = None
        self.detected_max_output_tokens: int | None = None

        # Internal cache for model info
        self._model_info: ModelInfo | None = None

        # Initialize model info and capabilities eagerly so that detected
        # limits are available immediately after construction.
        self._init_model_info_and_caps()

    @property
    def model(self) -> str:
        """Return the model name."""
        return self._config.model

    @property
    def model_name_for_capabilities(self) -> str:
        """Return canonical name for capability lookups (e.g., vision support)."""
        return self._config.model_canonical_name or self._config.model

    @property
    def model_info(self) -> ModelInfo | None:
        """Return the cached model info dictionary."""
        return self._model_info

    def _init_model_info_and_caps(self) -> None:
        """Initialize model info and auto-detect token limits."""
        self._model_info = get_litellm_model_info(
            secret_api_key=self._config.api_key,
            base_url=self._config.base_url,
            model=self.model_name_for_capabilities,
        )

        # Context window (max_input_tokens)
        if self._model_info is not None and isinstance(
            self._model_info.get("max_input_tokens"), int
        ):
            self.detected_max_input_tokens = self._model_info.get("max_input_tokens")

        # Validate context window size
        self._validate_context_window_size()

        # Auto-detect max_output_tokens
        self._auto_detect_max_output_tokens()

    def _auto_detect_max_output_tokens(self) -> None:
        """Auto-detect max_output_tokens from model info and known limits.

        Entries in MODEL_OUTPUT_TOKEN_LIMITS are applied as upper caps on top
        of whatever litellm's model_info reports: a lower provider-reported
        value is kept, a higher one is clamped down, and the limit is used
        outright when model_info has no value. This matches the documented
        "upper cap" semantics of MODEL_OUTPUT_TOKEN_LIMITS (and the original
        pre-refactor behavior for o3 models).
        """
        model = self._config.model

        # 1. Detect a base value from model_info, when available.
        base_value = self._detect_output_tokens_from_model_info(model)

        # 2. Apply model-specific limits (MODEL_OUTPUT_TOKEN_LIMITS) as caps.
        for model_prefix, limit in MODEL_OUTPUT_TOKEN_LIMITS.items():
            if model_prefix in model:
                self.detected_max_output_tokens = (
                    min(base_value, limit) if base_value is not None else limit
                )
                logger.debug(
                    "Setting max_output_tokens to %s for %s (model-specific limit)",
                    self.detected_max_output_tokens,
                    model,
                )
                return

        # 3. No model-specific limit: use the model_info value (may be None).
        self.detected_max_output_tokens = base_value

    def _detect_output_tokens_from_model_info(self, model: str) -> int | None:
        """Return the output-token limit reported by model_info, if any.

        Prefers 'max_output_tokens'; falls back to the ambiguous 'max_tokens'
        capped at DEFAULT_MAX_OUTPUT_TOKENS_CAP.

        Args:
            model: Model name, used only for log messages.

        Returns:
            The detected limit, or None when model_info provides neither key.
        """
        if self._model_info is None:
            return None

        max_output = self._model_info.get("max_output_tokens")
        if isinstance(max_output, int):
            return max_output

        max_tokens_value = self._model_info.get("max_tokens")
        if isinstance(max_tokens_value, int):
            # 'max_tokens' is ambiguous: some providers use it for total
            # context window, not output limit. Cap it to avoid requesting
            # output that exceeds the context window.
            if max_tokens_value > DEFAULT_MAX_OUTPUT_TOKENS_CAP:
                logger.debug(
                    "Capping max_output_tokens from %s to %s for %s "
                    "(max_tokens may be context window, not output)",
                    max_tokens_value,
                    DEFAULT_MAX_OUTPUT_TOKENS_CAP,
                    model,
                )
            return min(max_tokens_value, DEFAULT_MAX_OUTPUT_TOKENS_CAP)

        return None

    def _validate_context_window_size(self) -> None:
        """Validate that the context window is large enough for OpenHands.

        Raises:
            LLMContextWindowTooSmallError: If the detected context window is
                smaller than MIN_CONTEXT_WINDOW_TOKENS and the override
                environment variable is not set.
        """
        # Allow override via environment variable
        if os.environ.get(ENV_ALLOW_SHORT_CONTEXT_WINDOWS, "").lower() in (
            "true",
            "1",
            "yes",
        ):
            return

        # Unknown context window - cannot validate
        if self.detected_max_input_tokens is None:
            return

        # Check minimum requirement
        if self.detected_max_input_tokens < MIN_CONTEXT_WINDOW_TOKENS:
            raise LLMContextWindowTooSmallError(
                self.detected_max_input_tokens, MIN_CONTEXT_WINDOW_TOKENS
            )

    def vision_is_active(self) -> bool:
        """Check if vision is supported and enabled.

        Returns:
            True if the model supports vision and it's not disabled.
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return not self._config.disable_vision and self._supports_vision()

    def _supports_vision(self) -> bool:
        """Check if the model supports vision capabilities.

        Returns:
            True if model is vision capable. Returns False if model not
            supported by litellm.
        """
        # litellm.supports_vision currently returns False for 'openai/gpt-...'
        # or 'anthropic/claude-...' (with prefixes) but model_info will have
        # the correct value for some reason.
        # Check both the full model name and the name after proxy prefix
        model_for_caps = self.model_name_for_capabilities
        return bool(
            supports_vision(model_for_caps)
            or supports_vision(model_for_caps.split("/")[-1])
            or (
                self._model_info is not None
                and self._model_info.get("supports_vision", False)
            )
        )

    def is_caching_prompt_active(self) -> bool:
        """Check if prompt caching is supported and enabled for current model.

        Returns:
            True if prompt caching is supported and enabled for the given model.
        """
        if not self._config.caching_prompt:
            return False
        # We don't need to look up model_info, because only Anthropic-style
        # models need explicit caching breakpoints.
        return get_features(self.model_name_for_capabilities).supports_prompt_cache

    def uses_responses_api(self) -> bool:
        """Check if this model uses the OpenAI Responses API path.

        Returns:
            True if the model should use the Responses API.
        """
        # by default, uses = supports
        return get_features(self.model_name_for_capabilities).supports_responses_api
Loading
Loading