DRAFT: refactor(llm): extract LLMCapabilities class from LLM #2279
New file added in this PR (+272 lines):

```python
"""Model capability detection and validation for LLM instances.

This module extracts capability-related logic from the LLM class to improve
maintainability and testability. It handles:
- Model information lookup from litellm
- Context window validation
- Vision support detection
- Prompt caching support detection
- Responses API support detection
"""

from __future__ import annotations

import os
import warnings
from dataclasses import dataclass
from typing import Final

from litellm.types.utils import ModelInfo
from litellm.utils import supports_vision
from pydantic import SecretStr

from openhands.sdk.llm.exceptions import LLMContextWindowTooSmallError
from openhands.sdk.llm.utils.model_features import get_features
from openhands.sdk.llm.utils.model_info import get_litellm_model_info
from openhands.sdk.logger import get_logger


logger = get_logger(__name__)

__all__ = ["CapabilitiesConfig", "LLMCapabilities", "ModelInfo"]


@dataclass(frozen=True, slots=True)
class CapabilitiesConfig:
    """Configuration for LLMCapabilities initialization.

    Groups the parameters needed to detect model capabilities, keeping
    the LLMCapabilities constructor signature stable as new fields are added.
    """

    model: str
    model_canonical_name: str | None
    base_url: str | None
    api_key: SecretStr | str | None
    disable_vision: bool
    caching_prompt: bool


# Minimum context window size required for OpenHands to function properly.
# Based on typical usage: system prompt (~2k) + conversation history (~4k)
# + tool definitions (~2k) + working memory (~8k) = ~16k minimum.
MIN_CONTEXT_WINDOW_TOKENS: Final[int] = 16384

# Environment variable to override the minimum context window check
ENV_ALLOW_SHORT_CONTEXT_WINDOWS: Final[str] = "ALLOW_SHORT_CONTEXT_WINDOWS"

# Default max output tokens when model info only provides 'max_tokens' (ambiguous).
# Some providers use 'max_tokens' for the total context window, not output limit.
# This cap prevents requesting output that exceeds the context window.
# 16384 is a safe default that works for most models (GPT-4o: 16k, Claude: 8k).
DEFAULT_MAX_OUTPUT_TOKENS_CAP: Final[int] = 16384

# Model-specific output token limits.
# These override litellm's model_info when a substring match is found.
# The limit is applied as an upper cap: if litellm reports a higher value,
# it's clamped down; if the model isn't in model_info at all, this value is used.
MODEL_OUTPUT_TOKEN_LIMITS: Final[dict[str, int]] = {
    "claude-3-7-sonnet": 64000,
    "claude-sonnet-4": 64000,
    "kimi-k2-thinking": 64000,
    "o3": 100000,
}


class LLMCapabilities:
    """Detects and caches model capabilities.

    This class encapsulates capability detection for LLM models, including:
    - Vision support
    - Prompt caching support
    - Responses API support
    - Context window validation
    - Auto-detection of token limits from model info

    It is initialized with model configuration and caches model info from litellm.
    Token limits are auto-detected and exposed as ``detected_max_input_tokens``
    and ``detected_max_output_tokens``. The caller (LLM) owns the resolution of
    user overrides vs detected values.

    Example:
        >>> config = CapabilitiesConfig(
        ...     model="claude-sonnet-4-20250514",
        ...     model_canonical_name=None,
        ...     base_url=None,
        ...     api_key=SecretStr("key"),
        ...     disable_vision=False,
        ...     caching_prompt=True,
        ... )
        >>> caps = LLMCapabilities(config)
        >>> caps.vision_is_active()
        True
        >>> caps.is_caching_prompt_active()
        True
    """

    def __init__(self, config: CapabilitiesConfig) -> None:
        """Initialize capabilities detection.

        Args:
            config: Configuration for capability detection.
        """
        self._config = config

        # Auto-detected token limits (never user overrides)
        self.detected_max_input_tokens: int | None = None
        self.detected_max_output_tokens: int | None = None

        # Internal cache for model info
        self._model_info: ModelInfo | None = None

        # Initialize model info and capabilities
        self._init_model_info_and_caps()

    @property
    def model(self) -> str:
        """Return the model name."""
        return self._config.model

    @property
    def model_name_for_capabilities(self) -> str:
        """Return canonical name for capability lookups (e.g., vision support)."""
        return self._config.model_canonical_name or self._config.model

    @property
    def model_info(self) -> ModelInfo | None:
        """Return the cached model info dictionary."""
        return self._model_info

    def _init_model_info_and_caps(self) -> None:
        """Initialize model info and auto-detect token limits."""
        self._model_info = get_litellm_model_info(
            secret_api_key=self._config.api_key,
            base_url=self._config.base_url,
            model=self.model_name_for_capabilities,
        )

        # Context window (max_input_tokens)
        if self._model_info is not None and isinstance(
            self._model_info.get("max_input_tokens"), int
        ):
            self.detected_max_input_tokens = self._model_info.get("max_input_tokens")

        # Validate context window size
        self._validate_context_window_size()

        # Auto-detect max_output_tokens
        self._auto_detect_max_output_tokens()

    def _auto_detect_max_output_tokens(self) -> None:
        """Auto-detect max_output_tokens from model info."""
        model = self._config.model

        # 1. Check model-specific overrides (from MODEL_OUTPUT_TOKEN_LIMITS)
        for model_prefix, limit in MODEL_OUTPUT_TOKEN_LIMITS.items():
```
Collaborator (inline comment on the loop above):

🔴 Critical: Early return changes o3 behavior. This early return means model_info is never checked for models matching MODEL_OUTPUT_TOKEN_LIMITS.
- For Claude models: the old behavior also skipped model_info (the checks were in an if/elif), so no change.
- For o3 models: the old behavior checked model_info FIRST, then applied 100k as a cap. If model_info said max_output_tokens=50k, the old code kept 50k; the new code unconditionally sets 100k.
- Test gap: your test

Is the new behavior (always 100k for o3) intentional? If so, update the comment on lines 68-72 and the PR description. If not, move the model_info check before the MODEL_OUTPUT_TOKEN_LIMITS loop.
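To make the gap concrete, here is a minimal test sketch (an illustration, not the reviewer's elided test) that would pin down the cap semantics for o3. The module path and patch target are assumptions, since the file's path is not shown in this view, and the assertion encodes cap behavior rather than the current early-return behavior.

```python
# Hypothetical regression test for the o3 cap-vs-override question above.
# Assumptions: the new module is importable as openhands.sdk.llm.llm_capabilities
# (path not shown here), and the intended semantics are "model_info first,
# MODEL_OUTPUT_TOKEN_LIMITS as a ceiling".
from unittest.mock import patch

from pydantic import SecretStr

from openhands.sdk.llm.llm_capabilities import CapabilitiesConfig, LLMCapabilities


def test_o3_output_limit_is_a_cap_not_an_override():
    # model_info reports a smaller output limit than the 100k o3 entry.
    fake_info = {"max_input_tokens": 200_000, "max_output_tokens": 50_000}
    with patch(
        "openhands.sdk.llm.llm_capabilities.get_litellm_model_info",
        return_value=fake_info,
    ):
        caps = LLMCapabilities(
            CapabilitiesConfig(
                model="o3",
                model_canonical_name=None,
                base_url=None,
                api_key=SecretStr("key"),
                disable_vision=False,
                caching_prompt=False,
            )
        )
    # Under cap semantics this stays at 50k; the current early return yields 100k.
    assert caps.detected_max_output_tokens == 50_000
```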
The new file continues:

```python
            if model_prefix in model:
                self.detected_max_output_tokens = limit
                logger.debug(
                    "Setting max_output_tokens to %s for %s (model-specific limit)",
                    limit,
                    model,
                )
                return

        # 2. Fall back to model_info detection
        if self._model_info is not None:
            if isinstance(self._model_info.get("max_output_tokens"), int):
                self.detected_max_output_tokens = self._model_info.get(
                    "max_output_tokens"
                )
            elif isinstance(
                max_tokens_value := self._model_info.get("max_tokens"), int
            ):
                # 'max_tokens' is ambiguous: some providers use it for total
                # context window, not output limit. Cap it to avoid requesting
                # output that exceeds the context window.
                self.detected_max_output_tokens = min(
                    max_tokens_value, DEFAULT_MAX_OUTPUT_TOKENS_CAP
                )
                if max_tokens_value > DEFAULT_MAX_OUTPUT_TOKENS_CAP:
                    logger.debug(
                        "Capping max_output_tokens from %s to %s for %s "
                        "(max_tokens may be context window, not output)",
                        max_tokens_value,
                        self.detected_max_output_tokens,
                        model,
                    )

    def _validate_context_window_size(self) -> None:
        """Validate that the context window is large enough for OpenHands."""
        # Allow override via environment variable
        if os.environ.get(ENV_ALLOW_SHORT_CONTEXT_WINDOWS, "").lower() in (
            "true",
            "1",
            "yes",
        ):
            return

        # Unknown context window - cannot validate
        if self.detected_max_input_tokens is None:
            return

        # Check minimum requirement
        if self.detected_max_input_tokens < MIN_CONTEXT_WINDOW_TOKENS:
            raise LLMContextWindowTooSmallError(
                self.detected_max_input_tokens, MIN_CONTEXT_WINDOW_TOKENS
            )

    def vision_is_active(self) -> bool:
        """Check if vision is supported and enabled.

        Returns:
            True if the model supports vision and it's not disabled.
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return not self._config.disable_vision and self._supports_vision()

    def _supports_vision(self) -> bool:
        """Check if the model supports vision capabilities.

        Returns:
            True if model is vision capable. Returns False if model not
            supported by litellm.
        """
        # litellm.supports_vision currently returns False for 'openai/gpt-...'
        # or 'anthropic/claude-...' (with prefixes) but model_info will have
        # the correct value for some reason.
        # Check both the full model name and the name after proxy prefix
        model_for_caps = self.model_name_for_capabilities
        return bool(
            supports_vision(model_for_caps)
            or supports_vision(model_for_caps.split("/")[-1])
            or (
                self._model_info is not None
                and self._model_info.get("supports_vision", False)
            )
        )

    def is_caching_prompt_active(self) -> bool:
        """Check if prompt caching is supported and enabled for current model.

        Returns:
            True if prompt caching is supported and enabled for the given model.
        """
        if not self._config.caching_prompt:
            return False
        # We don't need to look up model_info, because
        # only Anthropic models need explicit caching breakpoints
        return (
            self._config.caching_prompt
            and get_features(self.model_name_for_capabilities).supports_prompt_cache
        )

    def uses_responses_api(self) -> bool:
        """Check if this model uses the OpenAI Responses API path.

        Returns:
            True if the model should use the Responses API.
        """
        # by default, uses = supports
        return get_features(self.model_name_for_capabilities).supports_responses_api
```
Review comment on the MODEL_OUTPUT_TOKEN_LIMITS doc comment:

🟠 Important: Comment doesn't match implementation. The comment claims the limit is an "upper cap" that clamps down higher values from litellm, but the implementation unconditionally overrides model_info (see the line 165 early return).

The behavior change: for o3 models, the old code applied the limit as a cap AFTER model_info detection. That meant if model_info said max_output_tokens=50k, it kept 50k; if model_info said 150k, it clamped to 100k. The new code checks MODEL_OUTPUT_TOKEN_LIMITS first and returns early, so model_info is never consulted for o3 models. This changes behavior for o3 models where model_info might have a value < 100k, and it contradicts the PR's claim of "No external behavior changes".

Either fix the comment to say "these override model_info" OR fix the implementation to actually apply limits as caps.
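A rough sketch of the cap approach (an illustration, not the reviewer's exact snippet): consult model_info first, then treat any matching MODEL_OUTPUT_TOKEN_LIMITS entry as an upper bound rather than an unconditional override.

```python
# Sketch only: reuses the names from the new module above, but this reordering
# is an assumption about the intended "cap" semantics, not code from the PR.
def _auto_detect_max_output_tokens(self) -> None:
    """Auto-detect max_output_tokens, treating model-specific limits as caps."""
    model = self._config.model
    detected: int | None = None

    # 1. Detect from model_info first.
    if self._model_info is not None:
        value = self._model_info.get("max_output_tokens")
        if isinstance(value, int):
            detected = value
        elif isinstance(max_tokens_value := self._model_info.get("max_tokens"), int):
            # 'max_tokens' may be the context window, so cap it defensively.
            detected = min(max_tokens_value, DEFAULT_MAX_OUTPUT_TOKENS_CAP)

    # 2. Apply model-specific limits as a ceiling (or a fallback when nothing
    #    was detected from model_info).
    for model_prefix, limit in MODEL_OUTPUT_TOKEN_LIMITS.items():
        if model_prefix in model:
            detected = limit if detected is None else min(detected, limit)
            break

    self.detected_max_output_tokens = detected
```

Under this variant, litellm values lower than the hard-coded limits are preserved (the o3 test sketched earlier would pass), while values above the limit are still clamped down.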