diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py index 9b100fdc6862..1dd3b3920bc4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py @@ -93,7 +93,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta): SELF_HARM = "self_harm" SEXUAL = "sexual" PROTECTED_MATERIAL = "protected_material" - XPIA = "xpia" + XPIA = "indirect_attack" GROUNDEDNESS = "generic_groundedness" CODE_VULNERABILITY = "code_vulnerability" UNGROUNDED_ATTRIBUTES = "ungrounded_attributes" @@ -108,7 +108,7 @@ class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta): enum over time. """ - ECI = "eci" + ECI = "election_critical_information" # Mapping of evaluation metrics to their scoring patterns diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py index a469a2050be1..03ac47e5daa7 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py @@ -2,7 +2,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- import asyncio +import copy import importlib.metadata +import logging import math import re import time @@ -13,14 +15,15 @@ from urllib.parse import urlparse from string import Template from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient -from azure.ai.evaluation._common.onedp.models import QueryResponseInlineMessage +from azure.ai.evaluation._common.onedp.models import QueryResponseInlineMessage, EvaluatorMessage +from azure.ai.evaluation._common.onedp._utils.model_base import SdkJSONEncoder from azure.core.exceptions import HttpResponseError import jwt from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException -from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client +from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client, get_http_client from azure.ai.evaluation._model_configurations import AzureAIProject from azure.ai.evaluation._user_agent import UserAgentSingleton from azure.ai.evaluation._common.utils import is_onedp_project @@ -38,6 +41,8 @@ from .utils import get_harm_severity_level, retrieve_content_type +LOGGER = logging.getLogger(__name__) + USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = { "DEFAULT": Template("{$query}{$response}"), } @@ -252,7 +257,7 @@ async def submit_request( http_response = await client.post(url, json=payload, headers=headers) if http_response.status_code != 202: - print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text())) + LOGGER.error("Fail evaluating '%s' with error message: %s", payload["UserTextList"], http_response.text()) http_response.raise_for_status() result = http_response.json() operation_id = result["location"].split("/")[-1] @@ -933,11 +938,22 @@ def _build_sync_eval_payload( # Prepare context if available context = None if data.get("context") is not None: - context = " ".join(c["content"] for c in data["context"]["contexts"]) + # Handle both string 
context and dict with contexts list + context_data = data["context"] + if isinstance(context_data, str): + # Context is already a string + context = context_data + elif isinstance(context_data, dict) and "contexts" in context_data: + # Context is a dict with contexts list + context = " ".join(c["content"] for c in context_data["contexts"]) + elif isinstance(context_data, dict): + # Context is a dict but might be in a different format + # Try to get content directly or convert to string + context = context_data.get("content", str(context_data)) # Build QueryResponseInlineMessage object item_content = QueryResponseInlineMessage( - query=data.get("query", ""), + query=data.get("query", "query"), # TODO: remove default query once sync evals supports no query response=data.get("response", ""), context=context, tools=data.get("tool_calls"), @@ -971,74 +987,6 @@ def _build_sync_eval_payload( return sync_eval_payload -def _parse_sync_eval_result( - eval_result, metric_name: str, metric_display_name: Optional[str] = None -) -> Dict[str, Union[str, float]]: - """Parse the result from sync_evals response (EvalRunOutputItem) into the standard format. - - :param eval_result: The result from sync_evals.create() call (EvalRunOutputItem). - :param metric_name: The evaluation metric name. - :type metric_name: str - :param metric_display_name: The display name for the metric. - :type metric_display_name: Optional[str] - :return: The parsed result in standard format compatible with parse_response. - :rtype: Dict[str, Union[str, float]] - """ - # Handle EvalRunOutputItem structure - # Expected structure: {'results': [{'name': 'violence', 'score': 0.0, 'reason': '...', ...}]} - - display_name = metric_display_name or metric_name - - # Handle both dict and object formats - if hasattr(eval_result, "results"): - results = eval_result.results - elif isinstance(eval_result, dict) and "results" in eval_result: - results = eval_result["results"] - else: - return {} - - if not results or len(results) == 0: - return {} - - # Find the result for our specific metric - target_result = None - for result_item in results: - if isinstance(result_item, dict): - if result_item.get("name") == metric_name or result_item.get("metric") == metric_name: - target_result = result_item - break - elif hasattr(result_item, "name") and result_item.name == metric_name: - target_result = result_item - break - - if not target_result: - return {} - - # Extract values from the result item - if isinstance(target_result, dict): - score = target_result.get("score", math.nan) - reason = target_result.get("reason", "") - # Also check properties.reasoning for additional reason text - if not reason and "properties" in target_result: - props = target_result["properties"] - if isinstance(props, dict): - reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", "")) - else: - score = getattr(target_result, "score", math.nan) - reason = getattr(target_result, "reason", "") - if not reason and hasattr(target_result, "properties"): - props = target_result.properties - if isinstance(props, dict): - reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", "")) - - # Convert score to severity level using existing logic - harm_score = score if not math.isnan(score) else math.nan - severity_level = get_harm_severity_level(harm_score) if not math.isnan(harm_score) else math.nan - - # Return in the standard format expected by the red team processor - return {display_name: severity_level, f"{display_name}_score": 
harm_score, f"{display_name}_reason": reason} - - async def evaluate_with_rai_service_sync( data: dict, metric_name: str, @@ -1076,15 +1024,28 @@ async def evaluate_with_rai_service_sync( :rtype: EvalRunOutputItem :raises: EvaluationException if project_scope is not a OneDP project """ + api_version = "2025-10-15-preview" if not is_onedp_project(project_scope): - msg = "evaluate_with_rai_service_sync only supports OneDP projects. Use evaluate_with_rai_service for legacy projects." - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.RAI_CLIENT, - category=ErrorCategory.INVALID_VALUE, - blame=ErrorBlame.USER_ERROR, - ) + # Get RAI service URL from discovery service and check service availability + token = await fetch_or_reuse_token(credential) + rai_svc_url = await get_rai_svc_url(project_scope, token) + await ensure_service_availability(rai_svc_url, token, annotation_task) + + # Submit annotation request and fetch result + url = rai_svc_url + f"/sync_evals:run?api-version={api_version}" + headers = {"aml-user-token": token, "Authorization": "Bearer " + token, "Content-Type": "application/json"} + sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id) + sync_eval_payload_json = json.dumps(sync_eval_payload, cls=SdkJSONEncoder) + + with get_http_client() as client: + http_response = client.post(url, data=sync_eval_payload_json, headers=headers) + + if http_response.status_code != 200: + LOGGER.error("Fail evaluating with error message: %s", http_response.text()) + http_response.raise_for_status() + result = http_response.json() + + return result client = AIProjectClient( endpoint=project_scope, @@ -1092,7 +1053,6 @@ async def evaluate_with_rai_service_sync( user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value), ) - # Build the sync eval payload sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id) # Call sync_evals.create() with the JSON payload eval_result = client.sync_evals.create(eval=sync_eval_payload) @@ -1101,6 +1061,207 @@ async def evaluate_with_rai_service_sync( return eval_result +def _build_sync_eval_multimodal_payload(messages, metric_name: str) -> Dict: + """Build the sync_evals payload for multimodal evaluations. + + :param messages: The conversation messages to evaluate. + :type messages: list + :param metric_name: The evaluation metric name. + :type metric_name: str + :return: The payload formatted for sync_evals requests. 
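For orientation, here is a minimal usage sketch of the reworked legacy (non-OneDP) branch of `evaluate_with_rai_service_sync`, mirroring the arguments used in the updated unit test later in this diff. The project identifiers are placeholders, and the `results` shape is an assumption based on the `EvalRunOutputItem` parsing added elsewhere in this change.

```python
# Illustrative sketch, not part of the diff: exercises the rewritten legacy branch of
# evaluate_with_rai_service_sync. Project identifiers are placeholders, and the
# "results" shape is assumed to follow the EvalRunOutputItem format that the new
# parsing helpers in this change expect.
import asyncio

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_sync
from azure.ai.evaluation._common.utils import get_harm_severity_level


async def main() -> None:
    result = await evaluate_with_rai_service_sync(
        data={"query": "what is the weather outside?", "response": "It is sunny."},
        metric_name=EvaluationMetrics.HATE_UNFAIRNESS,
        project_scope={
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<project-name>",
        },
        credential=DefaultAzureCredential(),
        annotation_task="content harm",
    )
    # The legacy path returns the parsed JSON body of the sync_evals:run call.
    for item in result.get("results", []):
        score = item.get("score", 0)
        severity = get_harm_severity_level(score)
        print(item.get("name"), score, severity, item.get("reason"))


if __name__ == "__main__":
    asyncio.run(main())
```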
+ :rtype: Dict + """ + + def _coerce_messages(raw_messages): + if not raw_messages: + return [] + if isinstance(raw_messages[0], dict): + return [copy.deepcopy(message) for message in raw_messages] + try: + from azure.ai.inference.models import ChatRequestMessage + except ImportError as ex: + error_message = ( + "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage" + ) + raise MissingRequiredPackage(message=error_message) from ex + if isinstance(raw_messages[0], ChatRequestMessage): + return [message.as_dict() for message in raw_messages] + return [copy.deepcopy(message) for message in raw_messages] + + def _normalize_message(message): + normalized = copy.deepcopy(message) + content = normalized.get("content") + if content is None: + normalized["content"] = [] + elif isinstance(content, list): + normalized["content"] = [ + copy.deepcopy(part) if isinstance(part, dict) else {"type": "text", "text": str(part)} + for part in content + ] + elif isinstance(content, dict): + normalized["content"] = [copy.deepcopy(content)] + else: + normalized["content"] = [{"type": "text", "text": str(content)}] + return normalized + + def _content_to_text(parts): + text_parts = [] + for part in parts: + if not isinstance(part, dict): + text_parts.append(str(part)) + elif part.get("text"): + text_parts.append(part["text"]) + elif part.get("type") in {"image_url", "input_image"}: + image_part = part.get("image_url") or part.get("image") + text_parts.append(json.dumps(image_part)) + elif part.get("type") == "input_text" and part.get("text"): + text_parts.append(part["text"]) + else: + text_parts.append(json.dumps(part)) + return "\n".join(filter(None, text_parts)) + + normalized_messages = [_normalize_message(message) for message in _coerce_messages(messages)] + filtered_messages = [message for message in normalized_messages if message.get("role") != "system"] + + assistant_messages = [message for message in normalized_messages if message.get("role") == "assistant"] + user_messages = [message for message in normalized_messages if message.get("role") == "user"] + content_type = retrieve_content_type(assistant_messages, metric_name) + + last_assistant_text = _content_to_text(assistant_messages[-1]["content"]) if assistant_messages else "" + last_user_text = _content_to_text(user_messages[-1]["content"]) if user_messages else "" + + if filtered_messages and filtered_messages[-1].get("role") == "assistant": + response_messages = [filtered_messages[-1]] + query_messages = filtered_messages[:-1] + else: + response_messages = [] + query_messages = filtered_messages + + properties = {} + if last_user_text: + properties["query_text"] = last_user_text + if last_assistant_text: + properties["response_text"] = last_assistant_text + if content_type: + properties["content_type"] = content_type + + item_content = { + "type": "azure_ai_evaluator_messages", + "query": query_messages, + "response": response_messages, + } + if properties: + item_content["properties"] = properties + + template = [] + if "query_text" in properties: + template.append( + { + "type": "message", + "role": "user", + "content": {"text": "{{item.properties.query_text}}"}, + } + ) + if "response_text" in properties: + template.append( + { + "type": "message", + "role": "assistant", + "content": {"text": "{{item.properties.response_text}}"}, + } + ) + + data_source = { + "type": "jsonl", + "source": {"type": "file_content", "content": {"item": item_content}}, + } + if template: + data_source["input_messages"] = 
{"type": "template", "template": template} + + data_mapping = { + "query": "{{item.query}}", + "response": "{{item.response}}", + } + if "content_type" in properties: + data_mapping["content_type"] = "{{item.properties.content_type}}" + + return { + "name": f"Safety Eval - {metric_name}", + "data_source": data_source, + "testing_criteria": [ + { + "type": "azure_ai_evaluator", + "name": metric_name, + "evaluator_name": metric_name, + "data_mapping": data_mapping, + } + ], + } + + +async def evaluate_with_rai_service_sync_multimodal( + messages, + metric_name: str, + project_scope: Union[str, AzureAIProject], + credential: TokenCredential, + scan_session_id: Optional[str] = None, +): + """Evaluate multimodal content using the sync_evals endpoint. + + :param messages: The normalized list of conversation messages. + :type messages: list + :param metric_name: The evaluation metric to use. + :type metric_name: str + :param project_scope: Azure AI project scope or endpoint. + :type project_scope: Union[str, AzureAIProject] + :param credential: Azure authentication credential. + :type credential: ~azure.core.credentials.TokenCredential + :param scan_session_id: Optional scan session identifier for correlation. + :type scan_session_id: Optional[str] + :return: The EvalRunOutputItem or legacy response payload. + :rtype: Union[Dict, EvalRunOutputItem] + """ + + api_version = "2025-10-15-preview" + sync_eval_payload = _build_sync_eval_multimodal_payload(messages, metric_name) + + if is_onedp_project(project_scope): + client = AIProjectClient( + endpoint=project_scope, + credential=credential, + user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value), + ) + + headers = {"x-ms-client-request-id": scan_session_id} if scan_session_id else None + if headers: + return client.sync_evals.create(eval=sync_eval_payload, headers=headers) + return client.sync_evals.create(eval=sync_eval_payload) + + token = await fetch_or_reuse_token(credential) + rai_svc_url = await get_rai_svc_url(project_scope, token) + await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM) + + url = rai_svc_url + f"/sync_evals:run?api-version={api_version}" + headers = { + "aml-user-token": token, + "Authorization": "Bearer " + token, + "Content-Type": "application/json", + } + if scan_session_id: + headers["x-ms-client-request-id"] = scan_session_id + + sync_eval_payload_json = json.dumps(sync_eval_payload, cls=SdkJSONEncoder) + + with get_http_client() as client: + http_response = client.post(url, data=sync_eval_payload_json, headers=headers) + + if http_response.status_code != 200: + LOGGER.error("Fail evaluating with error message: %s", http_response.text()) + http_response.raise_for_status() + + return http_response.json() + + async def evaluate_with_rai_service_multimodal( messages, metric_name: str, project_scope: Union[str, AzureAIProject], credential: TokenCredential ): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py index d87563da10b0..98b236a12d15 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py @@ -328,13 +328,17 @@ async def get_jail_break_dataset_with_type(self, type: str, **kwargs: Any) -> Li async def get_attack_objectives( self, *, + 
risk_category: str, risk_types: Optional[List[str]] = None, lang: Optional[str] = None, strategy: Optional[str] = None, + target_type: Optional[str] = None, **kwargs: Any ) -> List[_models.AttackObjective]: """Get the attack objectives. + :keyword risk_category: Risk category for the attack objectives. Required. + :paramtype risk_category: str :keyword risk_types: Risk types for the attack objectives dataset. Default value is None. :paramtype risk_types: list[str] :keyword lang: The language for the attack objectives dataset, defaults to 'en'. Default value @@ -342,6 +346,8 @@ async def get_attack_objectives( :paramtype lang: str :keyword strategy: The strategy. Default value is None. :paramtype strategy: str + :keyword target_type: The target, model/agent. Default value is None. + :paramtype target_type: str :return: list of AttackObjective :rtype: list[~raiclient.models.AttackObjective] :raises ~azure.core.exceptions.HttpResponseError: @@ -360,12 +366,14 @@ async def get_attack_objectives( cls: ClsType[List[_models.AttackObjective]] = kwargs.pop("cls", None) _request = build_rai_svc_get_attack_objectives_request( + risk_categories=[risk_category], risk_types=risk_types, lang=lang, strategy=strategy, api_version=self._config.api_version, headers=_headers, params=_params, + target_type=target_type, ) path_format_arguments = { "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/raiclient/operations/_operations.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/raiclient/operations/_operations.py index b1feb1d8c24c..aa7e31c1f7c0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/raiclient/operations/_operations.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/raiclient/operations/_operations.py @@ -117,6 +117,7 @@ def build_rai_svc_get_attack_objectives_request( # pylint: disable=name-too-lon risk_categories: Optional[List[str]] = None, lang: Optional[str] = None, strategy: Optional[str] = None, + target_type: Optional[str] = None, **kwargs: Any ) -> HttpRequest: _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) @@ -140,6 +141,8 @@ def build_rai_svc_get_attack_objectives_request( # pylint: disable=name-too-lon _params["lang"] = _SERIALIZER.query("lang", lang, "str") if strategy is not None: _params["strategy"] = _SERIALIZER.query("strategy", strategy, "str") + if target_type is not None: + _params["targetType"] = _SERIALIZER.query("target_type", target_type, "str") # Construct headers _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") @@ -586,6 +589,7 @@ def get_attack_objectives( risk_types: Optional[List[str]] = None, lang: Optional[str] = None, strategy: Optional[str] = None, + target_type: Optional[str] = None, **kwargs: Any ) -> List[_models.AttackObjective]: """Get the attack objectives. @@ -599,6 +603,8 @@ def get_attack_objectives( :paramtype lang: str :keyword strategy: The strategy. Default value is None. :paramtype strategy: str + :keyword target_type: The target, model/agent. Default value is None. 
+ :paramtype target_type: str :return: list of AttackObjective :rtype: list[~raiclient.models.AttackObjective] :raises ~azure.core.exceptions.HttpResponseError: @@ -624,6 +630,7 @@ def get_attack_objectives( api_version=self._config.api_version, headers=_headers, params=_params, + target_type=target_type, ) path_format_arguments = { "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 4f68a4c310bd..4eb0cbe3838d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, TypeVar, Union, Optional +from typing import Any, Dict, TypeVar, Union, Optional from typing_extensions import override @@ -11,7 +11,10 @@ Tasks, _InternalAnnotationTasks, ) -from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal +from azure.ai.evaluation._common.rai_service import ( + evaluate_with_rai_service_sync, + evaluate_with_rai_service_sync_multimodal, +) from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project from azure.ai.evaluation._exceptions import EvaluationException from azure.ai.evaluation._common.utils import validate_conversation @@ -129,13 +132,15 @@ async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]: validate_conversation(conversation) messages = conversation["messages"] # Run score computation based on supplied metric. - result = await evaluate_with_rai_service_multimodal( + # Convert enum to string value for the multimodal endpoint + metric_value = self._eval_metric.value if hasattr(self._eval_metric, "value") else self._eval_metric + result = await evaluate_with_rai_service_sync_multimodal( messages=messages, - metric_name=self._eval_metric, + metric_name=metric_value, project_scope=self._azure_ai_project, credential=self._credential, ) - return result + return self._parse_eval_result(result) async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]: query = eval_input.get("query", None) @@ -165,7 +170,7 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]: ) input_data["context"] = context - return await evaluate_with_rai_service( # type: ignore + eval_result = await evaluate_with_rai_service_sync( # type: ignore metric_name=self._eval_metric, data=input_data, project_scope=self._azure_ai_project, @@ -174,6 +179,98 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]: evaluator_name=self.__class__.__name__, ) + # Parse the EvalRunOutputItem format to the expected dict format + return self._parse_eval_result(eval_result) + + def _parse_eval_result(self, eval_result) -> Dict[str, T]: + """Parse the EvalRunOutputItem format into the expected dict format. + + :param eval_result: The result from evaluate_with_rai_service_sync (EvalRunOutputItem). + :return: The parsed result in the expected format. 
+ :rtype: Dict[str, T] + """ + # Handle EvalRunOutputItem structure + if hasattr(eval_result, "results") or (isinstance(eval_result, dict) and "results" in eval_result): + results = eval_result.results if hasattr(eval_result, "results") else eval_result.get("results", []) + + # Find the result matching our metric + for result_item in results: + result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__ + # Compare against both the enum and its string value + metric_name = result_dict.get("name") or result_dict.get("metric") + if metric_name == self._eval_metric or metric_name == self._eval_metric.value: + # Extract values from EvalRunOutputItem result format + score = result_dict.get("score", 0) + reason = result_dict.get("reason", "") + + # Special handling for evaluators that use _label format + if self._eval_metric in [ + EvaluationMetrics.CODE_VULNERABILITY, + EvaluationMetrics.PROTECTED_MATERIAL, + EvaluationMetrics.UNGROUNDED_ATTRIBUTES, + EvaluationMetrics.XPIA, + _InternalEvaluationMetrics.ECI, + ]: + # These evaluators use _label instead of severity labels + # Ensure label is not None - default to False for boolean evaluators + label = result_dict.get("label") + if label is None: + label = False + parsed_result = { + f"{self._eval_metric.value}_label": label, + f"{self._eval_metric.value}_reason": reason, + } + + details_source = result_dict.get("details") + if not details_source: + properties = result_dict.get("properties", {}) + if isinstance(properties, dict): + details_source = properties.get("scoreProperties") + + if details_source and isinstance(details_source, dict): + parsed_result[f"{self._eval_metric.value}_details"] = _prepare_details(details_source) + + return parsed_result + + # Standard handling for harm severity evaluators + # Convert score to severity label if needed + severity_label = result_dict.get("label") + if severity_label is None: + # Calculate severity from score + from azure.ai.evaluation._common.utils import get_harm_severity_level + + severity_label = get_harm_severity_level(score) + + # Return in the expected format matching parse_response output + return { + self._eval_metric.value: severity_label, + f"{self._eval_metric.value}_score": score, + f"{self._eval_metric.value}_reason": reason, + } + + # If we can't parse as EvalRunOutputItem or no matching result found, + # check if it's already in the correct format (might be legacy response) + if isinstance(eval_result, dict): + # Check if it already has the expected keys + expected_key = ( + f"{self._eval_metric.value}_label" + if self._eval_metric + in [ + EvaluationMetrics.CODE_VULNERABILITY, + EvaluationMetrics.PROTECTED_MATERIAL, + EvaluationMetrics.UNGROUNDED_ATTRIBUTES, + EvaluationMetrics.XPIA, + _InternalEvaluationMetrics.ECI, + ] + else self._eval_metric.value + ) + + if expected_key in eval_result: + return eval_result + + # Return empty dict if we can't parse + return {} + def _get_task(self): """Get the annotation task for the current evaluation metric. 
The annotation task is used by the RAI service script to determine a the message format @@ -196,3 +293,33 @@ def _get_task(self): if self._eval_metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES: return Tasks.UNGROUNDED_ATTRIBUTES return Tasks.CONTENT_HARM + + +def _coerce_string_boolean(value: Any) -> Any: + """Convert common string boolean values to their bool equivalents.""" + + if isinstance(value, str): + lowered = value.strip().lower() + if lowered == "true": + return True + if lowered == "false": + return False + return value + + +def _prepare_details(details: Dict[str, Any]) -> Dict[str, Any]: + """Normalize detail keys and coerce string booleans recursively.""" + + normalized: Dict[str, Any] = {} + for key, value in details.items(): + normalized_key = key.replace("-", "_") if isinstance(key, str) else key + normalized[normalized_key] = _prepare_detail_value(value) + return normalized + + +def _prepare_detail_value(value: Any) -> Any: + if isinstance(value, dict): + return _prepare_details(value) + if isinstance(value, list): + return [_prepare_detail_value(item) for item in value] + return _coerce_string_boolean(value) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index 05fbf7cac7b9..4728012eccf7 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -94,7 +94,7 @@ def __init__( **kwargs, ): super().__init__( - eval_metric=EvaluationMetrics.HATE_FAIRNESS, + eval_metric=EvaluationMetrics.HATE_UNFAIRNESS, azure_ai_project=azure_ai_project, credential=credential, conversation_aggregation_type=_AggregationType.MAX, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_evaluation_processor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_evaluation_processor.py index 4e5e8f09b811..79f8e95c553c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_evaluation_processor.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_evaluation_processor.py @@ -25,8 +25,8 @@ # Azure AI Evaluation imports from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING -from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_sync -from azure.ai.evaluation._common.utils import is_onedp_project, get_default_threshold_for_evaluator +from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_sync +from azure.ai.evaluation._common.utils import get_default_threshold_for_evaluator from azure.ai.evaluation._evaluate._utils import _write_output # Local imports @@ -156,24 +156,15 @@ async def evaluate_conversation( @retry(**self.retry_config["network_retry"]) async def evaluate_with_rai_service_with_retry(): try: - if use_sync_endpoint: - return await evaluate_with_rai_service_sync( - data=query_response, - metric_name=effective_metric_name, - project_scope=self.azure_ai_project, - credential=self.credential, - annotation_task=annotation_task, - scan_session_id=self.scan_session_id, - ) - else: - return await evaluate_with_rai_service( - data=query_response, - metric_name=metric_name, - project_scope=self.azure_ai_project, - credential=self.credential, - annotation_task=annotation_task, - 
scan_session_id=self.scan_session_id, - ) + # Always use sync_evals endpoint for all projects + return await evaluate_with_rai_service_sync( + data=query_response, + metric_name=metric_name, + project_scope=self.azure_ai_project, + credential=self.credential, + annotation_task=annotation_task, + scan_session_id=self.scan_session_id, + ) except ( httpx.ConnectTimeout, httpx.ReadTimeout, @@ -407,6 +398,12 @@ async def evaluate( try: # Get the appropriate metric for this risk category metric_name = get_metric_from_risk_category(risk_category) + + # For hate_unfairness, always use "hate_unfairness" metric name for Sync API + if risk_category == RiskCategory.HateUnfairness: + metric_name = "hate_unfairness" + self.logger.debug(f"Using metric 'hate_unfairness' for Sync API") + self.logger.debug(f"Using metric '{metric_name}' for risk category '{risk_category.value}'") # Load all conversations from the data file diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py index 9e38784c3966..8e1761c3d2de 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py @@ -10,7 +10,7 @@ import asyncio import re from typing import Dict, Optional, Any, Tuple, List -from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service +from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_sync from azure.ai.evaluation.simulator._model_tools._generated_rai_client import ( GeneratedRAIClient, ) @@ -63,20 +63,40 @@ async def send_prompt_async( metric_name = get_metric_from_risk_category(self.risk_category) annotation_task = get_annotation_task_from_risk_category(self.risk_category) - evaluation_result = await evaluate_with_rai_service( + eval_result = await evaluate_with_rai_service_sync( thing_to_eval_qr, metric_name=metric_name, project_scope=self.azure_ai_project, credential=self.credential, annotation_task=annotation_task, ) - self.logger.info(f"Evaluation result: {evaluation_result}") - - # Extract the relevant data from evaluation_result - score = evaluation_result.get(f"{metric_name}_score", 0) - reason = evaluation_result.get(f"{metric_name}_reason", "") - threshold = evaluation_result.get(f"{metric_name}_threshold", 3) - result = evaluation_result.get(metric_name, "") + self.logger.info(f"Evaluation result: {eval_result}") + + # Parse the EvalRunOutputItem format + score = 0 + reason = "" + threshold = 3 + result = "" + + # Handle EvalRunOutputItem structure + if hasattr(eval_result, "results") or (isinstance(eval_result, dict) and "results" in eval_result): + results = eval_result.results if hasattr(eval_result, "results") else eval_result.get("results", []) + + # Find the result matching our metric + for result_item in results: + result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__ + if result_dict.get("name") == metric_name or result_dict.get("metric") == metric_name: + score = result_dict.get("score", 0) + reason = result_dict.get("reason", "") + threshold = result_dict.get("threshold", 3) + + # Convert score to severity label if needed + result = result_dict.get("label") + if result is None: + from azure.ai.evaluation._common.utils import get_harm_severity_level + + result = get_harm_severity_level(score) + 
break # Calculate pass/fail using the same logic as in _evaluate_conversation # Convert to boolean then to string for PyRIT's true/false score diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/metric_mapping.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/metric_mapping.py index b0122a53f82d..e570bd0322cd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/metric_mapping.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/metric_mapping.py @@ -8,6 +8,9 @@ from azure.ai.evaluation._common.constants import Tasks, _InternalAnnotationTasks # Mapping of risk categories to their corresponding evaluation metrics +# Note: For HateUnfairness, the mapping defaults to HATE_FAIRNESS, but the Sync API +# (used for all projects) requires HATE_UNFAIRNESS instead. +# This is handled dynamically in _evaluation_processor.py. RISK_CATEGORY_METRIC_MAP = { RiskCategory.Violence: EvaluationMetrics.VIOLENCE, RiskCategory.HateUnfairness: EvaluationMetrics.HATE_FAIRNESS, diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py index 41cd434d680f..58bb46b4bb41 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py @@ -643,14 +643,18 @@ def test_eci_evaluator(self, request, proj_scope, cred, conv): response="Rhombus", ) assert unrelated_result is not None - assert not unrelated_result["eci_label"] - assert unrelated_result["eci_reason"] is not None, "eci_reason must not be None or empty." + assert not unrelated_result["election_critical_information_label"] + assert ( + unrelated_result["election_critical_information_reason"] is not None + ), "election_critical_information_reason must not be None or empty." # Test conversation input convo_result = eci_eval(conversation=simple_conversation) - assert convo_result["eci_label"] == 0 - assert convo_result["evaluation_per_turn"]["eci_label"] == [False, False] - assert all(convo_result["evaluation_per_turn"]["eci_reason"]), "eci_reason must not be None or empty." + assert convo_result["election_critical_information_label"] == 0 + assert convo_result["evaluation_per_turn"]["election_critical_information_label"] == [False, False] + assert all( + convo_result["evaluation_per_turn"]["election_critical_information_reason"] + ), "election_critical_information_reason must not be None or empty." 
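As a quick illustration of how the `ECI = "election_critical_information"` rename in `constants.py` surfaces in evaluator output (the label-style parsing added in `_base_rai_svc_eval.py` builds result keys from the enum value), here is a small sketch with made-up values:

```python
# Sketch with illustrative values, not part of the diff: shows how the renamed ECI
# metric value propagates into the label-style result keys built by _parse_eval_result.
from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics

metric = _InternalEvaluationMetrics.ECI.value  # now "election_critical_information"
sample = {"label": False, "reason": "No election-related content detected."}  # made-up result item

parsed = {
    f"{metric}_label": sample["label"],
    f"{metric}_reason": sample["reason"],
}
assert set(parsed) == {
    "election_critical_information_label",
    "election_critical_information_reason",
}
```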
@pytest.mark.parametrize( ("proj_scope", "cred", "conv"), diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py index 87b8890eb342..9635c9ac09ac 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py @@ -13,6 +13,7 @@ _get_service_discovery_url, ensure_service_availability, evaluate_with_rai_service, + evaluate_with_rai_service_sync, fetch_or_reuse_token, fetch_result, get_rai_svc_url, @@ -419,14 +420,19 @@ async def test_get_rai_svc_url(self, client_mock, discovery_mock): return_value="wow-that's-a-lot-of-patches", ) @patch("azure.identity.DefaultAzureCredential") - async def test_evaluate_with_rai_service( + async def test_evaluate_with_rai_service_sync( self, cred_mock, fetch_token_mock, scv_mock, avail_mock, submit_mock, fetch_result_mock, parse_mock ): - result = await evaluate_with_rai_service( - "what is the weather outside?", - EvaluationMetrics.HATE_FAIRNESS, - {"subscription_id": "fake-id", "project_name": "fake-name", "resource_group_name": "fake-group"}, - DefaultAzureCredential(), + result = await evaluate_with_rai_service_sync( + data={"query": "what is the weather outside?", "response": "test response"}, + metric_name=EvaluationMetrics.HATE_UNFAIRNESS, + project_scope={ + "subscription_id": "fake-id", + "project_name": "fake-name", + "resource_group_name": "fake-group", + }, + credential=DefaultAzureCredential(), + annotation_task="content harm", ) assert result == "wow-that's-a-lot-of-patches" assert fetch_token_mock._mock_call_count == 1 diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_rai_service_eval_chat_target.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_rai_service_eval_chat_target.py index 25675d5e2684..a144a898b401 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_rai_service_eval_chat_target.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_rai_service_eval_chat_target.py @@ -54,7 +54,7 @@ def mock_prompt_request(): @pytest.mark.asyncio -@mock.patch("azure.ai.evaluation.red_team._utils._rai_service_eval_chat_target.evaluate_with_rai_service") +@mock.patch("azure.ai.evaluation.red_team._utils._rai_service_eval_chat_target.evaluate_with_rai_service_sync") async def test_send_prompt_async_success(mock_evaluate, mock_prompt_request, mock_azure_ai_project): """Tests successful evaluation and response formatting.""" target = RAIServiceEvalChatTarget( @@ -64,12 +64,17 @@ async def test_send_prompt_async_success(mock_evaluate, mock_prompt_request, moc logger=MockLogger, ) - # Mock the evaluation result from RAI service + # Mock the evaluation result from RAI service in EvalRunOutputItem format mock_evaluate.return_value = { - "hate_fairness_score": 5, - "hate_fairness_reason": "Detected hateful content.", - "hate_fairness_threshold": 3, - "hate_fairness": "High Severity", + "results": [ + { + "name": "hate_fairness", + "score": 5, + "reason": "Detected hateful content.", + "threshold": 3, + "label": "High Severity", + } + ] } response = await target.send_prompt_async(prompt_request=mock_prompt_request) @@ -97,7 +102,7 @@ async def test_send_prompt_async_success(mock_evaluate, mock_prompt_request, moc @pytest.mark.asyncio 
-@mock.patch("azure.ai.evaluation.red_team._utils._rai_service_eval_chat_target.evaluate_with_rai_service") +@mock.patch("azure.ai.evaluation.red_team._utils._rai_service_eval_chat_target.evaluate_with_rai_service_sync") async def test_send_prompt_async_fail_score(mock_evaluate, mock_prompt_request, mock_azure_ai_project): """Tests evaluation resulting in a 'false' score.""" target = RAIServiceEvalChatTarget( @@ -108,10 +113,15 @@ async def test_send_prompt_async_fail_score(mock_evaluate, mock_prompt_request, ) mock_evaluate.return_value = { - "violence_score": 2, - "violence_reason": "No violent content detected.", - "violence_threshold": 4, - "violence": "Very Low Severity", + "results": [ + { + "name": "violence", + "score": 2, + "reason": "No violent content detected.", + "threshold": 4, + "label": "Very Low Severity", + } + ] } response = await target.send_prompt_async(prompt_request=mock_prompt_request) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_red_team.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_red_team.py index 50938de23fe9..02b1817bed95 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_red_team.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_red_team.py @@ -1057,7 +1057,7 @@ async def test_evaluate_method(self, mock_get_logger, red_team): "azure.ai.evaluation.red_team._utils.metric_mapping.get_metric_from_risk_category", return_value="test_metric", ), patch( - "azure.ai.evaluation._common.rai_service.evaluate_with_rai_service", new_callable=AsyncMock + "azure.ai.evaluation._common.rai_service.evaluate_with_rai_service_sync", new_callable=AsyncMock ) as mock_evaluate_rai, patch( "uuid.uuid4", return_value="test-uuid" ), patch( @@ -1074,7 +1074,7 @@ async def test_evaluate_method(self, mock_get_logger, red_team): red_team.evaluation_processor, "evaluate_conversation", mock_evaluate_conversation ): # Correctly patch the object - mock_evaluate_rai.return_value = { # Keep this mock if evaluate_with_rai_service is still used + mock_evaluate_rai.return_value = { "violence": "high", "violence_reason": "Test reason", "violence_score": 5,
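The mocked return values above follow the `EvalRunOutputItem` shape. The sketch below condenses how the updated `RAIServiceEvalChatTarget` reads such a payload, falling back to `get_harm_severity_level` only when no `label` is present; the fixture values are copied from the test above, and everything else is illustrative.

```python
# Condensed restatement of the parsing added in _rai_service_eval_chat_target.py,
# applied to the mocked EvalRunOutputItem-style payload used in these tests.
from azure.ai.evaluation._common.utils import get_harm_severity_level

metric_name = "violence"
mock_payload = {
    "results": [
        {
            "name": "violence",
            "score": 2,
            "reason": "No violent content detected.",
            "threshold": 4,
            "label": "Very Low Severity",
        }
    ]
}

for item in mock_payload["results"]:
    if item.get("name") == metric_name or item.get("metric") == metric_name:
        score = item.get("score", 0)
        reason = item.get("reason", "")
        threshold = item.get("threshold", 3)
        label = item.get("label")
        if label is None:
            # Derive a severity label from the numeric score when the service omits one.
            label = get_harm_severity_level(score)
        print(f"{metric_name}: label={label}, score={score}, threshold={threshold}, reason={reason}")
        break
```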