@@ -93,7 +93,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"
XPIA = "indirect_attack"
GROUNDEDNESS = "generic_groundedness"
CODE_VULNERABILITY = "code_vulnerability"
UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
@@ -108,7 +108,7 @@ class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
enum over time.
"""

ECI = "eci"
ECI = "election_critical_information"


# Mapping of evaluation metrics to their scoring patterns
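A side note on the value changes above: because these are str-based enums, equality against the raw string tracks the new values, so any code comparing against the old literals "xpia" or "eci" will stop matching. A minimal sketch, using a simplified local stand-in rather than the real classes (which also use CaseInsensitiveEnumMeta):

from enum import Enum

class _Metrics(str, Enum):  # simplified stand-in for the two enums above, for illustration only
    XPIA = "indirect_attack"
    ECI = "election_critical_information"

assert _Metrics.XPIA == "indirect_attack"            # matches the new service-facing value
assert _Metrics.XPIA != "xpia"                       # comparisons against the old literal no longer hold
assert _Metrics("indirect_attack") is _Metrics.XPIA  # lookup by value still resolves to the member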

Large diffs are not rendered by default.

@@ -328,20 +328,26 @@ async def get_jail_break_dataset_with_type(self, type: str, **kwargs: Any) -> Li
async def get_attack_objectives(
self,
*,
risk_category: str,
risk_types: Optional[List[str]] = None,
lang: Optional[str] = None,
strategy: Optional[str] = None,
target_type: Optional[str] = None,
**kwargs: Any
) -> List[_models.AttackObjective]:
"""Get the attack objectives.

:keyword risk_category: Risk category for the attack objectives. Required.
:paramtype risk_category: str
:keyword risk_types: Risk types for the attack objectives dataset. Default value is None.
:paramtype risk_types: list[str]
:keyword lang: The language for the attack objectives dataset, defaults to 'en'. Default value
is None.
:paramtype lang: str
:keyword strategy: The strategy. Default value is None.
:paramtype strategy: str
:keyword target_type: The target type, model or agent. Default value is None.
:paramtype target_type: str
:return: list of AttackObjective
:rtype: list[~raiclient.models.AttackObjective]
:raises ~azure.core.exceptions.HttpResponseError:
@@ -360,12 +366,14 @@ async def get_attack_objectives(
cls: ClsType[List[_models.AttackObjective]] = kwargs.pop("cls", None)

_request = build_rai_svc_get_attack_objectives_request(
risk_categories=[risk_category],
risk_types=risk_types,
lang=lang,
strategy=strategy,
api_version=self._config.api_version,
headers=_headers,
params=_params,
target_type=target_type,
)
path_format_arguments = {
"endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
@@ -117,6 +117,7 @@ def build_rai_svc_get_attack_objectives_request( # pylint: disable=name-too-lon
risk_categories: Optional[List[str]] = None,
lang: Optional[str] = None,
strategy: Optional[str] = None,
target_type: Optional[str] = None,
**kwargs: Any
) -> HttpRequest:
_headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
@@ -140,6 +141,8 @@ def build_rai_svc_get_attack_objectives_request( # pylint: disable=name-too-lon
_params["lang"] = _SERIALIZER.query("lang", lang, "str")
if strategy is not None:
_params["strategy"] = _SERIALIZER.query("strategy", strategy, "str")
if target_type is not None:
_params["targetType"] = _SERIALIZER.query("target_type", target_type, "str")

# Construct headers
_headers["Accept"] = _SERIALIZER.header("accept", accept, "str")
@@ -586,6 +589,7 @@ def get_attack_objectives(
risk_types: Optional[List[str]] = None,
lang: Optional[str] = None,
strategy: Optional[str] = None,
target_type: Optional[str] = None,
**kwargs: Any
) -> List[_models.AttackObjective]:
"""Get the attack objectives.
@@ -599,6 +603,8 @@ def get_attack_objectives(
:paramtype lang: str
:keyword strategy: The strategy. Default value is None.
:paramtype strategy: str
:keyword target_type: The target type, model or agent. Default value is None.
:paramtype target_type: str
:return: list of AttackObjective
:rtype: list[~raiclient.models.AttackObjective]
:raises ~azure.core.exceptions.HttpResponseError:
@@ -624,6 +630,7 @@ def get_attack_objectives(
api_version=self._config.api_version,
headers=_headers,
params=_params,
target_type=target_type,
)
path_format_arguments = {
"endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
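To show how the new keyword flows through the operation, here is a hedged usage sketch of the async overload above; the client object, the helper name, and every literal argument value are assumptions for illustration, not part of this PR:

from typing import Any, List

async def fetch_attack_objectives(client: Any) -> List[Any]:
    # `client` is assumed to expose the operation shown above; all literal values are illustrative.
    return await client.get_attack_objectives(
        risk_category="violence",   # required by the async overload
        risk_types=["harm"],        # optional filter
        lang="en",                  # the service defaults to 'en' when omitted
        strategy="baseline",
        target_type="model",        # new keyword in this PR; serialized as the targetType query parameter
    )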
@@ -1,7 +1,7 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Dict, TypeVar, Union, Optional
from typing import Any, Dict, TypeVar, Union, Optional

from typing_extensions import override

@@ -11,7 +11,10 @@
Tasks,
_InternalAnnotationTasks,
)
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
from azure.ai.evaluation._common.rai_service import (
evaluate_with_rai_service_sync,
evaluate_with_rai_service_sync_multimodal,
)
from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
from azure.ai.evaluation._exceptions import EvaluationException
from azure.ai.evaluation._common.utils import validate_conversation
@@ -129,13 +132,15 @@ async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
validate_conversation(conversation)
messages = conversation["messages"]
# Run score computation based on supplied metric.
result = await evaluate_with_rai_service_multimodal(
# Convert enum to string value for the multimodal endpoint
metric_value = self._eval_metric.value if hasattr(self._eval_metric, "value") else self._eval_metric
result = await evaluate_with_rai_service_sync_multimodal(
messages=messages,
metric_name=self._eval_metric,
metric_name=metric_value,
project_scope=self._azure_ai_project,
credential=self._credential,
)
return result
return self._parse_eval_result(result)

async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
query = eval_input.get("query", None)
@@ -165,7 +170,7 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
)
input_data["context"] = context

return await evaluate_with_rai_service( # type: ignore
eval_result = await evaluate_with_rai_service_sync( # type: ignore
metric_name=self._eval_metric,
data=input_data,
project_scope=self._azure_ai_project,
@@ -174,6 +179,98 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
evaluator_name=self.__class__.__name__,
)

# Parse the EvalRunOutputItem format to the expected dict format
return self._parse_eval_result(eval_result)

def _parse_eval_result(self, eval_result) -> Dict[str, T]:
"""Parse the EvalRunOutputItem format into the expected dict format.

:param eval_result: The result from evaluate_with_rai_service_sync (EvalRunOutputItem).
:return: The parsed result in the expected format.
:rtype: Dict[str, T]
"""
# Handle EvalRunOutputItem structure
if hasattr(eval_result, "results") or (isinstance(eval_result, dict) and "results" in eval_result):
results = eval_result.results if hasattr(eval_result, "results") else eval_result.get("results", [])

# Find the result matching our metric
for result_item in results:
result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__
# Compare against both the enum and its string value
metric_name = result_dict.get("name") or result_dict.get("metric")
if metric_name == self._eval_metric or metric_name == self._eval_metric.value:
# Extract values from EvalRunOutputItem result format
score = result_dict.get("score", 0)
reason = result_dict.get("reason", "")

# Special handling for evaluators that use _label format
if self._eval_metric in [
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.PROTECTED_MATERIAL,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
EvaluationMetrics.XPIA,
_InternalEvaluationMetrics.ECI,
]:
# These evaluators use _label instead of severity labels
# Ensure label is not None - default to False for boolean evaluators
label = result_dict.get("label")
if label is None:
label = False
parsed_result = {
f"{self._eval_metric.value}_label": label,
f"{self._eval_metric.value}_reason": reason,
}

details_source = result_dict.get("details")
if not details_source:
properties = result_dict.get("properties", {})
if isinstance(properties, dict):
details_source = properties.get("scoreProperties")

if details_source and isinstance(details_source, dict):
parsed_result[f"{self._eval_metric.value}_details"] = _prepare_details(details_source)

return parsed_result

# Standard handling for harm severity evaluators
# Convert score to severity label if needed
severity_label = result_dict.get("label")
if severity_label is None:
# Calculate severity from score
from azure.ai.evaluation._common.utils import get_harm_severity_level

severity_label = get_harm_severity_level(score)

# Return in the expected format matching parse_response output
return {
self._eval_metric.value: severity_label,
f"{self._eval_metric.value}_score": score,
f"{self._eval_metric.value}_reason": reason,
}

# If we can't parse as EvalRunOutputItem or no matching result found,
# check if it's already in the correct format (might be legacy response)
if isinstance(eval_result, dict):
# Check if it already has the expected keys
expected_key = (
f"{self._eval_metric.value}_label"
if self._eval_metric
in [
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.PROTECTED_MATERIAL,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
EvaluationMetrics.XPIA,
_InternalEvaluationMetrics.ECI,
]
else self._eval_metric.value
)

if expected_key in eval_result:
return eval_result

# Return empty dict if we can't parse
return {}
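# Illustration only (not part of this change): an invented result in the shape the parser above
# reads, plus roughly what it would produce. Field names mirror only what the code reads; this is
# not an official EvalRunOutputItem schema.
sample_eval_result = {
    "results": [
        {"name": "violence", "score": 2, "reason": "Mild depiction.", "label": None},
    ]
}
# For a harm-severity metric such as "violence", _parse_eval_result(sample_eval_result) would
# return approximately:
# {
#     "violence": get_harm_severity_level(2),   # severity label derived from the score
#     "violence_score": 2,
#     "violence_reason": "Mild depiction.",
# }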

def _get_task(self):
"""Get the annotation task for the current evaluation metric.
The annotation task is used by the RAI service script to determine the message format
@@ -196,3 +293,33 @@ def _get_task(self):
if self._eval_metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
return Tasks.UNGROUNDED_ATTRIBUTES
return Tasks.CONTENT_HARM


def _coerce_string_boolean(value: Any) -> Any:
"""Convert common string boolean values to their bool equivalents."""

if isinstance(value, str):
lowered = value.strip().lower()
if lowered == "true":
return True
if lowered == "false":
return False
return value


def _prepare_details(details: Dict[str, Any]) -> Dict[str, Any]:
"""Normalize detail keys and coerce string booleans recursively."""

normalized: Dict[str, Any] = {}
for key, value in details.items():
normalized_key = key.replace("-", "_") if isinstance(key, str) else key
normalized[normalized_key] = _prepare_detail_value(value)
return normalized


def _prepare_detail_value(value: Any) -> Any:
if isinstance(value, dict):
return _prepare_details(value)
if isinstance(value, list):
return [_prepare_detail_value(item) for item in value]
return _coerce_string_boolean(value)
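A short, self-contained illustration of the two helpers above; the payload is invented for demonstration and does not come from the RAI service:

raw_details = {
    "code-injection": "true",
    "nested-info": {"is-grounded": "False", "note": "unchanged text"},
    "flags": ["true", "other"],
}
normalized = _prepare_details(raw_details)
# Hyphenated keys become snake_case and string booleans are coerced, recursively:
# {
#     "code_injection": True,
#     "nested_info": {"is_grounded": False, "note": "unchanged text"},
#     "flags": [True, "other"],
# }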
@@ -94,7 +94,7 @@ def __init__(
**kwargs,
):
super().__init__(
eval_metric=EvaluationMetrics.HATE_FAIRNESS,
eval_metric=EvaluationMetrics.HATE_UNFAIRNESS,
azure_ai_project=azure_ai_project,
credential=credential,
conversation_aggregation_type=_AggregationType.MAX,
@@ -25,8 +25,8 @@

# Azure AI Evaluation imports
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_sync
from azure.ai.evaluation._common.utils import is_onedp_project, get_default_threshold_for_evaluator
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_sync
from azure.ai.evaluation._common.utils import get_default_threshold_for_evaluator
from azure.ai.evaluation._evaluate._utils import _write_output

# Local imports
@@ -156,24 +156,15 @@ async def evaluate_conversation(
@retry(**self.retry_config["network_retry"])
async def evaluate_with_rai_service_with_retry():
try:
if use_sync_endpoint:
return await evaluate_with_rai_service_sync(
data=query_response,
metric_name=effective_metric_name,
project_scope=self.azure_ai_project,
credential=self.credential,
annotation_task=annotation_task,
scan_session_id=self.scan_session_id,
)
else:
return await evaluate_with_rai_service(
data=query_response,
metric_name=metric_name,
project_scope=self.azure_ai_project,
credential=self.credential,
annotation_task=annotation_task,
scan_session_id=self.scan_session_id,
)
# Always use sync_evals endpoint for all projects
return await evaluate_with_rai_service_sync(
data=query_response,
metric_name=metric_name,
project_scope=self.azure_ai_project,
credential=self.credential,
annotation_task=annotation_task,
scan_session_id=self.scan_session_id,
)
except (
httpx.ConnectTimeout,
httpx.ReadTimeout,
@@ -407,6 +398,12 @@ async def evaluate(
try:
# Get the appropriate metric for this risk category
metric_name = get_metric_from_risk_category(risk_category)

# For hate_unfairness, always use "hate_unfairness" metric name for Sync API
if risk_category == RiskCategory.HateUnfairness:
metric_name = "hate_unfairness"
self.logger.debug("Using metric 'hate_unfairness' for Sync API")

self.logger.debug(f"Using metric '{metric_name}' for risk category '{risk_category.value}'")

# Load all conversations from the data file