@@ -93,7 +93,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"
XPIA = "indirect_attack"
GROUNDEDNESS = "generic_groundedness"
CODE_VULNERABILITY = "code_vulnerability"
UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
@@ -108,7 +108,7 @@ class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
enum over time.
"""

ECI = "eci"
ECI = "election_critical_information"


# Mapping of evaluation metrics to their scoring patterns
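A side note on the value changes above: because these are str-based enums, equality against the raw string tracks the new values, so any code comparing against the old literals "xpia" or "eci" will stop matching. A minimal sketch, using a simplified local stand-in rather than the real classes (which also use CaseInsensitiveEnumMeta):

from enum import Enum

class _Metrics(str, Enum):  # simplified stand-in for the two enums above, for illustration only
    XPIA = "indirect_attack"
    ECI = "election_critical_information"

assert _Metrics.XPIA == "indirect_attack"            # matches the new service-facing value
assert _Metrics.XPIA != "xpia"                       # comparisons against the old literal no longer hold
assert _Metrics("indirect_attack") is _Metrics.XPIA  # lookup by value still resolves to the member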

Large diffs are not rendered by default.

@@ -328,20 +328,26 @@ async def get_jail_break_dataset_with_type(self, type: str, **kwargs: Any) -> Li
async def get_attack_objectives(
self,
*,
risk_category: str,
risk_types: Optional[List[str]] = None,
lang: Optional[str] = None,
strategy: Optional[str] = None,
target_type: Optional[str] = None,
**kwargs: Any
) -> List[_models.AttackObjective]:
"""Get the attack objectives.

:keyword risk_category: Risk category for the attack objectives. Required.
:paramtype risk_category: str
:keyword risk_types: Risk types for the attack objectives dataset. Default value is None.
:paramtype risk_types: list[str]
:keyword lang: The language for the attack objectives dataset, defaults to 'en'. Default value
is None.
:paramtype lang: str
:keyword strategy: The strategy. Default value is None.
:paramtype strategy: str
:keyword target_type: The target type, model or agent. Default value is None.
:paramtype target_type: str
:return: list of AttackObjective
:rtype: list[~raiclient.models.AttackObjective]
:raises ~azure.core.exceptions.HttpResponseError:
@@ -360,12 +366,14 @@ async def get_attack_objectives(
cls: ClsType[List[_models.AttackObjective]] = kwargs.pop("cls", None)

_request = build_rai_svc_get_attack_objectives_request(
risk_categories=[risk_category],
risk_types=risk_types,
lang=lang,
strategy=strategy,
api_version=self._config.api_version,
headers=_headers,
params=_params,
target_type=target_type,
)
path_format_arguments = {
"endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
@@ -117,6 +117,7 @@ def build_rai_svc_get_attack_objectives_request( # pylint: disable=name-too-lon
risk_categories: Optional[List[str]] = None,
lang: Optional[str] = None,
strategy: Optional[str] = None,
target_type: Optional[str] = None,
**kwargs: Any
) -> HttpRequest:
_headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
@@ -140,6 +141,8 @@ def build_rai_svc_get_attack_objectives_request( # pylint: disable=name-too-lon
_params["lang"] = _SERIALIZER.query("lang", lang, "str")
if strategy is not None:
_params["strategy"] = _SERIALIZER.query("strategy", strategy, "str")
if target_type is not None:
_params["targetType"] = _SERIALIZER.query("target_type", target_type, "str")

# Construct headers
_headers["Accept"] = _SERIALIZER.header("accept", accept, "str")
@@ -586,6 +589,7 @@ def get_attack_objectives(
risk_types: Optional[List[str]] = None,
lang: Optional[str] = None,
strategy: Optional[str] = None,
target_type: Optional[str] = None,
**kwargs: Any
) -> List[_models.AttackObjective]:
"""Get the attack objectives.
@@ -599,6 +603,8 @@ def get_attack_objectives(
:paramtype lang: str
:keyword strategy: The strategy. Default value is None.
:paramtype strategy: str
:keyword target_type: The target type, model or agent. Default value is None.
:paramtype target_type: str
:return: list of AttackObjective
:rtype: list[~raiclient.models.AttackObjective]
:raises ~azure.core.exceptions.HttpResponseError:
@@ -624,6 +630,7 @@ def get_attack_objectives(
api_version=self._config.api_version,
headers=_headers,
params=_params,
target_type=target_type,
)
path_format_arguments = {
"endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
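To show how the new keyword flows through the operation, here is a hedged usage sketch of the async overload above; the client object, the helper name, and every literal argument value are assumptions for illustration, not part of this PR:

from typing import Any, List

async def fetch_attack_objectives(client: Any) -> List[Any]:
    # `client` is assumed to expose the operation shown above; all literal values are illustrative.
    return await client.get_attack_objectives(
        risk_category="violence",   # required by the async overload
        risk_types=["harm"],        # optional filter
        lang="en",                  # the service defaults to 'en' when omitted
        strategy="baseline",
        target_type="model",        # new keyword in this PR; serialized as the targetType query parameter
    )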
@@ -1,7 +1,7 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Dict, TypeVar, Union, Optional
from typing import Any, Dict, TypeVar, Union, Optional

from typing_extensions import override

@@ -11,7 +11,10 @@
Tasks,
_InternalAnnotationTasks,
)
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
from azure.ai.evaluation._common.rai_service import (
evaluate_with_rai_service_sync,
evaluate_with_rai_service_sync_multimodal,
)
from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
from azure.ai.evaluation._exceptions import EvaluationException
from azure.ai.evaluation._common.utils import validate_conversation
@@ -129,13 +132,15 @@ async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
validate_conversation(conversation)
messages = conversation["messages"]
# Run score computation based on supplied metric.
result = await evaluate_with_rai_service_multimodal(
# Convert enum to string value for the multimodal endpoint
metric_value = self._eval_metric.value if hasattr(self._eval_metric, "value") else self._eval_metric
result = await evaluate_with_rai_service_sync_multimodal(
messages=messages,
metric_name=self._eval_metric,
metric_name=metric_value,
project_scope=self._azure_ai_project,
credential=self._credential,
)
return result
return self._parse_eval_result(result)

async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
query = eval_input.get("query", None)
@@ -165,7 +170,7 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
)
input_data["context"] = context

return await evaluate_with_rai_service( # type: ignore
eval_result = await evaluate_with_rai_service_sync( # type: ignore
metric_name=self._eval_metric,
data=input_data,
project_scope=self._azure_ai_project,
@@ -174,6 +179,98 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
evaluator_name=self.__class__.__name__,
)

# Parse the EvalRunOutputItem format to the expected dict format
return self._parse_eval_result(eval_result)

def _parse_eval_result(self, eval_result) -> Dict[str, T]:
"""Parse the EvalRunOutputItem format into the expected dict format.

:param eval_result: The result from evaluate_with_rai_service_sync (EvalRunOutputItem).
:return: The parsed result in the expected format.
:rtype: Dict[str, T]
"""
# Handle EvalRunOutputItem structure
if hasattr(eval_result, "results") or (isinstance(eval_result, dict) and "results" in eval_result):
results = eval_result.results if hasattr(eval_result, "results") else eval_result.get("results", [])

# Find the result matching our metric
for result_item in results:
result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__
# Compare against both the enum and its string value
metric_name = result_dict.get("name") or result_dict.get("metric")
if metric_name == self._eval_metric or metric_name == self._eval_metric.value:
# Extract values from EvalRunOutputItem result format
score = result_dict.get("score", 0)
reason = result_dict.get("reason", "")

# Special handling for evaluators that use _label format
if self._eval_metric in [
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.PROTECTED_MATERIAL,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
EvaluationMetrics.XPIA,
_InternalEvaluationMetrics.ECI,
]:
# These evaluators use _label instead of severity labels
# Ensure label is not None - default to False for boolean evaluators
label = result_dict.get("label")
if label is None:
label = False
parsed_result = {
f"{self._eval_metric.value}_label": label,
f"{self._eval_metric.value}_reason": reason,
}

details_source = result_dict.get("details")
if not details_source:
properties = result_dict.get("properties", {})
if isinstance(properties, dict):
details_source = properties.get("scoreProperties")

if details_source and isinstance(details_source, dict):
parsed_result[f"{self._eval_metric.value}_details"] = _prepare_details(details_source)

return parsed_result

# Standard handling for harm severity evaluators
# Convert score to severity label if needed
severity_label = result_dict.get("label")
if severity_label is None:
# Calculate severity from score
from azure.ai.evaluation._common.utils import get_harm_severity_level

severity_label = get_harm_severity_level(score)

# Return in the expected format matching parse_response output
return {
self._eval_metric.value: severity_label,
f"{self._eval_metric.value}_score": score,
f"{self._eval_metric.value}_reason": reason,
}

# If we can't parse as EvalRunOutputItem or no matching result found,
# check if it's already in the correct format (might be legacy response)
if isinstance(eval_result, dict):
# Check if it already has the expected keys
expected_key = (
f"{self._eval_metric.value}_label"
if self._eval_metric
in [
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.PROTECTED_MATERIAL,
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
EvaluationMetrics.XPIA,
_InternalEvaluationMetrics.ECI,
]
else self._eval_metric.value
)

if expected_key in eval_result:
return eval_result

# Return empty dict if we can't parse
return {}
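# Illustration only (not part of this change): an invented result in the shape the parser above
# reads, plus roughly what it would produce. Field names mirror only what the code reads; this is
# not an official EvalRunOutputItem schema.
sample_eval_result = {
    "results": [
        {"name": "violence", "score": 2, "reason": "Mild depiction.", "label": None},
    ]
}
# For a harm-severity metric such as "violence", _parse_eval_result(sample_eval_result) would
# return approximately:
# {
#     "violence": get_harm_severity_level(2),   # severity label derived from the score
#     "violence_score": 2,
#     "violence_reason": "Mild depiction.",
# }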

def _get_task(self):
"""Get the annotation task for the current evaluation metric.
The annotation task is used by the RAI service script to determine the message format
@@ -196,3 +293,33 @@ def _get_task(self):
if self._eval_metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
return Tasks.UNGROUNDED_ATTRIBUTES
return Tasks.CONTENT_HARM


def _coerce_string_boolean(value: Any) -> Any:
"""Convert common string boolean values to their bool equivalents."""

if isinstance(value, str):
lowered = value.strip().lower()
if lowered == "true":
return True
if lowered == "false":
return False
return value


def _prepare_details(details: Dict[str, Any]) -> Dict[str, Any]:
"""Normalize detail keys and coerce string booleans recursively."""

normalized: Dict[str, Any] = {}
for key, value in details.items():
normalized_key = key.replace("-", "_") if isinstance(key, str) else key
normalized[normalized_key] = _prepare_detail_value(value)
return normalized


def _prepare_detail_value(value: Any) -> Any:
if isinstance(value, dict):
return _prepare_details(value)
if isinstance(value, list):
return [_prepare_detail_value(item) for item in value]
return _coerce_string_boolean(value)
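A short, self-contained illustration of the two helpers above; the payload is invented for demonstration and does not come from the RAI service:

raw_details = {
    "code-injection": "true",
    "nested-info": {"is-grounded": "False", "note": "unchanged text"},
    "flags": ["true", "other"],
}
normalized = _prepare_details(raw_details)
# Hyphenated keys become snake_case and string booleans are coerced, recursively:
# {
#     "code_injection": True,
#     "nested_info": {"is_grounded": False, "note": "unchanged text"},
#     "flags": [True, "other"],
# }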
@@ -94,7 +94,7 @@ def __init__(
**kwargs,
):
super().__init__(
eval_metric=EvaluationMetrics.HATE_FAIRNESS,
eval_metric=EvaluationMetrics.HATE_UNFAIRNESS,
azure_ai_project=azure_ai_project,
credential=credential,
conversation_aggregation_type=_AggregationType.MAX,
@@ -25,8 +25,8 @@

# Azure AI Evaluation imports
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_sync
from azure.ai.evaluation._common.utils import is_onedp_project, get_default_threshold_for_evaluator
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_sync
from azure.ai.evaluation._common.utils import get_default_threshold_for_evaluator
from azure.ai.evaluation._evaluate._utils import _write_output

# Local imports
@@ -156,24 +156,15 @@ async def evaluate_conversation(
@retry(**self.retry_config["network_retry"])
async def evaluate_with_rai_service_with_retry():
try:
if use_sync_endpoint:
return await evaluate_with_rai_service_sync(
data=query_response,
metric_name=effective_metric_name,
project_scope=self.azure_ai_project,
credential=self.credential,
annotation_task=annotation_task,
scan_session_id=self.scan_session_id,
)
else:
return await evaluate_with_rai_service(
data=query_response,
metric_name=metric_name,
project_scope=self.azure_ai_project,
credential=self.credential,
annotation_task=annotation_task,
scan_session_id=self.scan_session_id,
)
# Always use sync_evals endpoint for all projects
return await evaluate_with_rai_service_sync(
data=query_response,
metric_name=metric_name,
project_scope=self.azure_ai_project,
credential=self.credential,
annotation_task=annotation_task,
scan_session_id=self.scan_session_id,
)
except (
httpx.ConnectTimeout,
httpx.ReadTimeout,
@@ -407,6 +398,12 @@ async def evaluate(
try:
# Get the appropriate metric for this risk category
metric_name = get_metric_from_risk_category(risk_category)

# For hate_unfairness, always use "hate_unfairness" metric name for Sync API
if risk_category == RiskCategory.HateUnfairness:
metric_name = "hate_unfairness"
self.logger.debug("Using metric 'hate_unfairness' for Sync API")

self.logger.debug(f"Using metric '{metric_name}' for risk category '{risk_category.value}'")

# Load all conversations from the data file