21 changes: 13 additions & 8 deletions src/strands_evals/evaluators/goal_success_rate_evaluator.py
@@ -6,7 +6,7 @@
from typing_extensions import TypeVar, Union

from ..types.evaluation import EvaluationData, EvaluationOutput
-from ..types.trace import EvaluationLevel, SessionLevelInput
+from ..types.trace import EvaluationLevel
from .evaluator import Evaluator
from .prompt_templates.goal_success_rate import get_template

@@ -40,7 +40,7 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):

    def __init__(
        self,
-        version: str = "v0",
+        version: str = "v1",
        model: Union[Model, str, None] = None,
        system_prompt: str | None = None,
    ):
@@ -50,8 +50,7 @@ def __init__(
        self.model = model

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
-        session_input = self._parse_trajectory(evaluation_case)
-        prompt = self._format_prompt(session_input)
+        prompt = self._format_prompt(evaluation_case)
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
        normalized_score = self._score_mapping[rating.score]
@@ -64,8 +63,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        return [result]

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
-        session_input = self._parse_trajectory(evaluation_case)
-        prompt = self._format_prompt(session_input)
+        prompt = self._format_prompt(evaluation_case)
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
        normalized_score = self._score_mapping[rating.score]
@@ -77,10 +75,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        )
        return [result]

-    def _format_prompt(self, session_input: SessionLevelInput) -> str:
-        """Format evaluation prompt from session-level input."""
+    def _format_prompt(self, evaluation_case: EvaluationData[InputT, OutputT]) -> str:
+        """Format evaluation prompt from evaluation case."""
+
+        goal = evaluation_case.metadata.get("goal")
+        session_input = self._parse_trajectory(evaluation_case)
+
        parts = []

+        if goal:
+            parts.append(f"# User Goal\n{goal}")
+
        if session_input.available_tools:
            parts.append(f"# Available tools\n{self._format_tools(session_input.available_tools)}")

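A minimal usage sketch of the goal-aware flow introduced above: the evaluator now reads an optional goal from `evaluation_case.metadata`. The `EvaluationData` constructor arguments other than `metadata` are assumptions here, since its definition is not part of this diff; only the `metadata.get("goal")` lookup is shown in the change.

```python
from strands_evals.evaluators.goal_success_rate_evaluator import GoalSuccessRateEvaluator
from strands_evals.types.evaluation import EvaluationData

# Hypothetical case construction: field names besides `metadata` are assumptions,
# since EvaluationData's definition is not included in this diff.
case = EvaluationData(
    input="Book me a table for two at 7pm tonight.",
    output="Your reservation for two at 7pm is confirmed.",
    metadata={"goal": "Reserve a table for two people at 7pm today."},
)

evaluator = GoalSuccessRateEvaluator()  # now defaults to version="v1"
results = evaluator.evaluate(case)      # goal is read via evaluation_case.metadata.get("goal")
print(results[0])
```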
@@ -0,0 +1,26 @@
SYSTEM_PROMPT = """You are an objective judge evaluating whether, in a conversation between a User and an AI assistant, the assistant successfully completed all of the User's goals.

You will be provided with:
1. (Optional) A specific goal description that defines what the user is trying to achieve.
2. The list of available tools the AI assistant can use, with a description of when and how to use each tool.
3. The complete conversation record with multiple turns including:
- User messages (User:)
- Assistant responses (Assistant:)
- Tool selected by the assistant (Action:)
- Tool outputs (Tool:)

Your task is to carefully analyze the conversation and determine if all User goals were successfully achieved. Follow these steps:

1. First, identify the user's goals:
- If a specific goal description is provided under "# User Goal", use that as the primary goal to evaluate.
- If no explicit goal is provided, derive the user's goals from their messages in the conversation record.
2. Analyze the list of available tools and reason about what tools the AI assistant should use to achieve the goal(s).
3. Check the conversation record to decide whether the AI assistant:
- Used the appropriate tools to address the user's goal(s)
- Got the expected outputs from those tools
- Responded to the User appropriately about the outcome
4. Determine if the goal(s) were achieved based on whether the user's needs were satisfied.

# Evaluation Rubric
- Yes: All user goals were achieved. The agent successfully completed all requested tasks, provided accurate information, and the user received satisfactory outcomes.
- No: Not all user goals were achieved. The agent failed to complete one or more requested tasks, provided incomplete/incorrect information, or the user's needs were not fully met."""
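To make the section headers this prompt refers to concrete, here is an illustrative sketch of an evaluation prompt as `_format_prompt` might assemble it when a goal is supplied. Only the `# User Goal` and `# Available tools` headers appear in this diff; the tool description and the User:/Assistant:/Action:/Tool: conversation lines below are invented examples of the layout the prompt describes, not real template output.

```python
# Illustrative only: shows the shape of the input the judge receives.
example_prompt = """\
# User Goal
Reserve a table for two people at 7pm today.

# Available tools
- make_reservation: Book a restaurant table given a time and party size.

User: Book me a table for two at 7pm tonight.
Action: make_reservation(time="19:00", party_size=2)
Tool: {"status": "confirmed"}
Assistant: Your table for two at 7pm is confirmed.
"""
```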