47 changes: 47 additions & 0 deletions README.md
@@ -213,6 +213,53 @@ dataset = await generator.generate_dataset(
dataset.to_file("generated_math_research_dataset")
```

### Evaluating Langfuse Traces
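
This example shows how to evaluate an agent interaction that has already been recorded as a Langfuse trace. The task function reads a `trace_id` from the case metadata, fetches the trace, maps it to a session with `LangfuseSessionMapper`, and returns the agent's final response and trajectory for the evaluator to score.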

```python
from langfuse_utils.utils import get_langfuse_trace

from strands_evals import Case, Dataset
from strands_evals.evaluators import HelpfulnessEvaluator
from strands_evals.mappers.langfuse_mapper import (
LangfuseSessionMapper,
)

# 1. Define a task function
def user_task_function(case: Case) -> dict:
    # Initialise the Langfuse mapper
    mapper = LangfuseSessionMapper()
    # Get the trace ID from the case metadata, if available
    trace_id = case.metadata.get("trace_id", "") if case.metadata else ""
    # Fetch the trace from Langfuse
    langfuse_traces = get_langfuse_trace(trace_id=trace_id)
    # Map the trace to a session and extract the agent's final response
    session = mapper.map_to_session(langfuse_traces, trace_id)
    agent_response = mapper.get_agent_final_response(session)

    return {"output": str(agent_response), "trajectory": session}

# 2. Define test cases
test_cases = [
Case[str, str](
name="knowledge-1",
input="What is the status of my package",
metadata={"category": "knowledge", "trace_id": "xxxxxxx"},
),
]


# 3. Create an evaluator
evaluator = HelpfulnessEvaluator()

# 4. Create a dataset
dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator)

# 5. Run evaluations
report = dataset.run_evaluations(user_task_function)
report.run_display()
```
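
The `get_langfuse_trace` helper imported above lives in a separate `langfuse_utils` module that is not shown here. A minimal sketch of such a helper, assuming the Langfuse v3 Python SDK with credentials supplied via the standard `LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`, and `LANGFUSE_HOST` environment variables (the exact return value only needs to be whatever `LangfuseSessionMapper.map_to_session` expects):

```python
# Hypothetical helper: fetch a single trace from Langfuse by ID.
# Assumes the Langfuse v3 SDK; credentials are read from the
# LANGFUSE_* environment variables when Langfuse() is constructed.
from langfuse import Langfuse

_client = Langfuse()


def get_langfuse_trace(trace_id: str):
    """Fetch a trace (including its observations) from the Langfuse API."""
    # The v3 client exposes the public REST API under `.api`;
    # trace.get wraps GET /api/public/traces/{trace_id}.
    return _client.api.trace.get(trace_id)
```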


## Available Evaluators

- **OutputEvaluator**: Evaluates the quality and correctness of agent outputs
1 change: 1 addition & 0 deletions pyproject.toml
@@ -14,6 +14,7 @@ authors = [
]

dependencies = [
"langfuse>=3.9.0,<4.0.0",
"pydantic>=2.0.0,<3.0.0",
"rich>=14.0.0,<15.0.0",
"strands-agents>=1.0.0",
36 changes: 36 additions & 0 deletions src/examples/goal_success_rate_evaluator_langfuse.py
@@ -0,0 +1,36 @@
from langfuse_utils.utils import get_langfuse_trace

from strands_evals import Case, Dataset
from strands_evals.evaluators import GoalSuccessRateEvaluator
from strands_evals.mappers.langfuse_mapper import LangfuseSessionMapper


# 1. Define a task function
def user_task_function(case: Case) -> dict:
    mapper = LangfuseSessionMapper()
    # Look up the trace ID recorded in the case metadata
    trace_id = case.metadata.get("trace_id", "") if case.metadata else ""
    # Fetch the trace from Langfuse and map it to a session
    langfuse_traces = get_langfuse_trace(trace_id=trace_id)
    session = mapper.map_to_session(langfuse_traces, trace_id)
    agent_response = mapper.get_agent_final_response(session)

    return {"output": str(agent_response), "trajectory": session}


# 2. Define test cases
test_cases = [
Case[str, str](
name="knowledge-1",
input="What is the status of my package",
metadata={"category": "knowledge", "trace_id": "13cc6db7f2e290b54bab3061a6fc5db0"},
),
]


# 3. Create an evaluator
evaluator = GoalSuccessRateEvaluator()

# 4. Create a dataset
dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator)

# 5. Run evaluations
report = dataset.run_evaluations(user_task_function)
report.run_display()
39 changes: 39 additions & 0 deletions src/examples/helpfulness_evaluator_langfuse.py
@@ -0,0 +1,39 @@
from langfuse_utils.utils import get_langfuse_trace

from strands_evals import Case, Dataset
from strands_evals.evaluators import HelpfulnessEvaluator
from strands_evals.mappers.langfuse_mapper import (
LangfuseSessionMapper,
)


# 1. Define a task function
def user_task_function(case: Case) -> dict:
    mapper = LangfuseSessionMapper()
    # Look up the trace ID recorded in the case metadata
    trace_id = case.metadata.get("trace_id", "") if case.metadata else ""
    # Fetch the trace from Langfuse and map it to a session
    langfuse_traces = get_langfuse_trace(trace_id=trace_id)
    session = mapper.map_to_session(langfuse_traces, trace_id)
    agent_response = mapper.get_agent_final_response(session)

    return {"output": str(agent_response), "trajectory": session}


# 2. Define test cases
test_cases = [
Case[str, str](
name="knowledge-1",
input="What is the status of my package",
metadata={"category": "knowledge", "trace_id": "13cc6db7f2e290b54bab3061a6fc5db0"},
),
]


# 3. Create an evaluator
evaluator = HelpfulnessEvaluator()

# 4. Create a dataset
dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator)

# 5. Run evaluations
report = dataset.run_evaluations(user_task_function)
report.run_display()