-
Notifications
You must be signed in to change notification settings - Fork 0
Pytest Plugin
Varun Pratap Bhardwaj edited this page Mar 6, 2026
·
1 revision
AgentAssay integrates seamlessly with pytest through an automatic plugin.
The pytest plugin is included with the core package:
pip install agentassay

Verify the plugin is registered:

pytest --co -q 2>&1 | head -5

import pytest
from agentassay.core.models import TestScenario
from agentassay.plugin.pytest_plugin import assert_pass_rate
@pytest.mark.agentassay(n=30, threshold=0.80)
def test_booking_agent(trial_runner):
"""Test that booking agent passes 80% of trials."""
# Create the agent callable
runner = trial_runner(my_booking_agent)
# Define the scenario
scenario = TestScenario(
scenario_id="book-flight",
name="Book a flight",
input_data={"prompt": "Book a flight to NYC on March 15"},
expected_properties={"max_steps": 10, "max_cost_usd": 0.50},
)
# Run trials
results = runner.run_trials(scenario)
# Assert pass rate with confidence
passed = [r.passed for r in results]
assert_pass_rate(passed, threshold=0.80, confidence=0.95)

pytest test_booking_agent.py -v

Output:
test_booking_agent.py::test_booking_agent PASSED
============================= AgentAssay Summary ==============================
assert_pass_rate: rate=93.33% [77.93%, 99.18%] (28/30 trials) verdict=PASS
AgentAssay totals: 1 tests -- 1 PASS, 0 FAIL, 0 INCONCLUSIVE
Assert that pass rate meets a threshold with statistical confidence.
from agentassay.plugin.pytest_plugin import assert_pass_rate
results = [True, True, False, True, True, ...] # Trial outcomes
assert_pass_rate(results, threshold=0.80, confidence=0.95)

Raises AssertionError if:
- Pass rate's lower confidence bound < threshold (FAIL verdict)
- Confidence interval straddles threshold (INCONCLUSIVE verdict)
Detect regression between baseline and current versions.
from agentassay.plugin.pytest_plugin import assert_no_regression
baseline_results = [True] * 28 + [False] * 2 # 93% (28/30)
current_results = [True] * 20 + [False] * 10 # 67% (20/30)
assert_no_regression(baseline_results, current_results, alpha=0.05)

Raises AssertionError if:
- Fisher's exact test p-value < alpha (regression detected)
Assert a specific verdict.
from agentassay.plugin.pytest_plugin import assert_verdict
from agentassay.verdicts import Verdict
results = [True] * 25 + [False] * 5
assert_verdict(results, expected=Verdict.PASS, threshold=0.80)

Creates a TrialRunner configured with the test's parameters.
@pytest.mark.agentassay(n=50, threshold=0.85)
def test_my_agent(trial_runner):
from agentassay.core.models import AgentConfig
config = AgentConfig(
agent_id="my-agent",
name="My Agent",
framework="custom",
model="gpt-4o",
)
runner = trial_runner(my_agent_callable, agent_config=config)
results = runner.run_trials(scenario)
# ...

Access the test's AgentAssay configuration.
def test_check_config(agentassay_config):
assert agentassay_config.num_trials == 30
assert agentassay_config.threshold == 0.80

Use pytest's parameterization with AgentAssay:
@pytest.mark.parametrize("scenario_id,prompt", [
("greeting", "Hello, how are you?"),
("booking", "Book a flight to NYC"),
("cancel", "Cancel my reservation"),
])
@pytest.mark.agentassay(n=20, threshold=0.80)
def test_agent_scenarios(trial_runner, scenario_id, prompt):
runner = trial_runner(my_agent)
scenario = TestScenario(
scenario_id=scenario_id,
input_data={"prompt": prompt},
)
results = runner.run_trials(scenario)
assert_pass_rate([r.passed for r in results], threshold=0.80)

import pytest
from pathlib import Path
import json
@pytest.fixture
def baseline_results():
"""Load baseline results from file."""
with open("tests/baselines/booking-v1.json") as f:
data = json.load(f)
return [r["passed"] for r in data["results"]]
@pytest.mark.agentassay(n=30)
def test_no_regression(trial_runner, baseline_results):
"""Compare current version against baseline."""
runner = trial_runner(current_agent)
results = runner.run_trials(booking_scenario)
current_results = [r.passed for r in results]
assert_no_regression(baseline_results, current_results, alpha=0.05)

Define custom pass/fail logic:
from agentassay.core.models import TrialResult, ExecutionTrace
def my_evaluator(trace: ExecutionTrace) -> TrialResult:
"""Custom evaluation logic."""
# Check custom properties
used_search = any(step.tool_name == "search" for step in trace.steps)
under_budget = trace.total_cost_usd < 0.10
fast_enough = trace.total_duration_ms < 2000
passed = trace.success and used_search and under_budget and fast_enough
quality_score = 1.0 if passed else 0.0
return TrialResult(
trace=trace,
passed=passed,
quality_score=quality_score,
evaluation_details={
"used_search": used_search,
"under_budget": under_budget,
"fast_enough": fast_enough,
},
)
@pytest.mark.agentassay(n=30, threshold=0.80)
def test_with_custom_evaluator(trial_runner):
runner = trial_runner(my_agent)
runner.set_evaluator(my_evaluator)
results = runner.run_trials(scenario)
assert_pass_rate([r.passed for r in results], threshold=0.80)

AgentAssay adds a summary section to pytest output:
============================= AgentAssay Summary ==============================
test_greeting: rate=96.67% [82.78%, 99.92%] (29/30 trials) verdict=PASS
test_booking: rate=80.00% [61.43%, 92.29%] (24/30 trials) verdict=PASS
test_cancel: rate=73.33% [54.11%, 87.72%] (22/30 trials) verdict=INCONCLUSIVE
AgentAssay totals: 3 tests -- 2 PASS, 0 FAIL, 1 INCONCLUSIVE
Use adaptive budgeting to let each test compute its own N:
from agentassay.efficiency import AdaptiveBudgetOptimizer
@pytest.fixture
def calibrated_n(my_agent):
"""Compute optimal N via calibration."""
calibration_traces = [my_agent({"prompt": "test"}) for _ in range(10)]
optimizer = AdaptiveBudgetOptimizer(alpha=0.05, beta=0.10)
estimate = optimizer.calibrate(calibration_traces)
return estimate.recommended_n
def test_with_adaptive_n(trial_runner, calibrated_n):
"""Use calibrated N instead of fixed N."""
runner = trial_runner(my_agent)
# Override default N
results = runner.run_trials(scenario, n=calibrated_n)
assert_pass_rate([r.passed for r in results], threshold=0.80)

- CI/CD Integration — Set up deployment gates
- Configuration — YAML config files
- Statistical Methods — Understand the verdicts
Part of Qualixar | Author: Varun Pratap Bhardwaj
Getting Started
Core Concepts
- Token-Efficient Testing
- Behavioral Fingerprinting
- Statistical Methods
- Coverage Model
- Mutation Testing
Guides
Reference