Skip to content

Pytest Plugin

Varun Pratap Bhardwaj edited this page Mar 6, 2026 · 1 revision

Pytest Plugin

AgentAssay integrates seamlessly with pytest through an automatic plugin.

Installation

The pytest plugin is included with the core package:

pip install agentassay

Verify the plugin is registered:

pytest --trace-config 2>&1 | grep agentassay

Basic Usage

1. Mark Tests with @pytest.mark.agentassay

import pytest
from agentassay.core.models import TestScenario
from agentassay.plugin.pytest_plugin import assert_pass_rate

@pytest.mark.agentassay(n=30, threshold=0.80)
def test_booking_agent(trial_runner):
    """Test that booking agent passes 80% of trials."""
    # Create the agent callable
    runner = trial_runner(my_booking_agent)

    # Define the scenario
    scenario = TestScenario(
        scenario_id="book-flight",
        name="Book a flight",
        input_data={"prompt": "Book a flight to NYC on March 15"},
        expected_properties={"max_steps": 10, "max_cost_usd": 0.50},
    )

    # Run trials
    results = runner.run_trials(scenario)

    # Assert pass rate with confidence
    passed = [r.passed for r in results]
    assert_pass_rate(passed, threshold=0.80, confidence=0.95)

2. Run with pytest

pytest test_booking_agent.py -v

Output:

test_booking_agent.py::test_booking_agent PASSED

============================= AgentAssay Summary ==============================
  assert_pass_rate: rate=93.33% [77.93%, 99.18%] (28/30 trials) verdict=PASS
  AgentAssay totals: 1 tests -- 1 PASS, 0 FAIL, 0 INCONCLUSIVE

Assertion Functions

assert_pass_rate

Assert that pass rate meets a threshold with statistical confidence.

from agentassay.plugin.pytest_plugin import assert_pass_rate

results = [True, True, False, True, True, ...]  # Trial outcomes

assert_pass_rate(results, threshold=0.80, confidence=0.95)

Raises AssertionError if:

  • Pass rate's upper confidence bound < threshold (FAIL verdict)
  • Confidence interval straddles threshold (INCONCLUSIVE verdict)

assert_no_regression

Detect regression between baseline and current versions.

from agentassay.plugin.pytest_plugin import assert_no_regression

baseline_results = [True] * 28 + [False] * 2   # 93% (28/30)
current_results = [True] * 20 + [False] * 10   # 67% (20/30)

assert_no_regression(baseline_results, current_results, alpha=0.05)

Raises AssertionError if:

  • Fisher's exact test p-value < alpha (regression detected)

assert_verdict

Assert a specific verdict.

from agentassay.plugin.pytest_plugin import assert_verdict
from agentassay.verdicts import Verdict

results = [True] * 25 + [False] * 5

assert_verdict(results, expected=Verdict.PASS, threshold=0.80)

Fixtures

trial_runner

Creates a TrialRunner configured with the test's parameters.

@pytest.mark.agentassay(n=50, threshold=0.85)
def test_my_agent(trial_runner):
    from agentassay.core.models import AgentConfig

    config = AgentConfig(
        agent_id="my-agent",
        name="My Agent",
        framework="custom",
        model="gpt-4o",
    )

    runner = trial_runner(my_agent_callable, agent_config=config)
    results = runner.run_trials(scenario)
    # ...

agentassay_config

Access the test's AgentAssay configuration.

@pytest.mark.agentassay(n=30, threshold=0.80)
def test_check_config(agentassay_config):
    assert agentassay_config.num_trials == 30
    assert agentassay_config.threshold == 0.80

Parameterized Tests

Use pytest's parameterization with AgentAssay:

@pytest.mark.parametrize("scenario_id,prompt", [
    ("greeting", "Hello, how are you?"),
    ("booking", "Book a flight to NYC"),
    ("cancel", "Cancel my reservation"),
])
@pytest.mark.agentassay(n=20, threshold=0.80)
def test_agent_scenarios(trial_runner, scenario_id, prompt):
    runner = trial_runner(my_agent)
    scenario = TestScenario(
        scenario_id=scenario_id,
        input_data={"prompt": prompt},
    )
    results = runner.run_trials(scenario)
    assert_pass_rate([r.passed for r in results], threshold=0.80)

Regression Testing with Fixtures

import pytest
from pathlib import Path
import json

@pytest.fixture
def baseline_results():
    """Load baseline results from file."""
    with open("tests/baselines/booking-v1.json") as f:
        data = json.load(f)
    return [r["passed"] for r in data["results"]]

@pytest.mark.agentassay(n=30)
def test_no_regression(trial_runner, baseline_results):
    """Compare current version against baseline."""
    runner = trial_runner(current_agent)
    results = runner.run_trials(booking_scenario)
    current_results = [r.passed for r in results]

    assert_no_regression(baseline_results, current_results, alpha=0.05)

Custom Evaluators

Define custom pass/fail logic:

from agentassay.core.models import TrialResult, ExecutionTrace

def my_evaluator(trace: ExecutionTrace) -> TrialResult:
    """Custom evaluation logic."""
    # Check custom properties
    used_search = any(step.tool_name == "search" for step in trace.steps)
    under_budget = trace.total_cost_usd < 0.10
    fast_enough = trace.total_duration_ms < 2000

    passed = trace.success and used_search and under_budget and fast_enough
    quality_score = 1.0 if passed else 0.0

    return TrialResult(
        trace=trace,
        passed=passed,
        quality_score=quality_score,
        evaluation_details={
            "used_search": used_search,
            "under_budget": under_budget,
            "fast_enough": fast_enough,
        },
    )

@pytest.mark.agentassay(n=30, threshold=0.80)
def test_with_custom_evaluator(trial_runner):
    runner = trial_runner(my_agent)
    runner.set_evaluator(my_evaluator)
    results = runner.run_trials(scenario)
    assert_pass_rate([r.passed for r in results], threshold=0.80)

Test Reporting

AgentAssay adds a summary section to pytest output:

============================= AgentAssay Summary ==============================
  test_greeting: rate=96.67% [82.78%, 99.92%] (29/30 trials) verdict=PASS
  test_booking: rate=80.00% [61.43%, 92.29%] (24/30 trials) verdict=PASS
  test_cancel: rate=73.33% [54.11%, 87.72%] (22/30 trials) verdict=INCONCLUSIVE

  AgentAssay totals: 3 tests -- 2 PASS, 0 FAIL, 1 INCONCLUSIVE

Adaptive Budget in Tests

Use adaptive budgeting to let each test compute its own N:

from agentassay.efficiency import AdaptiveBudgetOptimizer

@pytest.fixture
def calibrated_n(my_agent):
    """Compute optimal N via calibration."""
    calibration_traces = [my_agent({"prompt": "test"}) for _ in range(10)]
    optimizer = AdaptiveBudgetOptimizer(alpha=0.05, beta=0.10)
    estimate = optimizer.calibrate(calibration_traces)
    return estimate.recommended_n

def test_with_adaptive_n(trial_runner, calibrated_n):
    """Use calibrated N instead of fixed N."""
    runner = trial_runner(my_agent)
    # Override default N
    results = runner.run_trials(scenario, n=calibrated_n)
    assert_pass_rate([r.passed for r in results], threshold=0.80)

Next Steps


Part of Qualixar | Author: Varun Pratap Bhardwaj

Clone this wiki locally