From 8b2234cd8e4980f425060210e5f1e8a3f02a8dd6 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 15 Mar 2026 16:20:59 -0400 Subject: [PATCH] Mark slow tests and exclude by default for faster local iteration Add @pytest.mark.slow to Sun-Abraham bootstrap tests (~696s), TROP parity tests (~98s), and all TROP tests. Set addopts to exclude slow tests by default, reducing local test time from ~17min to ~4min. CI workflows updated to pass -m '' to run all tests. Also vectorize SA bootstrap resampling loop (pre-compute unit-to-row index mapping, replace Python loop with np.repeat). Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/rust-test.yml | 10 +++++----- CLAUDE.md | 5 +++-- diff_diff/sun_abraham.py | 24 +++++++++--------------- pyproject.toml | 4 ++-- tests/test_rust_backend.py | 2 ++ tests/test_sun_abraham.py | 1 + tests/test_trop.py | 2 ++ 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/rust-test.yml b/.github/workflows/rust-test.yml index 5f07d76e..00fce5f5 100644 --- a/.github/workflows/rust-test.yml +++ b/.github/workflows/rust-test.yml @@ -135,24 +135,24 @@ jobs: - name: Run Rust backend tests (Unix) if: runner.os != 'Windows' working-directory: /tmp - run: pytest tests/test_rust_backend.py -v + run: pytest tests/test_rust_backend.py -v -m '' - name: Run Rust backend tests (Windows) if: runner.os == 'Windows' working-directory: ${{ runner.temp }} - run: pytest tests/test_rust_backend.py -v + run: pytest tests/test_rust_backend.py -v -m '' - name: Run tests with Rust backend (Unix) if: runner.os != 'Windows' working-directory: /tmp - run: DIFF_DIFF_BACKEND=rust pytest tests/ -q -n auto --dist worksteal + run: DIFF_DIFF_BACKEND=rust pytest tests/ -q -n auto --dist worksteal -m '' - name: Run tests with Rust backend (Windows) if: runner.os == 'Windows' working-directory: ${{ runner.temp }} run: | $env:DIFF_DIFF_BACKEND="rust" - pytest tests/ -q -n auto --dist worksteal + pytest tests/ -q -n auto --dist worksteal -m '' shell: pwsh # Test pure Python fallback (without Rust extension) @@ -177,4 +177,4 @@ jobs: PYTHONPATH=. python -c "from diff_diff import HAS_RUST_BACKEND; print(f'HAS_RUST_BACKEND: {HAS_RUST_BACKEND}'); assert not HAS_RUST_BACKEND" - name: Run tests in pure Python mode - run: PYTHONPATH=. DIFF_DIFF_BACKEND=python pytest tests/ -q --ignore=tests/test_rust_backend.py -n auto --dist worksteal + run: PYTHONPATH=. DIFF_DIFF_BACKEND=python pytest tests/ -q --ignore=tests/test_rust_backend.py -n auto --dist worksteal -m '' diff --git a/CLAUDE.md b/CLAUDE.md index c66c7211..cb5aff2e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -122,8 +122,9 @@ category (`Methodology/Correctness`, `Performance`, or `Testing/Docs`): `threshold = 0.40 if n_boot < 100 else 0.15`. - **`assert_nan_inference()`** from conftest.py: Use to validate ALL inference fields are NaN-consistent. Don't check individual fields separately. -- **Slow test suites**: `tests/test_trop.py` is very time-consuming. Skip with - `pytest --ignore=tests/test_trop.py` for unrelated changes. +- **Slow tests**: TROP, Sun-Abraham bootstrap, and TROP-parity tests are marked + `@pytest.mark.slow` and excluded by default via `addopts`. Run `pytest -m ''` + to include them, or `pytest -m slow` to run only slow tests. - **Behavioral assertions**: Always assert expected outcomes, not just no-exception. Bad: `result = func(bad_input)`. Good: `result = func(bad_input); assert np.isnan(result.coef)`. diff --git a/diff_diff/sun_abraham.py b/diff_diff/sun_abraham.py index 955e072a..82662ae5 100644 --- a/diff_diff/sun_abraham.py +++ b/diff_diff/sun_abraham.py @@ -1000,6 +1000,10 @@ def _run_bootstrap( all_units = df[unit].unique() n_units = len(all_units) + # Pre-compute unit -> row indices mapping (avoids repeated boolean scans) + unit_row_indices = {u: df.index[df[unit] == u].values for u in all_units} + unit_row_counts = {u: len(idx) for u, idx in unit_row_indices.items()} + # Store bootstrap samples rel_periods = sorted(original_event_study.keys()) bootstrap_effects = {e: np.zeros(self.n_bootstrap) for e in rel_periods} @@ -1009,23 +1013,13 @@ def _run_bootstrap( # Resample units with replacement (pairs bootstrap) boot_units = rng.choice(all_units, size=n_units, replace=True) - # Create bootstrap sample efficiently - # Build index array for all selected units - boot_indices = np.concatenate([ - df.index[df[unit] == u].values for u in boot_units - ]) + # Create bootstrap sample using pre-computed index mapping + boot_indices = np.concatenate([unit_row_indices[u] for u in boot_units]) df_b = df.iloc[boot_indices].copy() - # Reassign unique unit IDs for bootstrap sample - # Each resampled unit gets a unique ID - new_unit_ids = [] - current_id = 0 - for u in boot_units: - unit_rows = df[df[unit] == u] - for _ in range(len(unit_rows)): - new_unit_ids.append(current_id) - current_id += 1 - df_b[unit] = new_unit_ids[:len(df_b)] + # Reassign unique unit IDs (vectorized) + rows_per_unit = np.array([unit_row_counts[u] for u in boot_units]) + df_b[unit] = np.repeat(np.arange(n_units), rows_per_unit) # Recompute relative time (vectorized) df_b["_rel_time"] = np.where( diff --git a/pyproject.toml b/pyproject.toml index b058d7e0..d85a1240 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,8 +73,8 @@ python-packages = ["diff_diff"] [tool.pytest.ini_options] testpaths = ["tests"] python_files = "test_*.py" -# Run all tests including slow ones by default; use `pytest -m 'not slow'` for faster local runs -addopts = "-v --tb=short" +# Exclude slow tests by default; use `pytest -m ''` to run all tests +addopts = "-v --tb=short -m 'not slow'" markers = [ "slow: marks tests as slow (run `pytest -m 'not slow'` to exclude, or `pytest -m slow` to run only slow tests)", ] diff --git a/tests/test_rust_backend.py b/tests/test_rust_backend.py index ae2df18c..19213392 100644 --- a/tests/test_rust_backend.py +++ b/tests/test_rust_backend.py @@ -1149,6 +1149,7 @@ def test_trop_produces_valid_results(self): assert results.lambda_nn in [0.0, 0.1] +@pytest.mark.slow @pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available") class TestTROPJointRustBackend: """Test suite for TROP joint method Rust backend functions.""" @@ -1269,6 +1270,7 @@ def test_bootstrap_trop_variance_joint_reproducible(self): np.testing.assert_almost_equal(se1, se2) +@pytest.mark.slow @pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available") class TestTROPJointRustVsNumpy: """Tests comparing TROP joint Rust and NumPy implementations.""" diff --git a/tests/test_sun_abraham.py b/tests/test_sun_abraham.py index fde76a9a..b2a417ab 100644 --- a/tests/test_sun_abraham.py +++ b/tests/test_sun_abraham.py @@ -340,6 +340,7 @@ def test_invalid_level_error(self): results.to_dataframe(level="invalid") +@pytest.mark.slow class TestSunAbrahamBootstrap: """Tests for Sun-Abraham bootstrap inference.""" diff --git a/tests/test_trop.py b/tests/test_trop.py index 92b17b2b..d88f610e 100644 --- a/tests/test_trop.py +++ b/tests/test_trop.py @@ -6,6 +6,8 @@ import pandas as pd import pytest +pytestmark = pytest.mark.slow + from diff_diff import SyntheticDiD from diff_diff.trop import TROP, TROPResults, trop from diff_diff.prep import generate_factor_data