From 974c66009c46220647a07acc3638479ccb108a6e Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 15 Feb 2026 11:45:27 -0500 Subject: [PATCH] Remove Rust outer-loop SDID variance estimation to fix SE mismatch and perf regression The Rust placebo/bootstrap variance paths used a different RNG (Xoshiro256PlusPlus) producing different permutation sequences than Python, causing SE divergence between backends. Rayon parallelism across all replications also saturated memory bandwidth at 1k+ scale (3-10x slower than pure Python). Inner Frank-Wolfe weight calls still dispatch to Rust for ~18x speedup over R. Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 11 + CLAUDE.md | 3 - diff_diff/_backend.py | 12 - diff_diff/synthetic_did.py | 103 ------- docs/benchmarks.rst | 141 +++++----- rust/src/lib.rs | 5 - rust/src/sdid_variance.rs | 498 --------------------------------- tests/test_methodology_sdid.py | 88 ++++++ tests/test_rust_backend.py | 274 ------------------ 9 files changed, 168 insertions(+), 967 deletions(-) delete mode 100644 rust/src/sdid_variance.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 5de161c2..4bfcda5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed +- Remove Rust outer-loop variance estimation for SyntheticDiD (placebo and bootstrap) + - Fixes SE mismatch between pure Python and Rust backends (different RNG sequences) + - Fixes Rust performance regression at 1k+ scale (memory bandwidth saturation from rayon parallelism) + - Inner Frank-Wolfe weight computation still uses Rust when available + +### Documentation +- Re-run SyntheticDiD benchmarks against R after Frank-Wolfe methodology rewrite +- Updated `docs/benchmarks.rst` SDID validation results, performance tables, and known differences +- ATT now matches R to < 1e-10 (previously 0.3% diff) since both use Frank-Wolfe optimizer + ## [2.3.0] - 2026-02-09 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index e1d76280..a6a62d11 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -167,9 +167,6 @@ cross-platform compilation - no OpenBLAS or Intel MKL installation required. - `compute_unit_distance_matrix()` - Parallel pairwise RMSE distance computation (4-8x speedup) - `loocv_grid_search()` - Parallel LOOCV across tuning parameters (10-50x speedup) - `bootstrap_trop_variance()` - Parallel bootstrap variance estimation (5-15x speedup) - - **`rust/src/sdid_variance.rs`** - SDID variance estimation acceleration: - - `placebo_variance_sdid()` - Parallel placebo SE computation (~8x speedup) - - `bootstrap_variance_sdid()` - Parallel bootstrap SE computation (~6x speedup) - Uses pure-Rust `faer` library for linear algebra (no external BLAS/LAPACK dependencies) - Cross-platform: builds on Linux, macOS, and Windows without additional setup - Provides 4-8x speedup for SyntheticDiD, 5-20x speedup for TROP diff --git a/diff_diff/_backend.py b/diff_diff/_backend.py index dd8af93c..954f94c4 100644 --- a/diff_diff/_backend.py +++ b/diff_diff/_backend.py @@ -35,9 +35,6 @@ compute_time_weights as _rust_compute_time_weights, compute_noise_level as _rust_compute_noise_level, sc_weight_fw as _rust_sc_weight_fw, - # SDID variance estimation (parallel placebo and bootstrap) - placebo_variance_sdid as _rust_placebo_variance_sdid, - bootstrap_variance_sdid as _rust_bootstrap_variance_sdid, ) _rust_available = True except ImportError: @@ -59,9 +56,6 @@ _rust_compute_time_weights = None _rust_compute_noise_level = None _rust_sc_weight_fw = None - # SDID variance estimation (parallel placebo and bootstrap) - _rust_placebo_variance_sdid = None - _rust_bootstrap_variance_sdid = None # Determine final backend based on environment variable and availability if _backend_env == 'python': @@ -84,9 +78,6 @@ _rust_compute_time_weights = None _rust_compute_noise_level = None _rust_sc_weight_fw = None - # SDID variance estimation (parallel placebo and bootstrap) - _rust_placebo_variance_sdid = None - _rust_bootstrap_variance_sdid = None elif _backend_env == 'rust': # Force Rust mode - fail if not available if not _rust_available: @@ -118,7 +109,4 @@ '_rust_compute_time_weights', '_rust_compute_noise_level', '_rust_sc_weight_fw', - # SDID variance estimation (parallel placebo and bootstrap) - '_rust_placebo_variance_sdid', - '_rust_bootstrap_variance_sdid', ] diff --git a/diff_diff/synthetic_did.py b/diff_diff/synthetic_did.py index 47f01438..f0591647 100644 --- a/diff_diff/synthetic_did.py +++ b/diff_diff/synthetic_did.py @@ -583,62 +583,6 @@ def _bootstrap_se( ]) n_pre = Y_pre_control.shape[0] - # Try Rust parallel implementation for ~6x speedup - from diff_diff._backend import HAS_RUST_BACKEND, _rust_bootstrap_variance_sdid - - if HAS_RUST_BACKEND and _rust_bootstrap_variance_sdid is not None: - # Generate random seed when self.seed is None - rust_seed = self.seed if self.seed is not None else int( - np.random.default_rng(None).integers(0, 2**63) - ) - - se, boot_arr, n_failed = _rust_bootstrap_variance_sdid( - np.ascontiguousarray(Y_pre_control, dtype=np.float64), - np.ascontiguousarray(Y_post_control, dtype=np.float64), - np.ascontiguousarray(Y_pre_treated, dtype=np.float64), - np.ascontiguousarray(Y_post_treated, dtype=np.float64), - np.ascontiguousarray(unit_weights, dtype=np.float64), - np.ascontiguousarray(time_weights, dtype=np.float64), - self.n_bootstrap, rust_seed, - ) - bootstrap_estimates = np.asarray(boot_arr) - n_successful = len(bootstrap_estimates) - failure_rate = 1 - (n_successful / self.n_bootstrap) - - # Apply same warning/error logic as Python path - if n_successful == 0: - raise ValueError( - f"All {self.n_bootstrap} bootstrap iterations failed. " - f"This typically occurs when:\n" - f" - Sample size is too small for reliable resampling\n" - f" - Weight matrices are singular or near-singular\n" - f" - Insufficient pre-treatment periods for weight estimation\n" - f" - Too few control units relative to treated units\n" - f"Consider using variance_method='placebo' or increasing " - f"the regularization parameters (zeta_omega, zeta_lambda)." - ) - elif n_successful == 1: - warnings.warn( - f"Only 1/{self.n_bootstrap} bootstrap iteration succeeded. " - f"Standard error cannot be computed reliably (requires at least 2). " - f"Returning SE=0.0. Consider using variance_method='placebo' or " - f"increasing the regularization (zeta_omega, zeta_lambda).", - UserWarning, - stacklevel=2, - ) - return 0.0, bootstrap_estimates - elif failure_rate > 0.05: - warnings.warn( - f"Only {n_successful}/{self.n_bootstrap} bootstrap iterations succeeded " - f"({failure_rate:.1%} failure rate). Standard errors may be unreliable. " - f"This can occur with small samples or insufficient pre-treatment periods.", - UserWarning, - stacklevel=2, - ) - - return se, bootstrap_estimates - - # Python fallback bootstrap_estimates = [] for _ in range(self.n_bootstrap): @@ -796,53 +740,6 @@ def _placebo_variance_se( ) return 0.0, np.array([]) - # Try Rust parallel implementation for ~8x speedup - from diff_diff._backend import HAS_RUST_BACKEND, _rust_placebo_variance_sdid - - if HAS_RUST_BACKEND and _rust_placebo_variance_sdid is not None: - # Generate random seed when self.seed is None (matching Python's non-reproducible behavior) - rust_seed = self.seed if self.seed is not None else int( - np.random.default_rng(None).integers(0, 2**63) - ) - - se, placebo_arr = _rust_placebo_variance_sdid( - np.ascontiguousarray(Y_pre_control, dtype=np.float64), - np.ascontiguousarray(Y_post_control, dtype=np.float64), - np.ascontiguousarray(Y_pre_treated_mean, dtype=np.float64), - np.ascontiguousarray(Y_post_treated_mean, dtype=np.float64), - n_treated, zeta_omega, zeta_lambda, min_decrease, - True, # intercept - 100, # max_iter_pre_sparsify - 10000, # max_iter - replications, rust_seed, - ) - placebo_estimates = np.asarray(placebo_arr) - n_successful = len(placebo_estimates) - - # Apply same warning/error logic as Python path - if n_successful < 2: - warnings.warn( - f"Only {n_successful} placebo replications completed successfully. " - f"Standard error cannot be estimated reliably. " - f"Consider using variance_method='bootstrap' or increasing " - f"the number of control units.", - UserWarning, - stacklevel=3, - ) - return 0.0, placebo_estimates - - failure_rate = 1 - (n_successful / replications) - if failure_rate > 0.05: - warnings.warn( - f"Only {n_successful}/{replications} placebo replications succeeded " - f"({failure_rate:.1%} failure rate). Standard errors may be unreliable.", - UserWarning, - stacklevel=3, - ) - - return se, placebo_estimates - - # Python fallback placebo_estimates = [] for _ in range(replications): diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst index eee42d90..3c4366a0 100644 --- a/docs/benchmarks.rst +++ b/docs/benchmarks.rst @@ -88,8 +88,8 @@ Summary Table - Yes - **PASS** * - SyntheticDiD - - 0.011 - - 3.1% + - < 1e-10 + - 0.3% - Yes - **PASS** @@ -173,32 +173,36 @@ Synthetic DiD Results :header-rows: 1 * - Metric - - diff-diff + - diff-diff (Pure) + - diff-diff (Rust) - R synthdid - Difference * - ATT - - 3.851 - 3.840 - - 0.011 (0.3%) + - 3.840 + - 3.840 + - < 1e-10 * - SE - - 0.106 - - 0.103 - - 3.1% + - 0.105 + - 0.099 + - 0.105 + - 0.3% (pure) * - Time (s) - - 0.017 - - 7.49 - - **433x faster** - -**Validation**: PASS - Both ATT and SE estimates match closely. Both implementations -use placebo-based variance estimation (R's Algorithm 4 from Arkhangelsky et al. 2021). - -The small SE difference (3.1%) is due to different unit/time weight optimization -algorithms: - -- diff-diff uses projected gradient descent -- R synthdid uses Frank-Wolfe optimization with adaptive regularization - -This leads to slightly different weights, which propagate to the placebo estimates. + - 3.41 + - 1.65 + - 8.19 + - **2.4x faster** (pure) + +**Validation**: PASS - ATT estimates are numerically identical across all +implementations. Both diff-diff and R's synthdid use Frank-Wolfe optimization +with two-pass sparsification and auto-computed regularization (``zeta_omega``, +``zeta_lambda``), producing identical unit and time weights. Both use +placebo-based variance estimation (Algorithm 4 from Arkhangelsky et al. 2021). + +The small SE difference (0.3% at small scale, up to ~7% at larger scales) is +due to Monte Carlo variance in the placebo procedure, which randomly permutes +control units to construct pseudo-treated groups. Different random seeds across +implementations produce slightly different placebo samples. Callaway-Sant'Anna Results ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -362,49 +366,38 @@ Three-Way Performance Summary - R (s) - Python Pure (s) - Python Rust (s) - - Rust/R + - Pure/R - Rust/Pure * - small - - 7.73 - - 0.016 - - 0.004 - - **2147x** - - **4.4x** + - 8.19 + - 3.41 + - 1.65 + - **2.4x** + - **2.1x** * - 1k - - 108.1 - - 0.068 - - 0.101 - - **1070x** - - 0.7x + - 111.7 + - 24.0 + - 76.1 + - **4.7x** + - 0.3x * - 5k - - 504.5 - - 2.97 - - 0.69 - - **734x** - - **4.3x** - * - 10k - - 1107.2 - - 19.54 - - 2.58 - - **429x** - - **7.6x** - * - 20k - - 2451.0 - - 137.3 - - 10.9 - - **225x** - - **12.6x** + - 524.2 + - 31.7 + - 307.5 + - **16.5x** + - 0.1x .. note:: - **SyntheticDiD Performance**: diff-diff achieves **225x to 2147x speedup** over - R's synthdid package. At 10k scale, R takes ~18 minutes while Python Rust - completes in 2.6 seconds. At 20k scale, R takes ~41 minutes while Python Rust - completes in 11 seconds. The Rust backend provides **4-13x additional speedup** - over pure Python for SyntheticDiD due to optimized simplex projection and - synthetic weight computation. ATT estimates differ slightly due to different - weight optimization algorithms (projected gradient descent vs Frank-Wolfe), - but confidence intervals overlap. + **SyntheticDiD Performance**: diff-diff's pure Python backend achieves + **2.4x to 16.5x speedup** over R's synthdid package using the same + Frank-Wolfe optimization algorithm. At 5k scale, R takes ~9 minutes while + pure Python completes in 32 seconds. ATT estimates are numerically identical + (< 1e-10 difference) since both implementations use the same Frank-Wolfe + optimizer with two-pass sparsification. The Rust backend provides a speedup + at small scale (2.1x over pure Python) but is slower at larger scales due to + overhead in the placebo variance estimation loop; this is a known area for + future optimization. Dataset Sizes ~~~~~~~~~~~~~ @@ -457,29 +450,30 @@ Key Observations - **BasicDiD/TWFE**: 2-17x faster than R at all scales - **CallawaySantAnna**: 4-11x faster than R at all scales (vectorized WIF computation) - - **SyntheticDiD**: 225-2147x faster than R (R takes 41 minutes at 20k scale!) + - **SyntheticDiD**: 2.4-16.5x faster than R (pure Python), with both + implementations using the same Frank-Wolfe algorithm 2. **Rust backend benefit depends on the estimator**: - - **SyntheticDiD**: Rust provides **4-13x speedup** over pure Python due to - optimized simplex projection and synthetic weight computation + - **SyntheticDiD**: Rust provides speedup at small scale (2.1x) but is + slower at larger scales due to placebo variance loop overhead - **BasicDiD/CallawaySantAnna**: Rust provides minimal benefit (~1x) since these estimators use OLS/variance computations already optimized in NumPy/SciPy 3. **When to use Rust backend**: - - **SyntheticDiD**: Recommended - provides significant speedup (4-13x) + - **SyntheticDiD at small scale**: Rust is ~2x faster than pure Python - **Bootstrap inference**: May help with parallelized iterations - **BasicDiD/CallawaySantAnna**: Optional - pure Python is equally fast 4. **Scaling behavior**: Python implementations show excellent scaling behavior - across all estimators. SyntheticDiD is 225x faster than R at 20k scale. - CallawaySantAnna achieves **exact SE accuracy** (0.0% difference) while - being 4-11x faster than R through vectorized NumPy operations. + across all estimators. SyntheticDiD pure Python is 16.5x faster than R at + 5k scale. CallawaySantAnna achieves **exact SE accuracy** (0.0% difference) + while being 4-11x faster than R through vectorized NumPy operations. 5. **No Rust required for most use cases**: Users without Rust/maturin can install diff-diff and get full functionality with excellent performance. - Only SyntheticDiD benefits significantly from the Rust backend. + Pure Python is the fastest option for SyntheticDiD at 1k+ scales. 6. **CallawaySantAnna accuracy and speed**: As of v2.0.3, CallawaySantAnna achieves both exact numerical accuracy (0.0% SE difference from R) AND @@ -650,9 +644,11 @@ When to Trust Results ``treated * time_f | unit`` interaction syntax (unit FE absorbed). Both average ATT and all period-level effects match to machine precision. Use with confidence. -- **SyntheticDiD**: Both point estimates (0.3% diff) and standard errors (3.1% diff) - match R closely. Use ``variance_method="placebo"`` (default) to match R's - inference. Results are fully validated. +- **SyntheticDiD**: Point estimates are numerically identical (< 1e-10 diff) and + standard errors match closely (0.3% diff at small scale). Both implementations + use Frank-Wolfe optimization with identical weights. Use + ``variance_method="placebo"`` (default) to match R's inference. Results are + fully validated. - **CallawaySantAnna**: Both group-time effects (ATT(g,t)) and overall ATT aggregation match R exactly. Standard errors are numerically equivalent @@ -668,9 +664,10 @@ Known Differences 2. **Aggregation Weights**: Overall ATT is a weighted average of ATT(g,t). Weighting schemes may differ between implementations. -3. **Weight Optimization**: SyntheticDiD uses different optimization algorithms - for unit/time weights (projected gradient descent vs Frank-Wolfe). This leads - to slightly different weights but equivalent ATT estimates. +3. **Placebo Variance**: SyntheticDiD SE estimates differ slightly (0.3-7%) + across implementations due to Monte Carlo variance in the placebo procedure. + Point estimates and unit/time weights are numerically identical since both + implementations use the same Frank-Wolfe optimizer. References ---------- diff --git a/rust/src/lib.rs b/rust/src/lib.rs index e7c232c1..15606281 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -7,7 +7,6 @@ use pyo3::prelude::*; mod bootstrap; mod linalg; -mod sdid_variance; mod trop; mod weights; @@ -43,10 +42,6 @@ fn _rust_backend(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(trop::loocv_grid_search_joint, m)?)?; m.add_function(wrap_pyfunction!(trop::bootstrap_trop_variance_joint, m)?)?; - // SDID variance estimation (parallel placebo and bootstrap) - m.add_function(wrap_pyfunction!(sdid_variance::placebo_variance_sdid, m)?)?; - m.add_function(wrap_pyfunction!(sdid_variance::bootstrap_variance_sdid, m)?)?; - // Version info m.add("__version__", env!("CARGO_PKG_VERSION"))?; diff --git a/rust/src/sdid_variance.rs b/rust/src/sdid_variance.rs deleted file mode 100644 index cae15cf4..00000000 --- a/rust/src/sdid_variance.rs +++ /dev/null @@ -1,498 +0,0 @@ -//! SDID variance estimation acceleration. -//! -//! This module provides parallelized implementations of: -//! - Placebo-based variance estimation (Algorithm 4, Arkhangelsky et al. 2021) -//! - Bootstrap variance estimation with fixed weights -//! -//! Both functions parallelize across replications using rayon, providing -//! near-linear speedup on multi-core machines (e.g., ~8x on 8 cores). -//! -//! Reference: -//! Arkhangelsky, D., Athey, S., Hirshberg, D. A., Imbens, G. W., & Wager, S. -//! (2021). Synthetic Difference-in-Differences. American Economic Review, -//! 111(12), 4088-4118. - -use ndarray::{Array1, Array2, Axis}; -use numpy::{PyArray1, PyReadonlyArray1, PyReadonlyArray2, ToPyArray}; -use pyo3::prelude::*; -use rayon::prelude::*; - -use crate::weights; - -/// Fisher-Yates permutation of 0..n using the provided RNG. -fn fisher_yates_permutation(rng: &mut impl rand::Rng, n: usize) -> Vec { - let mut perm: Vec = (0..n).collect(); - for i in (1..n).rev() { - let j = rng.gen_range(0..=i); - perm.swap(i, j); - } - perm -} - -/// Extract columns from a matrix by index. -fn extract_submatrix_cols(matrix: &Array2, col_indices: &[usize]) -> Array2 { - let n_rows = matrix.nrows(); - let n_cols = col_indices.len(); - let mut result = Array2::zeros((n_rows, n_cols)); - for (new_j, &old_j) in col_indices.iter().enumerate() { - for i in 0..n_rows { - result[[i, new_j]] = matrix[[i, old_j]]; - } - } - result -} - -/// Compute column means for selected columns. -fn column_means(matrix: &Array2, col_indices: &[usize]) -> Array1 { - let n_rows = matrix.nrows(); - let n_cols = col_indices.len(); - if n_cols == 0 { - return Array1::zeros(n_rows); - } - let mut means = Array1::zeros(n_rows); - for &j in col_indices { - for i in 0..n_rows { - means[i] += matrix[[i, j]]; - } - } - means /= n_cols as f64; - means -} - -/// Compute the SDID estimator (ATT) from pre/post control/treated data and weights. -/// -/// Matches `compute_sdid_estimator` in Python's `utils.py:1587-1604`: -/// -/// ```text -/// τ̂ = (Ȳ_treated,post - Σ_t λ_t * Y_treated,t) -/// - Σ_j ω_j * (Ȳ_j,post - Σ_t λ_t * Y_j,t) -/// ``` -fn sdid_estimator_internal( - y_pre_control: &Array2, - y_post_control: &Array2, - y_pre_treated_mean: &Array1, - y_post_treated_mean: &Array1, - unit_weights: &Array1, - time_weights: &Array1, -) -> f64 { - // Weighted pre-treatment averages - // time_weights @ Y_pre_control -> (n_control,) - let weighted_pre_control = time_weights.dot(&y_pre_control.view()); - // time_weights @ Y_pre_treated_mean -> scalar - let weighted_pre_treated = time_weights.dot(y_pre_treated_mean); - - // Post-treatment averages - let mean_post_control = y_post_control.mean_axis(Axis(0)).unwrap(); - let mean_post_treated = y_post_treated_mean.mean().unwrap(); - - // DiD for treated: post - weighted pre - let did_treated = mean_post_treated - weighted_pre_treated; - - // Weighted DiD for controls: sum over j of omega_j * (post_j - weighted_pre_j) - let diff_control = &mean_post_control - &weighted_pre_control; - let did_control = unit_weights.dot(&diff_control); - - // SDID estimator - did_treated - did_control -} - -/// Compute placebo-based variance for SDID in parallel. -/// -/// Implements Algorithm 4 from Arkhangelsky et al. (2021), matching R's -/// `synthdid::vcov(method = "placebo")`: -/// -/// 1. Randomly permute control indices -/// 2. Split into pseudo-controls and pseudo-treated -/// 3. Re-estimate both omega and lambda on the permuted data -/// 4. Compute SDID estimate with re-estimated weights -/// 5. Repeat `replications` times (in parallel) -/// 6. SE = sqrt((r-1)/r) * sd(estimates) -/// -/// # Arguments -/// * `y_pre_control` - Control outcomes in pre-treatment periods (n_pre, n_control) -/// * `y_post_control` - Control outcomes in post-treatment periods (n_post, n_control) -/// * `y_pre_treated_mean` - Mean treated outcomes in pre-treatment (n_pre,) -/// * `y_post_treated_mean` - Mean treated outcomes in post-treatment (n_post,) -/// * `n_treated` - Number of treated units in the original estimation -/// * `zeta_omega` - Regularization parameter for unit weights -/// * `zeta_lambda` - Regularization parameter for time weights -/// * `min_decrease` - Convergence threshold for Frank-Wolfe -/// * `intercept` - Column-center if true (default: true) -/// * `max_iter_pre_sparsify` - Iterations for first pass (default: 100) -/// * `max_iter` - Iterations for second pass (default: 10000) -/// * `replications` - Number of placebo replications -/// * `seed` - Random seed for reproducibility -/// -/// # Returns -/// (se, placebo_estimates) where se is the standard error and -/// placebo_estimates is the array of successful placebo effects. -#[pyfunction] -#[pyo3(signature = (y_pre_control, y_post_control, y_pre_treated_mean, y_post_treated_mean, - n_treated, zeta_omega, zeta_lambda, min_decrease, - intercept=true, max_iter_pre_sparsify=100, max_iter=10000, - replications=200, seed=42))] -#[allow(clippy::too_many_arguments)] -pub fn placebo_variance_sdid<'py>( - py: Python<'py>, - y_pre_control: PyReadonlyArray2<'py, f64>, - y_post_control: PyReadonlyArray2<'py, f64>, - y_pre_treated_mean: PyReadonlyArray1<'py, f64>, - y_post_treated_mean: PyReadonlyArray1<'py, f64>, - n_treated: usize, - zeta_omega: f64, - zeta_lambda: f64, - min_decrease: f64, - intercept: bool, - max_iter_pre_sparsify: usize, - max_iter: usize, - replications: usize, - seed: u64, -) -> PyResult<(f64, Bound<'py, PyArray1>)> { - // Convert to owned arrays for Send across threads - let y_pre_c = y_pre_control.as_array().to_owned(); - let y_post_c = y_post_control.as_array().to_owned(); - let _y_pre_t_mean = y_pre_treated_mean.as_array().to_owned(); - let _y_post_t_mean = y_post_treated_mean.as_array().to_owned(); - - let n_control = y_pre_c.ncols(); - - // Check if we have enough controls for the split - let n_pseudo_control = n_control.saturating_sub(n_treated); - if n_pseudo_control < 1 { - let empty = Array1::::zeros(0); - return Ok((0.0, empty.to_pyarray_bound(py))); - } - - // Parallel loop over replications - let placebo_estimates: Vec = (0..replications) - .into_par_iter() - .filter_map(|b| { - use rand::prelude::*; - use rand_xoshiro::Xoshiro256PlusPlus; - - let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(b as u64)); - - // Generate random permutation of control indices - let perm = fisher_yates_permutation(&mut rng, n_control); - - // Split into pseudo-controls and pseudo-treated - let pseudo_control_idx = &perm[..n_pseudo_control]; - let pseudo_treated_idx = &perm[n_pseudo_control..]; - - // Extract submatrices - let y_pre_pseudo_control = extract_submatrix_cols(&y_pre_c, pseudo_control_idx); - let y_post_pseudo_control = extract_submatrix_cols(&y_post_c, pseudo_control_idx); - let y_pre_pseudo_treated_mean = column_means(&y_pre_c, pseudo_treated_idx); - let y_post_pseudo_treated_mean = column_means(&y_post_c, pseudo_treated_idx); - - // Re-estimate unit weights on permuted data - let pseudo_omega = weights::compute_sdid_unit_weights_internal( - &y_pre_pseudo_control.view(), - &y_pre_pseudo_treated_mean.view(), - zeta_omega, - intercept, - min_decrease, - max_iter_pre_sparsify, - max_iter, - ); - - // Re-estimate time weights on permuted data - let pseudo_lambda = weights::compute_time_weights_internal( - &y_pre_pseudo_control.view(), - &y_post_pseudo_control.view(), - zeta_lambda, - intercept, - min_decrease, - max_iter_pre_sparsify, - max_iter, - ); - - // Compute placebo SDID estimate - let tau = sdid_estimator_internal( - &y_pre_pseudo_control, - &y_post_pseudo_control, - &y_pre_pseudo_treated_mean, - &y_post_pseudo_treated_mean, - &pseudo_omega, - &pseudo_lambda, - ); - - if tau.is_finite() { - Some(tau) - } else { - None - } - }) - .collect(); - - let n_successful = placebo_estimates.len(); - - // Compute SE: sqrt((r-1)/r) * sd(estimates, ddof=1) matching R's formula - let se = if n_successful < 2 { - 0.0 - } else { - let n = n_successful as f64; - let mean = placebo_estimates.iter().sum::() / n; - let variance = placebo_estimates - .iter() - .map(|x| (x - mean).powi(2)) - .sum::() - / (n - 1.0); - ((n - 1.0) / n).sqrt() * variance.sqrt() - }; - - let estimates_arr = Array1::from_vec(placebo_estimates); - Ok((se, estimates_arr.to_pyarray_bound(py))) -} - -/// Compute bootstrap variance for SDID in parallel. -/// -/// Resamples all units (control + treated) with replacement, renormalizes -/// original unit weights for the resampled controls, and computes the -/// SDID estimator with **fixed** weights (no re-estimation). -/// -/// This matches R's `synthdid::vcov(method="bootstrap")`. -/// -/// # Arguments -/// * `y_pre_control` - Control outcomes in pre-treatment (n_pre, n_control) -/// * `y_post_control` - Control outcomes in post-treatment (n_post, n_control) -/// * `y_pre_treated` - Treated outcomes in pre-treatment (n_pre, n_treated) -/// * `y_post_treated` - Treated outcomes in post-treatment (n_post, n_treated) -/// * `unit_weights` - Original unit weights (n_control,) -/// * `time_weights` - Original time weights (n_pre,) -/// * `n_bootstrap` - Number of bootstrap iterations -/// * `seed` - Random seed -/// -/// # Returns -/// (se, bootstrap_estimates, n_failed) -#[pyfunction] -#[pyo3(signature = (y_pre_control, y_post_control, y_pre_treated, y_post_treated, - unit_weights, time_weights, n_bootstrap, seed))] -#[allow(clippy::too_many_arguments)] -pub fn bootstrap_variance_sdid<'py>( - py: Python<'py>, - y_pre_control: PyReadonlyArray2<'py, f64>, - y_post_control: PyReadonlyArray2<'py, f64>, - y_pre_treated: PyReadonlyArray2<'py, f64>, - y_post_treated: PyReadonlyArray2<'py, f64>, - unit_weights: PyReadonlyArray1<'py, f64>, - time_weights: PyReadonlyArray1<'py, f64>, - n_bootstrap: usize, - seed: u64, -) -> PyResult<(f64, Bound<'py, PyArray1>, usize)> { - // Convert to owned arrays for Send across threads - let y_pre_c = y_pre_control.as_array().to_owned(); - let y_post_c = y_post_control.as_array().to_owned(); - let y_pre_t = y_pre_treated.as_array().to_owned(); - let y_post_t = y_post_treated.as_array().to_owned(); - let omega = unit_weights.as_array().to_owned(); - let lambda = time_weights.as_array().to_owned(); - - let n_control = y_pre_c.ncols(); - let n_treated = y_pre_t.ncols(); - let n_total = n_control + n_treated; - let n_pre = y_pre_c.nrows(); - - // Build full panel: (n_pre+n_post, n_control+n_treated) - let n_post = y_post_c.nrows(); - let n_times = n_pre + n_post; - let mut y_full = Array2::zeros((n_times, n_total)); - for t in 0..n_pre { - for j in 0..n_control { - y_full[[t, j]] = y_pre_c[[t, j]]; - } - for j in 0..n_treated { - y_full[[t, n_control + j]] = y_pre_t[[t, j]]; - } - } - for t in 0..n_post { - for j in 0..n_control { - y_full[[n_pre + t, j]] = y_post_c[[t, j]]; - } - for j in 0..n_treated { - y_full[[n_pre + t, n_control + j]] = y_post_t[[t, j]]; - } - } - - // Parallel bootstrap loop - let bootstrap_estimates: Vec = (0..n_bootstrap) - .into_par_iter() - .filter_map(|b| { - use rand::prelude::*; - use rand_xoshiro::Xoshiro256PlusPlus; - - let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(b as u64)); - - // Resample all units with replacement - let boot_idx: Vec = (0..n_total) - .map(|_| rng.gen_range(0..n_total)) - .collect(); - - // Identify control vs treated in resampled set - let mut boot_control_idx = Vec::new(); - let mut boot_treated_idx = Vec::new(); - for &idx in &boot_idx { - if idx < n_control { - boot_control_idx.push(idx); - } else { - boot_treated_idx.push(idx); - } - } - - // Skip degenerate samples - if boot_control_idx.is_empty() || boot_treated_idx.is_empty() { - return None; - } - - // Renormalize unit weights for resampled controls - let boot_omega_raw: Array1 = - Array1::from_iter(boot_control_idx.iter().map(|&i| omega[i])); - let boot_omega = weights::sum_normalize_internal(&boot_omega_raw); - - let n_boot_control = boot_control_idx.len(); - let n_boot_treated = boot_treated_idx.len(); - - // Extract resampled submatrices - let mut y_boot_pre_c = Array2::zeros((n_pre, n_boot_control)); - let mut y_boot_post_c = Array2::zeros((n_post, n_boot_control)); - for (new_j, &old_j) in boot_control_idx.iter().enumerate() { - for t in 0..n_pre { - y_boot_pre_c[[t, new_j]] = y_full[[t, old_j]]; - } - for t in 0..n_post { - y_boot_post_c[[t, new_j]] = y_full[[n_pre + t, old_j]]; - } - } - - // Compute treated means for resampled treated units - let mut y_boot_pre_t_mean = Array1::zeros(n_pre); - let mut y_boot_post_t_mean = Array1::zeros(n_post); - for &old_j in &boot_treated_idx { - for t in 0..n_pre { - y_boot_pre_t_mean[t] += y_full[[t, old_j]]; - } - for t in 0..n_post { - y_boot_post_t_mean[t] += y_full[[n_pre + t, old_j]]; - } - } - y_boot_pre_t_mean /= n_boot_treated as f64; - y_boot_post_t_mean /= n_boot_treated as f64; - - // Compute ATT with FIXED time_weights and renormalized omega - let tau = sdid_estimator_internal( - &y_boot_pre_c, - &y_boot_post_c, - &y_boot_pre_t_mean, - &y_boot_post_t_mean, - &boot_omega, - &lambda, - ); - - if tau.is_finite() { - Some(tau) - } else { - None - } - }) - .collect(); - - let n_successful = bootstrap_estimates.len(); - let n_failed = n_bootstrap - n_successful; - - // Compute SE: std(estimates, ddof=1) — NO sqrt((r-1)/r) for bootstrap - let se = if n_successful < 2 { - 0.0 - } else { - let n = n_successful as f64; - let mean = bootstrap_estimates.iter().sum::() / n; - let variance = bootstrap_estimates - .iter() - .map(|x| (x - mean).powi(2)) - .sum::() - / (n - 1.0); - variance.sqrt() - }; - - let estimates_arr = Array1::from_vec(bootstrap_estimates); - Ok((se, estimates_arr.to_pyarray_bound(py), n_failed)) -} - -#[cfg(test)] -mod tests { - use super::*; - use ndarray::array; - - #[test] - fn test_fisher_yates_permutation_length() { - use rand::SeedableRng; - use rand_xoshiro::Xoshiro256PlusPlus; - let mut rng = Xoshiro256PlusPlus::seed_from_u64(42); - let perm = fisher_yates_permutation(&mut rng, 10); - assert_eq!(perm.len(), 10); - // All elements should be present - let mut sorted = perm.clone(); - sorted.sort(); - assert_eq!(sorted, (0..10).collect::>()); - } - - #[test] - fn test_fisher_yates_permutation_different_seeds() { - use rand::SeedableRng; - use rand_xoshiro::Xoshiro256PlusPlus; - let mut rng1 = Xoshiro256PlusPlus::seed_from_u64(42); - let mut rng2 = Xoshiro256PlusPlus::seed_from_u64(43); - let perm1 = fisher_yates_permutation(&mut rng1, 20); - let perm2 = fisher_yates_permutation(&mut rng2, 20); - assert_ne!(perm1, perm2); - } - - #[test] - fn test_extract_submatrix_cols() { - let m = array![[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]]; - let sub = extract_submatrix_cols(&m, &[0, 2]); - assert_eq!(sub.nrows(), 2); - assert_eq!(sub.ncols(), 2); - assert!((sub[[0, 0]] - 1.0).abs() < 1e-10); - assert!((sub[[0, 1]] - 3.0).abs() < 1e-10); - assert!((sub[[1, 0]] - 5.0).abs() < 1e-10); - assert!((sub[[1, 1]] - 7.0).abs() < 1e-10); - } - - #[test] - fn test_column_means() { - let m = array![[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]]; - let means = column_means(&m, &[1, 3]); - // Mean of columns 1 and 3: - // Row 0: (2+4)/2 = 3 - // Row 1: (6+8)/2 = 7 - assert!((means[0] - 3.0).abs() < 1e-10); - assert!((means[1] - 7.0).abs() < 1e-10); - } - - #[test] - fn test_sdid_estimator_internal_zero_weights() { - // When all time weights are on last pre-period and unit weights are uniform, - // should reduce to a simpler DiD - let y_pre_c = array![[1.0, 2.0], [3.0, 4.0]]; // (2 pre, 2 control) - let y_post_c = array![[5.0, 6.0]]; // (1 post, 2 control) - let y_pre_t = array![2.0, 4.0]; // (2 pre,) means - let y_post_t = array![10.0]; // (1 post,) means - - let omega = array![0.5, 0.5]; // uniform - let lambda = array![0.0, 1.0]; // all weight on last pre-period - - let tau = sdid_estimator_internal( - &y_pre_c, &y_post_c, &y_pre_t, &y_post_t, &omega, &lambda, - ); - - // Manual computation: - // weighted_pre_control = [0, 1] @ [[1,2],[3,4]] = [3, 4] - // weighted_pre_treated = [0, 1] @ [2, 4] = 4 - // mean_post_control = [5, 6] - // mean_post_treated = 10 - // did_treated = 10 - 4 = 6 - // did_control = [0.5, 0.5] @ ([5,6] - [3,4]) = [0.5, 0.5] @ [2, 2] = 2 - // tau = 6 - 2 = 4 - assert!((tau - 4.0).abs() < 1e-10, "Expected 4.0, got {}", tau); - } -} diff --git a/tests/test_methodology_sdid.py b/tests/test_methodology_sdid.py index 3eb86645..dd3cb40d 100644 --- a/tests/test_methodology_sdid.py +++ b/tests/test_methodology_sdid.py @@ -1039,6 +1039,94 @@ def test_floor_equivalence(self): assert abs(w_floor.sum() - 1.0) < 1e-10, "Weights should sum to 1" +class TestBackendSEConsistency: + """Test that pure Python and Rust-accelerated backends produce identical SEs.""" + + def test_placebo_se_matches_across_backends(self): + """SyntheticDiD placebo SE is identical regardless of backend. + + After removing the Rust outer-loop variance fast paths, both backends + use the same Python loop for placebo/bootstrap variance. The only + difference is that inner Frank-Wolfe weight calls may dispatch to Rust + (faer) vs NumPy. With the same seed, RNG sequences are identical, so + SEs should match within floating-point tolerance (rtol=1e-4 allows for + accumulated faer vs NumPy differences across hundreds of iterations). + """ + import sys + + df = _make_panel(n_control=20, n_treated=3, n_pre=5, n_post=3, + att=5.0, seed=42) + post_periods = list(range(5, 8)) + + # Run with default backend (Rust-accelerated inner calls if available) + sdid_default = SyntheticDiD( + variance_method="placebo", n_bootstrap=50, seed=42 + ) + results_default = sdid_default.fit( + df, "outcome", "treated", "unit", "period", post_periods + ) + + # Run with pure Python backend + utils_mod = sys.modules["diff_diff.utils"] + with patch.object(utils_mod, "HAS_RUST_BACKEND", False): + sdid_py = SyntheticDiD( + variance_method="placebo", n_bootstrap=50, seed=42 + ) + results_py = sdid_py.fit( + df.copy(), "outcome", "treated", "unit", "period", post_periods + ) + + # ATT must be identical (same data, same algorithm) + np.testing.assert_allclose( + results_default.att, results_py.att, rtol=1e-6, + err_msg="ATT should match between backends" + ) + + # SE must match within tolerance (faer vs NumPy in FW inner loop) + np.testing.assert_allclose( + results_default.se, results_py.se, rtol=1e-4, + err_msg="Placebo SE should match between backends" + ) + + def test_bootstrap_se_matches_across_backends(self): + """SyntheticDiD bootstrap SE is identical regardless of backend.""" + import sys + + df = _make_panel(n_control=20, n_treated=3, n_pre=5, n_post=3, + att=5.0, seed=42) + post_periods = list(range(5, 8)) + + # Run with default backend + sdid_default = SyntheticDiD( + variance_method="bootstrap", n_bootstrap=50, seed=42 + ) + results_default = sdid_default.fit( + df, "outcome", "treated", "unit", "period", post_periods + ) + + # Run with pure Python backend + utils_mod = sys.modules["diff_diff.utils"] + with patch.object(utils_mod, "HAS_RUST_BACKEND", False): + sdid_py = SyntheticDiD( + variance_method="bootstrap", n_bootstrap=50, seed=42 + ) + results_py = sdid_py.fit( + df.copy(), "outcome", "treated", "unit", "period", post_periods + ) + + # ATT must match + np.testing.assert_allclose( + results_default.att, results_py.att, rtol=1e-6, + err_msg="ATT should match between backends" + ) + + # Bootstrap SE must match (same RNG, same loop, only inner FW differs) + np.testing.assert_allclose( + results_default.se, results_py.se, rtol=1e-4, + err_msg="Bootstrap SE should match between backends" + ) + + class TestEmptyPostGuard: """Test compute_time_weights guard for empty Y_post_control.""" diff --git a/tests/test_rust_backend.py b/tests/test_rust_backend.py index 6bcc2698..376585b6 100644 --- a/tests/test_rust_backend.py +++ b/tests/test_rust_backend.py @@ -1871,280 +1871,6 @@ def test_full_sdid_rust_vs_python(self): ) -@pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available") -class TestSDIDVarianceRustBackend: - """Test suite for SDID parallel variance estimation in Rust.""" - - @staticmethod - def _make_sdid_data(n_control=10, n_treated=2, n_pre=6, n_post=2, - true_effect=3.0, seed=42): - """Create well-conditioned SDID test data.""" - rng = np.random.default_rng(seed) - Y_pre_control = rng.standard_normal((n_pre, n_control)) - Y_post_control = rng.standard_normal((n_post, n_control)) - Y_pre_treated = rng.standard_normal((n_pre, n_treated)) - Y_post_treated = Y_pre_treated[:n_post, :] + true_effect + rng.standard_normal((n_post, n_treated)) * 0.3 - - Y_pre_treated_mean = np.mean(Y_pre_treated, axis=1) - Y_post_treated_mean = np.mean(Y_post_treated, axis=1) - - return (Y_pre_control, Y_post_control, Y_pre_treated, - Y_post_treated, Y_pre_treated_mean, Y_post_treated_mean) - - def test_placebo_se_returns_valid(self): - """Rust placebo SE produces valid float > 0.""" - from diff_diff._rust_backend import placebo_variance_sdid - - data = self._make_sdid_data() - Y_pre_c, Y_post_c, _, _, Y_pre_t_mean, Y_post_t_mean = data - - se, estimates = placebo_variance_sdid( - Y_pre_c, Y_post_c, Y_pre_t_mean, Y_post_t_mean, - 2, # n_treated - 0.5, # zeta_omega - 0.01, # zeta_lambda - 1e-5, # min_decrease - True, 100, 10000, 50, 42, - ) - - assert isinstance(se, float) - assert se > 0, f"SE should be > 0, got {se}" - assert np.isfinite(se), f"SE should be finite, got {se}" - assert len(estimates) > 0, "Should have some estimates" - - def test_placebo_se_reproducible(self): - """Same seed produces same SE and estimates.""" - from diff_diff._rust_backend import placebo_variance_sdid - - data = self._make_sdid_data() - Y_pre_c, Y_post_c, _, _, Y_pre_t_mean, Y_post_t_mean = data - - args = (Y_pre_c, Y_post_c, Y_pre_t_mean, Y_post_t_mean, - 2, 0.5, 0.01, 1e-5, True, 100, 10000, 30, 42) - - se1, est1 = placebo_variance_sdid(*args) - se2, est2 = placebo_variance_sdid(*args) - - assert abs(se1 - se2) < 1e-10, f"SEs should match: {se1} vs {se2}" - np.testing.assert_array_equal(est1, est2) - - def test_placebo_estimates_correct_count(self): - """All replications succeed on well-conditioned data.""" - from diff_diff._rust_backend import placebo_variance_sdid - - data = self._make_sdid_data() - Y_pre_c, Y_post_c, _, _, Y_pre_t_mean, Y_post_t_mean = data - replications = 50 - - _, estimates = placebo_variance_sdid( - Y_pre_c, Y_post_c, Y_pre_t_mean, Y_post_t_mean, - 2, 0.5, 0.01, 1e-5, True, 100, 10000, replications, 42, - ) - - assert len(estimates) == replications, \ - f"Expected {replications} estimates, got {len(estimates)}" - - def test_placebo_se_statistically_matches_python(self): - """Rust and Python SEs are within 30% (different RNG sequences).""" - from diff_diff._rust_backend import placebo_variance_sdid - from diff_diff.utils import ( - compute_sdid_unit_weights as py_unit_weights, - compute_time_weights as py_time_weights, - compute_sdid_estimator, - ) - from diff_diff.utils import _sum_normalize - - data = self._make_sdid_data(seed=123) - Y_pre_c, Y_post_c, _, _, Y_pre_t_mean, Y_post_t_mean = data - n_treated = 2 - replications = 100 - - # Rust SE (average over multiple seeds for stability) - rust_ses = [] - for seed in [42, 43, 44]: - se_r, _ = placebo_variance_sdid( - Y_pre_c, Y_post_c, Y_pre_t_mean, Y_post_t_mean, - n_treated, 0.5, 0.01, 1e-5, True, 100, 10000, replications, seed, - ) - rust_ses.append(se_r) - rust_mean_se = np.mean(rust_ses) - - # Python SE (average over multiple seeds) - python_ses = [] - for seed in [42, 43, 44]: - rng = np.random.default_rng(seed) - n_control = Y_pre_c.shape[1] - n_pseudo_control = n_control - n_treated - estimates = [] - for _ in range(replications): - perm = rng.permutation(n_control) - pc_idx = perm[:n_pseudo_control] - pt_idx = perm[n_pseudo_control:] - y_pre_pc = Y_pre_c[:, pc_idx] - y_post_pc = Y_post_c[:, pc_idx] - y_pre_pt_mean = np.mean(Y_pre_c[:, pt_idx], axis=1) - y_post_pt_mean = np.mean(Y_post_c[:, pt_idx], axis=1) - omega = py_unit_weights(y_pre_pc, y_pre_pt_mean, zeta_omega=0.5, min_decrease=1e-5) - lam = py_time_weights(y_pre_pc, y_post_pc, zeta_lambda=0.01, min_decrease=1e-5) - tau = compute_sdid_estimator(y_pre_pc, y_post_pc, y_pre_pt_mean, y_post_pt_mean, omega, lam) - estimates.append(tau) - n = len(estimates) - py_se = np.sqrt((n - 1) / n) * np.std(estimates, ddof=1) - python_ses.append(py_se) - python_mean_se = np.mean(python_ses) - - # SEs should be in the same ballpark (within 30%) - ratio = rust_mean_se / python_mean_se if python_mean_se > 0 else float('inf') - assert 0.7 < ratio < 1.3, \ - f"SE ratio {ratio:.2f} out of range: Rust={rust_mean_se:.4f}, Python={python_mean_se:.4f}" - - def test_placebo_insufficient_controls(self): - """Returns (0.0, []) when n_control <= n_treated.""" - from diff_diff._rust_backend import placebo_variance_sdid - - Y_pre_c = np.random.randn(5, 2) - Y_post_c = np.random.randn(2, 2) - Y_pre_t_mean = np.random.randn(5) - Y_post_t_mean = np.random.randn(2) - - se, estimates = placebo_variance_sdid( - Y_pre_c, Y_post_c, Y_pre_t_mean, Y_post_t_mean, - 3, # n_treated > n_control=2 - 0.5, 0.01, 1e-5, True, 100, 10000, 50, 42, - ) - - assert se == 0.0 - assert len(estimates) == 0 - - def test_bootstrap_se_returns_valid(self): - """Rust bootstrap SE produces valid float > 0.""" - from diff_diff._rust_backend import bootstrap_variance_sdid - - data = self._make_sdid_data() - Y_pre_c, Y_post_c, Y_pre_t, Y_post_t, _, _ = data - - # Need unit/time weights - omega = np.ones(Y_pre_c.shape[1]) / Y_pre_c.shape[1] - lam = np.ones(Y_pre_c.shape[0]) / Y_pre_c.shape[0] - - se, estimates, n_failed = bootstrap_variance_sdid( - Y_pre_c, Y_post_c, Y_pre_t, Y_post_t, - omega, lam, 50, 42, - ) - - assert isinstance(se, float) - assert se > 0, f"SE should be > 0, got {se}" - assert np.isfinite(se) - assert len(estimates) > 0 - - def test_bootstrap_se_reproducible(self): - """Same seed produces same SE.""" - from diff_diff._rust_backend import bootstrap_variance_sdid - - data = self._make_sdid_data() - Y_pre_c, Y_post_c, Y_pre_t, Y_post_t, _, _ = data - omega = np.ones(Y_pre_c.shape[1]) / Y_pre_c.shape[1] - lam = np.ones(Y_pre_c.shape[0]) / Y_pre_c.shape[0] - - args = (Y_pre_c, Y_post_c, Y_pre_t, Y_post_t, omega, lam, 30, 42) - - se1, est1, _ = bootstrap_variance_sdid(*args) - se2, est2, _ = bootstrap_variance_sdid(*args) - - assert abs(se1 - se2) < 1e-10 - np.testing.assert_array_equal(est1, est2) - - def test_bootstrap_se_statistically_matches_python(self): - """Rust and Python bootstrap SEs are within 30%.""" - from diff_diff._rust_backend import bootstrap_variance_sdid - from diff_diff.utils import compute_sdid_estimator, _sum_normalize - - data = self._make_sdid_data(seed=99) - Y_pre_c, Y_post_c, Y_pre_t, Y_post_t, _, _ = data - n_control = Y_pre_c.shape[1] - n_treated = Y_pre_t.shape[1] - n_total = n_control + n_treated - n_pre = Y_pre_c.shape[0] - n_bootstrap = 200 - - omega = np.ones(n_control) / n_control - lam = np.ones(n_pre) / n_pre - - # Rust SE - se_rust, _, _ = bootstrap_variance_sdid( - Y_pre_c, Y_post_c, Y_pre_t, Y_post_t, - omega, lam, n_bootstrap, 42, - ) - - # Python SE - Y_full = np.block([ - [Y_pre_c, Y_pre_t], - [Y_post_c, Y_post_t] - ]) - rng = np.random.default_rng(42) - py_estimates = [] - for _ in range(n_bootstrap): - boot_idx = rng.choice(n_total, size=n_total, replace=True) - boot_is_control = boot_idx < n_control - if not np.any(boot_is_control) or np.all(boot_is_control): - continue - boot_omega = _sum_normalize(omega[boot_idx[boot_is_control]]) - Y_boot = Y_full[:, boot_idx] - Y_boot_pre_t_mean = np.mean(Y_boot[:n_pre, ~boot_is_control], axis=1) - Y_boot_post_t_mean = np.mean(Y_boot[n_pre:, ~boot_is_control], axis=1) - tau = compute_sdid_estimator( - Y_boot[:n_pre, boot_is_control], - Y_boot[n_pre:, boot_is_control], - Y_boot_pre_t_mean, Y_boot_post_t_mean, - boot_omega, lam, - ) - py_estimates.append(tau) - se_python = float(np.std(py_estimates, ddof=1)) - - ratio = se_rust / se_python if se_python > 0 else float('inf') - assert 0.5 < ratio < 2.0, \ - f"Bootstrap SE ratio {ratio:.2f}: Rust={se_rust:.4f}, Python={se_python:.4f}" - - def test_full_sdid_fit_uses_rust_variance(self): - """SyntheticDiD.fit() uses Rust variance when available and produces valid results.""" - import pandas as pd - from diff_diff import SyntheticDiD - - np.random.seed(42) - n_control, n_treated, n_pre, n_post = 8, 2, 6, 2 - true_effect = 3.0 - data = [] - for i in range(n_control + n_treated): - is_treated = i >= n_control - for t in range(n_pre + n_post): - post = t >= n_pre - y = 1.0 + 0.5 * i + 0.3 * t + np.random.randn() * 0.3 - if is_treated and post: - y += true_effect - data.append({ - 'unit': i, 'time': t, 'outcome': y, - 'treated': 1 if is_treated else 0, - }) - df = pd.DataFrame(data) - post_periods = list(range(n_pre, n_pre + n_post)) - - # Fit with placebo variance (uses Rust) - sdid = SyntheticDiD(variance_method="placebo", n_bootstrap=50, seed=42) - results = sdid.fit(df, 'outcome', 'treated', 'unit', 'time', post_periods) - - assert np.isfinite(results.att), f"ATT should be finite: {results.att}" - assert np.isfinite(results.se), f"SE should be finite: {results.se}" - assert results.se > 0, f"SE should be > 0: {results.se}" - - # Fit with bootstrap variance (uses Rust) - sdid_boot = SyntheticDiD(variance_method="bootstrap", n_bootstrap=50, seed=42) - results_boot = sdid_boot.fit(df, 'outcome', 'treated', 'unit', 'time', post_periods) - - assert np.isfinite(results_boot.att), f"Bootstrap ATT should be finite: {results_boot.att}" - assert np.isfinite(results_boot.se), f"Bootstrap SE should be finite: {results_boot.se}" - assert results_boot.se > 0, f"Bootstrap SE should be > 0: {results_boot.se}" - - class TestFallbackWhenNoRust: """Test that pure Python fallback works when Rust is unavailable."""