diff --git a/README.md b/README.md index f6d103b1..571753d2 100644 --- a/README.md +++ b/README.md @@ -1983,7 +1983,7 @@ TROP( max_iter=100, # Max iterations for factor estimation tol=1e-6, # Convergence tolerance alpha=0.05, # Significance level for CIs - n_bootstrap=200, # Bootstrap replications + n_bootstrap=200, # Bootstrap replications (minimum 2; TROP requires bootstrap for SEs) seed=None # Random seed ) ``` @@ -2064,8 +2064,6 @@ SunAbraham( | `time` | str | Time period column | | `first_treat` | str | Column with first treatment period (0 for never-treated) | | `covariates` | list | Covariate column names | -| `min_pre_periods` | int | Minimum pre-treatment periods to include | -| `min_post_periods` | int | Minimum post-treatment periods to include | ### SunAbrahamResults @@ -2105,6 +2103,7 @@ ImputationDiD( alpha=0.05, # Significance level for CIs cluster=None, # Column for cluster-robust SEs n_bootstrap=0, # Bootstrap iterations (0 = analytical) + bootstrap_weights='rademacher', # 'rademacher', 'mammen', or 'webb' seed=None, # Random seed rank_deficient_action='warn', # 'warn', 'error', or 'silent' horizon_max=None, # Max event-study horizon @@ -2159,6 +2158,7 @@ TwoStageDiD( alpha=0.05, # Significance level for CIs cluster=None, # Column for cluster-robust SEs (defaults to unit) n_bootstrap=0, # Bootstrap iterations (0 = analytical GMM SEs) + bootstrap_weights='rademacher', # 'rademacher', 'mammen', or 'webb' seed=None, # Random seed rank_deficient_action='warn', # 'warn', 'error', or 'silent' horizon_max=None, # Max event-study horizon diff --git a/TODO.md b/TODO.md index 37485884..7d4b8ea5 100644 --- a/TODO.md +++ b/TODO.md @@ -28,7 +28,7 @@ Target: < 1000 lines per module for maintainability. | ~~`trop.py`~~ | ~~2904~~ ~2560 | ✅ Partially split: results extracted to `trop_results.py` (~340 lines) | | ~~`imputation.py`~~ | ~~2480~~ ~1740 | ✅ Split into imputation.py, imputation_results.py, imputation_bootstrap.py | | ~~`two_stage.py`~~ | ~~2209~~ ~1490 | ✅ Split into two_stage.py, two_stage_results.py, two_stage_bootstrap.py | -| `utils.py` | 1879 | Monitor -- legacy placebo functions stay to avoid circular imports | +| `utils.py` | 1780 | Monitor -- legacy placebo function removed | | `visualization.py` | 1678 | Monitor -- growing but cohesive | | `linalg.py` | 1537 | Monitor -- unified backend, splitting would hurt cohesion | | `honest_did.py` | 1511 | Acceptable | @@ -58,18 +58,18 @@ Deferred items from PR reviews that were not addressed before merge. | Issue | Location | PR | Priority | |-------|----------|----|----------| -| TwoStageDiD & ImputationDiD bootstrap hardcodes Rademacher only; no `bootstrap_weights` parameter unlike CallawaySantAnna | `two_stage_bootstrap.py`, `imputation_bootstrap.py` | #156, #141 | Medium | -| TwoStageDiD GMM score logic duplicated between analytic/bootstrap with inconsistent NaN/overflow handling | `two_stage.py`, `two_stage_bootstrap.py` | #156 | Medium | -| ImputationDiD weight construction duplicated between aggregation and bootstrap (drift risk) -- has explicit code comment acknowledging duplication | `imputation.py`, `imputation_bootstrap.py` | #141 | Medium | -| ImputationDiD dense `(A0'A0).toarray()` scales O((U+T+K)^2), OOM risk on large panels | `imputation.py` | #141 | Medium | +| ~~TwoStageDiD & ImputationDiD bootstrap hardcodes Rademacher only; no `bootstrap_weights` parameter unlike CallawaySantAnna~~ | ~~`two_stage_bootstrap.py`, `imputation_bootstrap.py`~~ | ~~#156, #141~~ | ✅ Fixed: Added `bootstrap_weights` parameter to both estimators | +| ~~TwoStageDiD GMM score logic duplicated between analytic/bootstrap with inconsistent NaN/overflow handling~~ | ~~`two_stage.py`, `two_stage_bootstrap.py`~~ | ~~#156~~ | ✅ Fixed: Unified via `_compute_gmm_scores()` static method | +| ~~ImputationDiD weight construction duplicated between aggregation and bootstrap (drift risk)~~ | ~~`imputation.py`, `imputation_bootstrap.py`~~ | ~~#141~~ | ✅ Fixed: Extracted `_compute_target_weights()` helper in `imputation_bootstrap.py` | +| ImputationDiD dense `(A0'A0).toarray()` scales O((U+T+K)^2), OOM risk on large panels | `imputation.py` | #141 | Medium (deferred — only triggers when sparse solver fails; fixing requires sparse least-squares alternatives) | #### Performance | Issue | Location | PR | Priority | |-------|----------|----|----------| -| TwoStageDiD per-column `.toarray()` in loop for cluster scores | `two_stage_bootstrap.py` | #156 | Medium | +| ~~TwoStageDiD per-column `.toarray()` in loop for cluster scores~~ | ~~`two_stage_bootstrap.py`~~ | ~~#156~~ | ✅ Fixed: Single `.toarray()` call replaces per-column loop | | ImputationDiD event-study SEs recompute full conservative variance per horizon (should cache A0/A1 factorization) | `imputation.py` | #141 | Low | -| Legacy `compute_placebo_effects` uses deprecated projected-gradient weights (marked deprecated, users directed to `SyntheticDiD`) | `utils.py:1689-1691` | #145 | Low | +| ~~Legacy `compute_placebo_effects` uses deprecated projected-gradient weights~~ | ~~`utils.py:1689-1691`~~ | ~~#145~~ | ✅ Fixed: Removed function entirely | | Rust faer SVD ndarray-to-faer conversion overhead (minimal vs SVD cost) | `rust/src/linalg.rs:67` | #115 | Low | #### Testing/Docs @@ -78,11 +78,11 @@ Deferred items from PR reviews that were not addressed before merge. |-------|----------|----|----------| | Tutorial notebooks not executed in CI | `docs/tutorials/*.ipynb` | #159 | Low | | ~~TwoStageDiD `test_nan_propagation` is a no-op~~ | ~~`tests/test_two_stage.py:643-652`~~ | ~~#156~~ | ✅ Fixed | -| ImputationDiD bootstrap + covariate path untested | `tests/test_imputation.py` | #141 | Low | -| TROP `n_bootstrap >= 2` validation missing (can yield 0/NaN SE silently) | `trop.py:462` | #124 | Low | -| SunAbraham deprecated `min_pre_periods`/`min_post_periods` still in `fit()` docstring | `sun_abraham.py:458-487` | #153 | Low | +| ~~ImputationDiD bootstrap + covariate path untested~~ | ~~`tests/test_imputation.py`~~ | ~~#141~~ | ✅ Fixed: Added `test_bootstrap_with_covariates` | +| ~~TROP `n_bootstrap >= 2` validation missing (can yield 0/NaN SE silently)~~ | ~~`trop.py:462`~~ | ~~#124~~ | ✅ Fixed: Added `ValueError` for `n_bootstrap < 2` | +| ~~SunAbraham deprecated `min_pre_periods`/`min_post_periods` still in `fit()` docstring~~ | ~~`sun_abraham.py:458-487`~~ | ~~#153~~ | ✅ Fixed: Removed deprecated params from `fit()` | | R comparison tests spawn separate `Rscript` per test (slow CI) | `tests/test_methodology_twfe.py:294` | #139 | Low | -| Rust TROP bootstrap SE returns 0.0 instead of NaN for <2 samples | `rust/src/trop.rs:1038-1054` | #115 | Low | +| ~~Rust TROP bootstrap SE returns 0.0 instead of NaN for <2 samples~~ | ~~`rust/src/trop.rs:1038-1054`~~ | ~~#115~~ | ✅ Already fixed: Returns `f64::NAN` at `rust/src/trop.rs:1034` | --- diff --git a/diff_diff/imputation.py b/diff_diff/imputation.py index ae651efe..e9943ecb 100644 --- a/diff_diff/imputation.py +++ b/diff_diff/imputation.py @@ -22,7 +22,7 @@ from scipy import sparse, stats from scipy.sparse.linalg import spsolve -from diff_diff.imputation_bootstrap import ImputationDiDBootstrapMixin +from diff_diff.imputation_bootstrap import ImputationDiDBootstrapMixin, _compute_target_weights from diff_diff.imputation_results import ImputationBootstrapResults, ImputationDiDResults # noqa: F401 (re-export) from diff_diff.linalg import solve_ols from diff_diff.utils import safe_inference @@ -63,6 +63,8 @@ class ImputationDiD(ImputationDiDBootstrapMixin): n_bootstrap : int, default=0 Number of bootstrap iterations. If 0, uses analytical inference (conservative variance from Theorem 3). + bootstrap_weights : str, default="rademacher" + Type of bootstrap weights: "rademacher", "mammen", or "webb". seed : int, optional Random seed for reproducibility. rank_deficient_action : str, default="warn" @@ -126,6 +128,7 @@ def __init__( alpha: float = 0.05, cluster: Optional[str] = None, n_bootstrap: int = 0, + bootstrap_weights: str = "rademacher", seed: Optional[int] = None, rank_deficient_action: str = "warn", horizon_max: Optional[int] = None, @@ -136,6 +139,11 @@ def __init__( f"rank_deficient_action must be 'warn', 'error', or 'silent', " f"got '{rank_deficient_action}'" ) + if bootstrap_weights not in ("rademacher", "mammen", "webb"): + raise ValueError( + f"bootstrap_weights must be 'rademacher', 'mammen', or 'webb', " + f"got '{bootstrap_weights}'" + ) if aux_partition not in ("cohort_horizon", "cohort", "horizon"): raise ValueError( f"aux_partition must be 'cohort_horizon', 'cohort', or 'horizon', " @@ -146,6 +154,7 @@ def __init__( self.alpha = alpha self.cluster = cluster self.n_bootstrap = n_bootstrap + self.bootstrap_weights = bootstrap_weights self.seed = seed self.rank_deficient_action = rank_deficient_action self.horizon_max = horizon_max @@ -1359,15 +1368,7 @@ def _aggregate_event_study( effect = float(np.mean(valid_tau)) # Compute SE via conservative variance with horizon-specific weights - weights_h = np.zeros(int(omega_1_mask.sum())) - # Map h_mask (relative to df_1) to weights array - h_indices_in_omega1 = np.where(h_mask)[0] - n_valid = len(valid_tau) - # Only weight valid (finite) observations - finite_mask = np.isfinite(tau_hat[h_mask]) - valid_h_indices = h_indices_in_omega1[finite_mask] - for idx in valid_h_indices: - weights_h[idx] = 1.0 / n_valid + weights_h, n_valid = _compute_target_weights(tau_hat, h_mask) se = self._compute_conservative_variance( df=df, @@ -1477,12 +1478,7 @@ def _aggregate_group( effect = float(np.mean(valid_tau)) # Compute SE with group-specific weights - weights_g = np.zeros(int(omega_1_mask.sum())) - finite_mask = np.isfinite(tau_hat) & g_mask - g_indices = np.where(finite_mask)[0] - n_valid = len(valid_tau) - for idx in g_indices: - weights_g[idx] = 1.0 / n_valid + weights_g, _ = _compute_target_weights(tau_hat, g_mask) se = self._compute_conservative_variance( df=df, @@ -1664,6 +1660,7 @@ def get_params(self) -> Dict[str, Any]: "alpha": self.alpha, "cluster": self.cluster, "n_bootstrap": self.n_bootstrap, + "bootstrap_weights": self.bootstrap_weights, "seed": self.seed, "rank_deficient_action": self.rank_deficient_action, "horizon_max": self.horizon_max, diff --git a/diff_diff/imputation_bootstrap.py b/diff_diff/imputation_bootstrap.py index 7d9f1e3d..4524cfb8 100644 --- a/diff_diff/imputation_bootstrap.py +++ b/diff_diff/imputation_bootstrap.py @@ -19,6 +19,39 @@ ] +def _compute_target_weights( + tau_hat: np.ndarray, + target_mask: np.ndarray, +) -> "tuple[np.ndarray, int]": + """ + Equal weights for finite tau_hat observations within target_mask. + + Used by both aggregation and bootstrap paths to avoid weight logic + duplication. + + Parameters + ---------- + tau_hat : np.ndarray + Per-observation treatment effects (may contain NaN). + target_mask : np.ndarray + Boolean mask selecting the target subset within tau_hat. + + Returns + ------- + weights : np.ndarray + Weight array (same length as tau_hat). 1/n_valid for finite + observations in target_mask, 0 elsewhere. + n_valid : int + Number of finite observations in the target subset. + """ + finite_target = np.isfinite(tau_hat) & target_mask + n_valid = int(finite_target.sum()) + weights = np.zeros(len(tau_hat)) + if n_valid > 0: + weights[np.where(finite_target)[0]] = 1.0 / n_valid + return weights, n_valid + + class ImputationDiDBootstrapMixin: """Mixin providing bootstrap inference methods for ImputationDiD.""" @@ -91,7 +124,8 @@ def _precompute_bootstrap_psi( For each aggregation target (overall, per-horizon, per-group), computes psi_i = sum_t v_it * epsilon_tilde_it for each cluster. The multiplier - bootstrap then perturbs these psi sums with Rademacher weights. + bootstrap then perturbs these psi sums with multiplier weights + (rademacher/mammen/webb; configurable via ``bootstrap_weights``). Computational cost scales with the number of aggregation targets, since each target requires its own v_untreated computation (weight-dependent). @@ -120,13 +154,10 @@ def _precompute_bootstrap_psi( result["overall"] = (overall_psi, cluster_ids) # Event study: per-horizon weights - # NOTE: weight logic duplicated from _aggregate_event_study. - # If weight scheme changes there, update here too. if event_study_effects: result["event_study"] = {} df_1 = df.loc[omega_1_mask] rel_times = df_1["_rel_time"].values - n_omega_1 = int(omega_1_mask.sum()) # Balanced cohort mask (same logic as _aggregate_event_study) balanced_mask = None @@ -150,24 +181,18 @@ def _precompute_bootstrap_psi( h_mask = rel_times == h if balanced_mask is not None: h_mask = h_mask & balanced_mask - weights_h = np.zeros(n_omega_1) - finite_h = np.isfinite(tau_hat) & h_mask - n_valid_h = int(finite_h.sum()) + weights_h, n_valid_h = _compute_target_weights(tau_hat, h_mask) if n_valid_h == 0: continue - weights_h[np.where(finite_h)[0]] = 1.0 / n_valid_h psi_h, _ = self._compute_cluster_psi_sums(**common, weights=weights_h) result["event_study"][h] = psi_h # Group effects: per-group weights - # NOTE: weight logic duplicated from _aggregate_group. - # If weight scheme changes there, update here too. if group_effects: result["group"] = {} df_1 = df.loc[omega_1_mask] cohorts = df_1[first_treat].values - n_omega_1 = int(omega_1_mask.sum()) for g in group_effects: if group_effects[g].get("n_obs", 0) == 0: @@ -175,12 +200,9 @@ def _precompute_bootstrap_psi( if not np.isfinite(group_effects[g].get("effect", np.nan)): continue g_mask = cohorts == g - weights_g = np.zeros(n_omega_1) - finite_g = np.isfinite(tau_hat) & g_mask - n_valid_g = int(finite_g.sum()) + weights_g, n_valid_g = _compute_target_weights(tau_hat, g_mask) if n_valid_g == 0: continue - weights_g[np.where(finite_g)[0]] = 1.0 / n_valid_g psi_g, _ = self._compute_cluster_psi_sums(**common, weights=weights_g) result["group"][g] = psi_g @@ -197,7 +219,8 @@ def _run_bootstrap( """ Run multiplier bootstrap on pre-computed influence function sums. - Uses T_b = sum_i w_b_i * psi_i where w_b_i are Rademacher weights + Uses T_b = sum_i w_b_i * psi_i where w_b_i are multiplier weights + (rademacher/mammen/webb; configurable via ``bootstrap_weights``) and psi_i are cluster-level influence function sums from Theorem 3. SE = std(T_b, ddof=1). """ @@ -216,7 +239,7 @@ def _run_bootstrap( # Generate ALL weights upfront: shape (n_bootstrap, n_clusters) all_weights = _generate_bootstrap_weights_batch( - self.n_bootstrap, n_clusters, "rademacher", rng + self.n_bootstrap, n_clusters, self.bootstrap_weights, rng ) # Overall ATT bootstrap draws @@ -295,7 +318,7 @@ def _run_bootstrap( return ImputationBootstrapResults( n_bootstrap=self.n_bootstrap, - weight_type="rademacher", + weight_type=self.bootstrap_weights, alpha=self.alpha, overall_att_se=overall_se, overall_att_ci=overall_ci, diff --git a/diff_diff/imputation_results.py b/diff_diff/imputation_results.py index 6a05a5d8..6520fca4 100644 --- a/diff_diff/imputation_results.py +++ b/diff_diff/imputation_results.py @@ -33,7 +33,7 @@ class ImputationBootstrapResults: n_bootstrap : int Number of bootstrap iterations. weight_type : str - Type of bootstrap weights (currently "rademacher" only). + Type of bootstrap weights: "rademacher", "mammen", or "webb". alpha : float Significance level used for confidence intervals. overall_att_se : float diff --git a/diff_diff/sun_abraham.py b/diff_diff/sun_abraham.py index 7f84fec7..e9c0bfbd 100644 --- a/diff_diff/sun_abraham.py +++ b/diff_diff/sun_abraham.py @@ -433,8 +433,6 @@ def fit( time: str, first_treat: str, covariates: Optional[List[str]] = None, - min_pre_periods: int = 1, - min_post_periods: int = 1, ) -> SunAbrahamResults: """ Fit the Sun-Abraham estimator using saturated regression. @@ -454,10 +452,6 @@ def fit( Use 0 (or np.inf) for never-treated units. covariates : list, optional List of covariate column names to include in regression. - min_pre_periods : int, default=1 - **Deprecated**: Accepted but ignored. Will be removed in a future version. - min_post_periods : int, default=1 - **Deprecated**: Accepted but ignored. Will be removed in a future version. Returns ------- @@ -469,22 +463,6 @@ def fit( ValueError If required columns are missing or data validation fails. """ - # Deprecation warnings for unimplemented parameters - if min_pre_periods != 1: - warnings.warn( - "min_pre_periods is not yet implemented and will be ignored. " - "This parameter will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - if min_post_periods != 1: - warnings.warn( - "min_post_periods is not yet implemented and will be ignored. " - "This parameter will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - # Validate inputs required_cols = [outcome, unit, time, first_treat] if covariates: diff --git a/diff_diff/trop.py b/diff_diff/trop.py index 1059cca0..41f02d04 100644 --- a/diff_diff/trop.py +++ b/diff_diff/trop.py @@ -93,7 +93,7 @@ class TROP: alpha : float, default=0.05 Significance level for confidence intervals. n_bootstrap : int, default=200 - Number of bootstrap replications for variance estimation. + Number of bootstrap replications for variance estimation. Must be >= 2. seed : int, optional Random seed for reproducibility. @@ -156,6 +156,12 @@ def __init__( self.lambda_unit_grid = lambda_unit_grid or [0.0, 0.1, 0.5, 1.0, 2.0, 5.0] self.lambda_nn_grid = lambda_nn_grid or [0.0, 0.01, 0.1, 1.0, 10.0] + if n_bootstrap < 2: + raise ValueError( + "n_bootstrap must be >= 2 for TROP (bootstrap variance " + "estimation is always used)" + ) + self.max_iter = max_iter self.tol = tol self.alpha = alpha diff --git a/diff_diff/two_stage.py b/diff_diff/two_stage.py index 5e03dfb6..ab1f2297 100644 --- a/diff_diff/two_stage.py +++ b/diff_diff/two_stage.py @@ -29,6 +29,11 @@ from scipy import sparse from scipy.sparse.linalg import factorized as sparse_factorized +# Maximum number of elements before falling back to per-column sparse aggregation. +# 10M float64 elements ≈ 80 MB peak allocation. Above this, per-column .getcol() +# trades throughput for bounded memory. Keep in sync with two_stage_bootstrap.py. +_SPARSE_DENSE_THRESHOLD = 10_000_000 + from diff_diff.linalg import solve_ols from diff_diff.two_stage_bootstrap import TwoStageDiDBootstrapMixin from diff_diff.two_stage_results import TwoStageBootstrapResults, TwoStageDiDResults # noqa: F401 (re-export) @@ -67,6 +72,8 @@ class TwoStageDiD(TwoStageDiDBootstrapMixin): n_bootstrap : int, default=0 Number of bootstrap iterations. If 0, uses analytical GMM sandwich inference. + bootstrap_weights : str, default="rademacher" + Type of bootstrap weights: "rademacher", "mammen", or "webb". seed : int, optional Random seed for reproducibility. rank_deficient_action : str, default="warn" @@ -125,6 +132,7 @@ def __init__( alpha: float = 0.05, cluster: Optional[str] = None, n_bootstrap: int = 0, + bootstrap_weights: str = "rademacher", seed: Optional[int] = None, rank_deficient_action: str = "warn", horizon_max: Optional[int] = None, @@ -134,11 +142,17 @@ def __init__( f"rank_deficient_action must be 'warn', 'error', or 'silent', " f"got '{rank_deficient_action}'" ) + if bootstrap_weights not in ("rademacher", "mammen", "webb"): + raise ValueError( + f"bootstrap_weights must be 'rademacher', 'mammen', or 'webb', " + f"got '{bootstrap_weights}'" + ) self.anticipation = anticipation self.alpha = alpha self.cluster = cluster self.n_bootstrap = n_bootstrap + self.bootstrap_weights = bootstrap_weights self.seed = seed self.rank_deficient_action = rank_deficient_action self.horizon_max = horizon_max @@ -1065,6 +1079,41 @@ def _stage2_group( return group_effects + # ========================================================================= + # GMM score computation + # ========================================================================= + + @staticmethod + def _compute_gmm_scores( + c_by_cluster: np.ndarray, + gamma_hat: np.ndarray, + s2_by_cluster: np.ndarray, + ) -> np.ndarray: + """ + Compute per-cluster GMM scores S_g = gamma_hat' c_g - X'_{2g} eps_{2g}. + + Handles NaN/overflow from rank-deficient FE by wrapping in errstate + and replacing non-finite values with 0. + + Parameters + ---------- + c_by_cluster : np.ndarray, shape (G, p) + Per-cluster Stage 1 scores. + gamma_hat : np.ndarray, shape (p, k) + Cross-moment correction matrix. + s2_by_cluster : np.ndarray, shape (G, k) + Per-cluster Stage 2 scores. + + Returns + ------- + np.ndarray, shape (G, k) + Per-cluster influence scores. + """ + with np.errstate(invalid="ignore", divide="ignore", over="ignore"): + correction = np.dot(c_by_cluster, gamma_hat) + np.nan_to_num(correction, copy=False, nan=0.0, posinf=0.0, neginf=0.0) + return correction - s2_by_cluster + # ========================================================================= # GMM Sandwich Variance (Butts & Gardner 2022) # ========================================================================= @@ -1178,12 +1227,19 @@ def _compute_gmm_variance( unique_clusters, cluster_indices = np.unique(cluster_ids, return_inverse=True) G = len(unique_clusters) - # Aggregate sparse rows by cluster using column-wise np.add.at - weighted_X10_csc = weighted_X10.tocsc() + n_elements = weighted_X10.shape[0] * weighted_X10.shape[1] c_by_cluster = np.zeros((G, p)) - for j_col in range(p): - col_data = weighted_X10_csc.getcol(j_col).toarray().ravel() - np.add.at(c_by_cluster[:, j_col], cluster_indices, col_data) + if n_elements > _SPARSE_DENSE_THRESHOLD: + # Per-column path: limits peak memory for large FE matrices + weighted_X10_csc = weighted_X10.tocsc() + for j_col in range(p): + col_data = weighted_X10_csc.getcol(j_col).toarray().ravel() + np.add.at(c_by_cluster[:, j_col], cluster_indices, col_data) + else: + # Dense path: faster for moderate-size matrices + weighted_X10_dense = weighted_X10.toarray() + for j_col in range(p): + np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col]) # 3. Per-cluster Stage 2 scores: X'_{2g} eps_{2g} weighted_X2 = X_2 * eps_2[:, None] # (n x k) dense @@ -1192,11 +1248,7 @@ def _compute_gmm_variance( np.add.at(s2_by_cluster[:, j_col], cluster_indices, weighted_X2[:, j_col]) # 4. S_g = gamma_hat' c_g - X'_{2g} eps_{2g} - with np.errstate(invalid="ignore", divide="ignore", over="ignore"): - correction = np.dot(c_by_cluster, gamma_hat) # (G x p) @ (p x k) = (G x k) - # Replace NaN/inf from overflow (rank-deficient FE) with 0 - np.nan_to_num(correction, copy=False, nan=0.0, posinf=0.0, neginf=0.0) - S = correction - s2_by_cluster # (G x k) + S = self._compute_gmm_scores(c_by_cluster, gamma_hat, s2_by_cluster) # 5. Meat: sum_g S_g S'_g = S' S with np.errstate(invalid="ignore", over="ignore"): @@ -1304,6 +1356,7 @@ def get_params(self) -> Dict[str, Any]: "alpha": self.alpha, "cluster": self.cluster, "n_bootstrap": self.n_bootstrap, + "bootstrap_weights": self.bootstrap_weights, "seed": self.seed, "rank_deficient_action": self.rank_deficient_action, "horizon_max": self.horizon_max, diff --git a/diff_diff/two_stage_bootstrap.py b/diff_diff/two_stage_bootstrap.py index f33a5501..51d0cd63 100644 --- a/diff_diff/two_stage_bootstrap.py +++ b/diff_diff/two_stage_bootstrap.py @@ -15,6 +15,9 @@ from diff_diff.linalg import solve_ols from diff_diff.staggered_bootstrap import _generate_bootstrap_weights_batch +# Maximum number of elements before falling back to per-column sparse aggregation. +# Keep in sync with two_stage.py. +_SPARSE_DENSE_THRESHOLD = 10_000_000 from diff_diff.two_stage_results import TwoStageBootstrapResults __all__ = [ @@ -106,19 +109,26 @@ def _compute_cluster_S_scores( unique_clusters, cluster_indices = np.unique(cluster_ids, return_inverse=True) G = len(unique_clusters) - weighted_X10_csc = weighted_X10.tocsc() + n_elements = weighted_X10.shape[0] * weighted_X10.shape[1] c_by_cluster = np.zeros((G, p)) - for j_col in range(p): - col_data = weighted_X10_csc.getcol(j_col).toarray().ravel() - np.add.at(c_by_cluster[:, j_col], cluster_indices, col_data) + if n_elements > _SPARSE_DENSE_THRESHOLD: + # Per-column path: limits peak memory for large FE matrices + weighted_X10_csc = weighted_X10.tocsc() + for j_col in range(p): + col_data = weighted_X10_csc.getcol(j_col).toarray().ravel() + np.add.at(c_by_cluster[:, j_col], cluster_indices, col_data) + else: + # Dense path: faster for moderate-size matrices + weighted_X10_dense = weighted_X10.toarray() + for j_col in range(p): + np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col]) weighted_X2 = X_2 * eps_2[:, None] s2_by_cluster = np.zeros((G, k)) for j_col in range(k): np.add.at(s2_by_cluster[:, j_col], cluster_indices, weighted_X2[:, j_col]) - correction = np.dot(c_by_cluster, gamma_hat) - S = correction - s2_by_cluster + S = self._compute_gmm_scores(c_by_cluster, gamma_hat, s2_by_cluster) # Bread XtX_2 = np.dot(X_2.T, X_2) @@ -201,7 +211,7 @@ def _run_bootstrap( n_clusters = len(unique_clusters) all_weights = _generate_bootstrap_weights_batch( - self.n_bootstrap, n_clusters, "rademacher", rng + self.n_bootstrap, n_clusters, self.bootstrap_weights, rng ) # T_b = bread @ (sum_g w_bg * S_g) = bread @ (W @ S)' per boot @@ -385,7 +395,7 @@ def _run_bootstrap( return TwoStageBootstrapResults( n_bootstrap=self.n_bootstrap, - weight_type="rademacher", + weight_type=self.bootstrap_weights, alpha=self.alpha, overall_att_se=overall_se, overall_att_ci=overall_ci, diff --git a/diff_diff/two_stage_results.py b/diff_diff/two_stage_results.py index a1277726..16b4916a 100644 --- a/diff_diff/two_stage_results.py +++ b/diff_diff/two_stage_results.py @@ -34,7 +34,7 @@ class TwoStageBootstrapResults: n_bootstrap : int Number of bootstrap iterations. weight_type : str - Type of bootstrap weights (currently "rademacher" only). + Type of bootstrap weights: "rademacher", "mammen", or "webb". alpha : float Significance level used for confidence intervals. overall_att_se : float diff --git a/diff_diff/utils.py b/diff_diff/utils.py index 712af691..1b4b3c60 100644 --- a/diff_diff/utils.py +++ b/diff_diff/utils.py @@ -1643,104 +1643,6 @@ def compute_sdid_estimator( return float(tau) -def compute_placebo_effects( - Y_pre_control: np.ndarray, - Y_post_control: np.ndarray, - Y_pre_treated: np.ndarray, - unit_weights: np.ndarray, - time_weights: np.ndarray, - control_unit_ids: List[Any], - n_placebo: Optional[int] = None -) -> np.ndarray: - """ - Compute placebo treatment effects by treating each control as treated. - - Used for inference in synthetic DiD when bootstrap is not appropriate. - - Parameters - ---------- - Y_pre_control : np.ndarray - Control outcomes in pre-treatment periods, shape (n_pre, n_control). - Y_post_control : np.ndarray - Control outcomes in post-treatment periods, shape (n_post, n_control). - Y_pre_treated : np.ndarray - Treated outcomes in pre-treatment periods, shape (n_pre,). - unit_weights : np.ndarray - Unit weights, shape (n_control,). - time_weights : np.ndarray - Time weights, shape (n_pre,). - control_unit_ids : list - List of control unit identifiers. - n_placebo : int, optional - Number of placebo tests. If None, uses all control units. - - Returns - ------- - np.ndarray - Array of placebo treatment effects. - - Notes - ----- - For each control unit j, we pretend it was treated and compute - the SDID estimate using the remaining controls. The distribution - of these placebo effects provides a reference for inference. - - .. deprecated:: - This function uses legacy projected-gradient weights and fixed time - weights. Use ``SyntheticDiD(variance_method='placebo')`` instead. - """ - warnings.warn( - "compute_placebo_effects uses legacy projected-gradient weights and " - "fixed time weights, which does not match the SDID methodology " - "(Algorithm 4 requires re-estimating both omega and lambda per " - "placebo iteration). Use SyntheticDiD(variance_method='placebo') " - "for correct placebo inference. This function will be removed in a " - "future version.", - DeprecationWarning, - stacklevel=2, - ) - n_pre, n_control = Y_pre_control.shape - - if n_placebo is None: - n_placebo = n_control - - placebo_effects = [] - - for j in range(min(n_placebo, n_control)): - # Treat unit j as the "treated" unit - Y_pre_placebo_treated = Y_pre_control[:, j] - Y_post_placebo_treated = Y_post_control[:, j] - - # Use remaining units as controls - remaining_idx = [i for i in range(n_control) if i != j] - - if len(remaining_idx) == 0: - continue - - Y_pre_remaining = Y_pre_control[:, remaining_idx] - Y_post_remaining = Y_post_control[:, remaining_idx] - - # Recompute weights for remaining controls - remaining_weights = compute_synthetic_weights( - Y_pre_remaining, - Y_pre_placebo_treated - ) - - # Compute placebo effect - placebo_tau = compute_sdid_estimator( - Y_pre_remaining, - Y_post_remaining, - Y_pre_placebo_treated, - Y_post_placebo_treated, - remaining_weights, - time_weights - ) - - placebo_effects.append(placebo_tau) - - return np.asarray(placebo_effects) - - def demean_by_group( data: pd.DataFrame, variables: List[str], diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 08116c96..59e54d02 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -427,7 +427,7 @@ where weights ŵ_{g,e} = n_{g,e} / Σ_g n_{g,e} (sample share of cohort g at eve - Extrapolation beyond observed event-times: not estimated - Event-time range: no artificial cap (estimates all available relative times, matching R's `fixest::sunab()`) - No post-treatment effects: returns `(NaN, NaN)` for overall ATT/SE; all inference fields (t_stat, p_value, conf_int) propagate NaN via `np.isfinite()` guards -- `min_pre_periods`/`min_post_periods` parameters: deprecated (accepted but ignored, emit `FutureWarning`) +- `min_pre_periods`/`min_post_periods` parameters: removed (previously deprecated with `FutureWarning`; callers passing these will now get `TypeError`) - Variance fallback: when full weight vector cannot be constructed for overall ATT SE, uses simplified variance (ignores covariances between periods) with `UserWarning` - Rank-deficient design matrix (covariate collinearity): - Detection: Pivoted QR decomposition with tolerance `1e-07` (R's `qr()` default) @@ -545,7 +545,7 @@ Y_it = alpha_i + beta_t [+ X'_it * delta] + W'_it * gamma + epsilon_it - **treatment_effects DataFrame weights:** `weight` column uses `1/n_valid` for finite tau_hat and 0 for NaN tau_hat, consistent with the ATT estimand. - **Rank-deficient covariates in variance:** Covariates with NaN coefficients (dropped for rank deficiency in Step 1) are excluded from the variance design matrices `A_0`/`A_1`. Only covariates with finite coefficients participate in the `v_it` projection. - **Sparse variance solver:** `_compute_v_untreated_with_covariates` uses `scipy.sparse.linalg.spsolve` to solve `(A_0'A_0) z = A_1'w` without densifying the normal equations matrix. Falls back to dense `lstsq` if the sparse solver fails. -- **Bootstrap inference:** Uses multiplier bootstrap on the Theorem 3 influence function: `psi_i = sum_t v_it * epsilon_tilde_it`. Cluster-level psi sums are pre-computed for each aggregation target (overall, per-horizon, per-group), then perturbed with Rademacher weights. This is a library extension (not in the paper) consistent with CallawaySantAnna/SunAbraham bootstrap patterns. +- **Bootstrap inference:** Uses multiplier bootstrap on the Theorem 3 influence function: `psi_i = sum_t v_it * epsilon_tilde_it`. Cluster-level psi sums are pre-computed for each aggregation target (overall, per-horizon, per-group), then perturbed with multiplier weights (Rademacher by default; configurable via `bootstrap_weights` parameter to use Mammen or Webb weights, matching CallawaySantAnna). This is a library extension (not in the paper) consistent with CallawaySantAnna/SunAbraham bootstrap patterns. - **Auxiliary residuals (Equation 8):** Uses v_it-weighted tau_tilde_g formula: `tau_tilde_g = sum(v_it * tau_hat_it) / sum(v_it)` within each partition group. Zero-weight groups (common in event-study SE computation) fall back to unweighted mean. **Reference implementation(s):** @@ -610,7 +610,7 @@ where `psi_i` is the stacked influence function for unit i across all its observ *Bootstrap:* -Our implementation uses multiplier bootstrap on the GMM influence function: cluster-level `psi` sums are pre-computed, then perturbed with Rademacher weights. The R `did2s` package defaults to block bootstrap (resampling clusters with replacement). Both approaches are asymptotically valid; the multiplier bootstrap is computationally cheaper and consistent with the CallawaySantAnna/ImputationDiD bootstrap patterns in this library. +Our implementation uses multiplier bootstrap on the GMM influence function: cluster-level `psi` sums are pre-computed, then perturbed with multiplier weights (Rademacher by default; configurable via `bootstrap_weights` parameter to use Mammen or Webb weights, matching CallawaySantAnna). The R `did2s` package defaults to block bootstrap (resampling clusters with replacement). Both approaches are asymptotically valid; the multiplier bootstrap is computationally cheaper and consistent with the CallawaySantAnna/ImputationDiD bootstrap patterns in this library. *Edge cases:* - **Always-treated units:** Units treated in all observed periods have no untreated observations for Stage 1 FE estimation. These are excluded with a warning listing the affected unit IDs. Their treated observations do NOT contribute to Stage 2. @@ -941,6 +941,7 @@ Q(λ) = Σ_{j,s: D_js=0} [τ̂_js^loocv(λ)]² - **n_post_periods metadata**: Counts periods where D=1 is actually observed (at least one unit has D=1), not calendar periods from first treatment. In unbalanced panels where treated units are missing in some post-treatment periods, only periods with observed D=1 values are counted. - Wrong D specification: if user provides event-style D (only first treatment period), the absorbing-state validation will raise ValueError with helpful guidance +- **Bootstrap minimum**: `n_bootstrap` must be >= 2 (enforced via `ValueError`). TROP uses bootstrap for all variance estimation — there is no analytical SE formula. - **LOOCV failure metadata**: When LOOCV fits fail in the Rust backend, the first failed observation coordinates (t, i) are returned to Python for informative warning messages - **Inference CI distribution**: After `safe_inference()` migration, CI uses t-distribution (df = max(1, n_treated_obs - 1)), consistent with p_value. Previously CI used normal-distribution while p_value used t-distribution (inconsistent). This is a minor behavioral change; CIs may be slightly wider for small n_treated_obs. diff --git a/tests/test_imputation.py b/tests/test_imputation.py index b2b9c1b8..5e2b46b7 100644 --- a/tests/test_imputation.py +++ b/tests/test_imputation.py @@ -425,6 +425,19 @@ def test_get_params(self): assert params["aux_partition"] == "cohort" assert params["cluster"] is None assert params["rank_deficient_action"] == "warn" + assert params["bootstrap_weights"] == "rademacher" + + def test_bootstrap_weights_in_get_set_params(self): + """bootstrap_weights should appear in get_params and be settable.""" + est = ImputationDiD(bootstrap_weights="mammen") + assert est.get_params()["bootstrap_weights"] == "mammen" + est.set_params(bootstrap_weights="webb") + assert est.bootstrap_weights == "webb" + + def test_bootstrap_weights_invalid_raises(self): + """Invalid bootstrap_weights value should raise ValueError.""" + with pytest.raises(ValueError, match="bootstrap_weights"): + ImputationDiD(bootstrap_weights="invalid") def test_set_params(self): """Test set_params modifies attributes.""" @@ -1076,6 +1089,97 @@ def test_bootstrap_percentile_ci(self, ci_params): np.testing.assert_allclose(br.overall_att_ci[0], expected_lower, rtol=1e-10) np.testing.assert_allclose(br.overall_att_ci[1], expected_upper, rtol=1e-10) + def test_bootstrap_weights_mammen(self, ci_params): + """Bootstrap with mammen weights should produce valid results.""" + data = generate_test_data() + n_boot = ci_params.bootstrap(50) + est = ImputationDiD(n_bootstrap=n_boot, bootstrap_weights="mammen", seed=42) + results = est.fit( + data, outcome="outcome", unit="unit", time="time", first_treat="first_treat", + ) + + br = results.bootstrap_results + assert br is not None + assert br.weight_type == "mammen" + assert br.overall_att_se > 0 + assert np.isfinite(br.overall_att_p_value) + + def test_bootstrap_weights_webb(self, ci_params): + """Bootstrap with webb weights should produce valid results.""" + data = generate_test_data() + n_boot = ci_params.bootstrap(50) + est = ImputationDiD(n_bootstrap=n_boot, bootstrap_weights="webb", seed=42) + results = est.fit( + data, outcome="outcome", unit="unit", time="time", first_treat="first_treat", + ) + + br = results.bootstrap_results + assert br is not None + assert br.weight_type == "webb" + assert br.overall_att_se > 0 + assert np.isfinite(br.overall_att_p_value) + + def test_bootstrap_weights_event_study(self, ci_params): + """Bootstrap with non-default weights should work for event study aggregation.""" + data = generate_test_data() + n_boot = ci_params.bootstrap(50) + est = ImputationDiD( + n_bootstrap=n_boot, bootstrap_weights="mammen", seed=42 + ) + results = est.fit( + data, outcome="outcome", unit="unit", time="time", + first_treat="first_treat", aggregate="event_study", + ) + + br = results.bootstrap_results + assert br is not None + assert br.weight_type == "mammen" + assert br.event_study_ses is not None + assert len(br.event_study_ses) > 0 + for h, se in br.event_study_ses.items(): + assert se > 0, f"Non-positive SE at horizon {h}" + + def test_bootstrap_weights_group(self, ci_params): + """Bootstrap with non-default weights should work for group aggregation.""" + data = generate_test_data() + n_boot = ci_params.bootstrap(50) + est = ImputationDiD( + n_bootstrap=n_boot, bootstrap_weights="mammen", seed=42 + ) + results = est.fit( + data, outcome="outcome", unit="unit", time="time", + first_treat="first_treat", aggregate="group", + ) + + br = results.bootstrap_results + assert br is not None + assert br.weight_type == "mammen" + assert br.group_ses is not None + assert len(br.group_ses) > 0 + for g, se in br.group_ses.items(): + assert se > 0, f"Non-positive SE for group {g}" + + def test_bootstrap_with_covariates(self, ci_params): + """Bootstrap should work with covariates.""" + data = generate_test_data() + # Add a covariate + rng = np.random.default_rng(123) + data["x1"] = rng.normal(0, 1, len(data)) + n_boot = ci_params.bootstrap(50) + est = ImputationDiD(n_bootstrap=n_boot, seed=42) + results = est.fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + covariates=["x1"], + ) + + assert results.bootstrap_results is not None + assert results.bootstrap_results.overall_att_se > 0 + assert np.isfinite(results.bootstrap_results.overall_att_p_value) + # ============================================================================= # TestImputationVsOtherEstimators diff --git a/tests/test_sun_abraham.py b/tests/test_sun_abraham.py index ce60fb0b..d450ede8 100644 --- a/tests/test_sun_abraham.py +++ b/tests/test_sun_abraham.py @@ -1178,28 +1178,6 @@ def test_no_post_effects_bootstrap_returns_nan(self, ci_params): f"Expected (NaN, NaN) overall_conf_int, got {results.overall_conf_int}" ) - def test_deprecated_min_pre_periods_warning(self): - """Test that min_pre_periods emits FutureWarning (Step 5c).""" - data = generate_staggered_data(seed=42) - - sa = SunAbraham(n_bootstrap=0) - with pytest.warns(FutureWarning, match="min_pre_periods"): - sa.fit( - data, outcome="outcome", unit="unit", time="time", - first_treat="first_treat", min_pre_periods=2, - ) - - def test_deprecated_min_post_periods_warning(self): - """Test that min_post_periods emits FutureWarning (Step 5c).""" - data = generate_staggered_data(seed=42) - - sa = SunAbraham(n_bootstrap=0) - with pytest.warns(FutureWarning, match="min_post_periods"): - sa.fit( - data, outcome="outcome", unit="unit", time="time", - first_treat="first_treat", min_post_periods=2, - ) - def test_event_time_no_truncation(self): """Test that event times beyond ±20 are estimated (Step 5d). @@ -1464,6 +1442,15 @@ def test_never_treated_inf_encoding(self): f"SE differs: inf={results_inf.overall_se}, zero={results_zero.overall_se}" ) + def test_removed_params_raise_typeerror(self): + """Removed min_pre_periods/min_post_periods raise TypeError.""" + data = generate_staggered_data(n_units=30, n_periods=6, seed=42) + sa = SunAbraham(n_bootstrap=0) + with pytest.raises(TypeError, match="unexpected keyword argument"): + sa.fit(data, "outcome", "unit", "time", "first_treat", min_pre_periods=2) + with pytest.raises(TypeError, match="unexpected keyword argument"): + sa.fit(data, "outcome", "unit", "time", "first_treat", min_post_periods=2) + def test_all_never_treated_inf_raises(self): """Test that all-never-treated data with np.inf encoding raises ValueError.""" data = generate_staggered_data(n_units=100, n_periods=10, n_cohorts=3, seed=42) diff --git a/tests/test_trop.py b/tests/test_trop.py index 1ee95da5..41a137f2 100644 --- a/tests/test_trop.py +++ b/tests/test_trop.py @@ -95,6 +95,13 @@ def simple_panel_data(): class TestTROP: """Tests for TROP estimator.""" + def test_n_bootstrap_less_than_2_raises(self): + """n_bootstrap < 2 should raise ValueError.""" + with pytest.raises(ValueError, match="n_bootstrap must be >= 2"): + TROP(n_bootstrap=1) + with pytest.raises(ValueError, match="n_bootstrap must be >= 2"): + TROP(n_bootstrap=0) + def test_basic_fit(self, simple_panel_data): """Test basic model fitting.""" trop_est = TROP( diff --git a/tests/test_two_stage.py b/tests/test_two_stage.py index 5450992e..ed311e00 100644 --- a/tests/test_two_stage.py +++ b/tests/test_two_stage.py @@ -907,6 +907,19 @@ def test_get_params(self): assert params["horizon_max"] == 5 assert params["rank_deficient_action"] == "warn" assert params["cluster"] is None + assert params["bootstrap_weights"] == "rademacher" + + def test_bootstrap_weights_in_get_set_params(self): + """bootstrap_weights should appear in get_params and be settable.""" + est = TwoStageDiD(bootstrap_weights="mammen") + assert est.get_params()["bootstrap_weights"] == "mammen" + est.set_params(bootstrap_weights="webb") + assert est.bootstrap_weights == "webb" + + def test_bootstrap_weights_invalid_raises(self): + """Invalid bootstrap_weights value should raise ValueError.""" + with pytest.raises(ValueError, match="bootstrap_weights"): + TwoStageDiD(bootstrap_weights="invalid") def test_set_params(self): """set_params should modify attributes.""" @@ -1047,6 +1060,76 @@ def test_bootstrap_event_study(self, ci_params): for h, se in results.bootstrap_results.event_study_ses.items(): assert se > 0 + def test_bootstrap_weights_mammen(self, ci_params): + """Bootstrap with mammen weights should produce valid results.""" + data = generate_test_data() + n_boot = ci_params.bootstrap(50) + results = TwoStageDiD( + n_bootstrap=n_boot, bootstrap_weights="mammen", seed=42 + ).fit( + data, outcome="outcome", unit="unit", time="time", first_treat="first_treat" + ) + + br = results.bootstrap_results + assert br is not None + assert br.weight_type == "mammen" + assert br.overall_att_se > 0 + assert np.isfinite(br.overall_att_p_value) + + def test_bootstrap_weights_webb(self, ci_params): + """Bootstrap with webb weights should produce valid results.""" + data = generate_test_data() + n_boot = ci_params.bootstrap(50) + results = TwoStageDiD( + n_bootstrap=n_boot, bootstrap_weights="webb", seed=42 + ).fit( + data, outcome="outcome", unit="unit", time="time", first_treat="first_treat" + ) + + br = results.bootstrap_results + assert br is not None + assert br.weight_type == "webb" + assert br.overall_att_se > 0 + assert np.isfinite(br.overall_att_p_value) + + def test_bootstrap_weights_event_study(self, ci_params): + """Bootstrap with non-default weights should work for event study aggregation.""" + data = generate_test_data() + n_boot = ci_params.bootstrap(50) + results = TwoStageDiD( + n_bootstrap=n_boot, bootstrap_weights="mammen", seed=42 + ).fit( + data, outcome="outcome", unit="unit", time="time", + first_treat="first_treat", aggregate="event_study", + ) + + br = results.bootstrap_results + assert br is not None + assert br.weight_type == "mammen" + assert br.event_study_ses is not None + assert len(br.event_study_ses) > 0 + for h, se in br.event_study_ses.items(): + assert se > 0, f"Non-positive SE at horizon {h}" + + def test_bootstrap_weights_group(self, ci_params): + """Bootstrap with non-default weights should work for group aggregation.""" + data = generate_test_data() + n_boot = ci_params.bootstrap(50) + results = TwoStageDiD( + n_bootstrap=n_boot, bootstrap_weights="mammen", seed=42 + ).fit( + data, outcome="outcome", unit="unit", time="time", + first_treat="first_treat", aggregate="group", + ) + + br = results.bootstrap_results + assert br is not None + assert br.weight_type == "mammen" + assert br.group_ses is not None + assert len(br.group_ses) > 0 + for g, se in br.group_ses.items(): + assert se > 0, f"Non-positive SE for group {g}" + # ============================================================================= # TestTwoStageDiDConvenience @@ -1116,3 +1199,35 @@ def test_print_summary(self, capsys): results.print_summary() captured = capsys.readouterr() assert "Two-Stage DiD" in captured.out + + def test_sparse_fallback_path(self): + """Size guard falls back to per-column path and produces same results.""" + import diff_diff.two_stage as ts_mod + + data = generate_test_data(n_units=50, n_periods=6, seed=42) + + # Run with normal (high) threshold — uses dense path + result_dense = TwoStageDiD().fit( + data, outcome="outcome", unit="unit", time="time", first_treat="first_treat" + ) + + # Patch threshold to 1 to force per-column path on all data + orig = ts_mod._SPARSE_DENSE_THRESHOLD + try: + ts_mod._SPARSE_DENSE_THRESHOLD = 1 + result_sparse = TwoStageDiD().fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + ) + finally: + ts_mod._SPARSE_DENSE_THRESHOLD = orig + + np.testing.assert_allclose( + result_dense.overall_att, result_sparse.overall_att, rtol=1e-10 + ) + np.testing.assert_allclose( + result_dense.overall_se, result_sparse.overall_se, rtol=1e-10 + ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 3d79846d..61a0ed80 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -23,7 +23,6 @@ check_parallel_trends_robust, compute_confidence_interval, compute_p_value, - compute_placebo_effects, compute_robust_se, compute_sdid_estimator, compute_synthetic_weights, @@ -1046,116 +1045,6 @@ def test_tighter_margin_harder_to_pass(self, parallel_trends_data): assert results_wide["tost_p_value"] <= results_tight["tost_p_value"] -# ============================================================================= -# Tests for compute_placebo_effects -# ============================================================================= - - -class TestComputePlaceboEffects: - """Tests for compute_placebo_effects function.""" - - def test_returns_correct_number_of_effects(self, sdid_panel_data): - """Test that function returns correct number of placebo effects.""" - # Prepare data matrices - control_data = sdid_panel_data[sdid_panel_data["treated"] == 0] - treated_data = sdid_panel_data[sdid_panel_data["treated"] == 1] - - control_pivot = control_data.pivot(index="period", columns="unit", values="outcome") - treated_mean = treated_data.groupby("period")["outcome"].mean() - - pre_periods = [0, 1, 2, 3, 4] - post_periods = [5, 6, 7] - - Y_pre_control = control_pivot.loc[pre_periods].values - Y_post_control = control_pivot.loc[post_periods].values - Y_pre_treated = treated_mean.loc[pre_periods].values - - # Compute weights - unit_weights = compute_synthetic_weights(Y_pre_control, Y_pre_treated) - time_weights = compute_time_weights(Y_pre_control, Y_post_control, zeta_lambda=1e-6) - - control_units = list(control_pivot.columns) - - with pytest.warns(DeprecationWarning, match="compute_placebo_effects uses legacy"): - placebo_effects = compute_placebo_effects( - Y_pre_control, - Y_post_control, - Y_pre_treated, - unit_weights, - time_weights, - control_units - ) - - # Should have one placebo effect per control unit - assert len(placebo_effects) == len(control_units) - - def test_placebo_effects_distribution(self, sdid_panel_data): - """Test that placebo effects form a reasonable distribution.""" - control_data = sdid_panel_data[sdid_panel_data["treated"] == 0] - treated_data = sdid_panel_data[sdid_panel_data["treated"] == 1] - - control_pivot = control_data.pivot(index="period", columns="unit", values="outcome") - treated_mean = treated_data.groupby("period")["outcome"].mean() - - pre_periods = [0, 1, 2, 3, 4] - post_periods = [5, 6, 7] - - Y_pre_control = control_pivot.loc[pre_periods].values - Y_post_control = control_pivot.loc[post_periods].values - Y_pre_treated = treated_mean.loc[pre_periods].values - - unit_weights = compute_synthetic_weights(Y_pre_control, Y_pre_treated) - time_weights = compute_time_weights(Y_pre_control, Y_post_control, zeta_lambda=1e-6) - - control_units = list(control_pivot.columns) - - with pytest.warns(DeprecationWarning, match="compute_placebo_effects uses legacy"): - placebo_effects = compute_placebo_effects( - Y_pre_control, - Y_post_control, - Y_pre_treated, - unit_weights, - time_weights, - control_units - ) - - # Placebo effects should be centered around zero (no treatment) - assert np.abs(np.mean(placebo_effects)) < 2.0 # Allow some deviation - - def test_n_placebo_limits_output(self, sdid_panel_data): - """Test that n_placebo parameter limits output size.""" - control_data = sdid_panel_data[sdid_panel_data["treated"] == 0] - treated_data = sdid_panel_data[sdid_panel_data["treated"] == 1] - - control_pivot = control_data.pivot(index="period", columns="unit", values="outcome") - treated_mean = treated_data.groupby("period")["outcome"].mean() - - pre_periods = [0, 1, 2, 3, 4] - post_periods = [5, 6, 7] - - Y_pre_control = control_pivot.loc[pre_periods].values - Y_post_control = control_pivot.loc[post_periods].values - Y_pre_treated = treated_mean.loc[pre_periods].values - - unit_weights = compute_synthetic_weights(Y_pre_control, Y_pre_treated) - time_weights = compute_time_weights(Y_pre_control, Y_post_control, zeta_lambda=1e-6) - - control_units = list(control_pivot.columns) - - with pytest.warns(DeprecationWarning, match="compute_placebo_effects uses legacy"): - placebo_effects = compute_placebo_effects( - Y_pre_control, - Y_post_control, - Y_pre_treated, - unit_weights, - time_weights, - control_units, - n_placebo=5 - ) - - assert len(placebo_effects) == 5 - - # ============================================================================= # Additional tests for compute_synthetic_weights # =============================================================================