From 816a92c9115fb169bc47d956152ec8fe7ad877c9 Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 19 Jan 2026 09:26:35 -0500 Subject: [PATCH 1/2] Add DGP functions to prep.py for all supported DiD designs Consolidate Data Generating Process functions from tutorials and tests into diff_diff/prep.py as reusable library utilities: - generate_staggered_data(): Staggered adoption DiD (CallawaySantAnna, SunAbraham) - generate_factor_data(): Factor model data (TROP, SyntheticDiD) - generate_ddd_data(): Triple Difference (DDD) designs - generate_panel_data(): Panel data with optional parallel trends violations - generate_event_study_data(): Event study with simultaneous treatment Changes: - Add 5 new DGP functions to diff_diff/prep.py with full documentation - Export new functions from diff_diff/__init__.py - Add 33 tests covering all new functions in tests/test_prep.py - Update test files to use library functions where compatible - Update tutorials 02, 04, 07, 08, 10 to import from library - Fix pre-existing API bug in tutorial 07 (show_mdv -> mdv parameter) Users can now generate synthetic data via: from diff_diff import generate_staggered_data, generate_factor_data, ... 
Co-Authored-By: Claude Opus 4.5 --- diff_diff/__init__.py | 10 + diff_diff/prep.py | 655 ++++++++++++++++++++++++ docs/tutorials/02_staggered_did.ipynb | 50 +- docs/tutorials/04_parallel_trends.ipynb | 62 +-- docs/tutorials/07_pretrends_power.ipynb | 85 +-- docs/tutorials/08_triple_diff.ipynb | 50 +- docs/tutorials/10_trop.ipynb | 178 +------ tests/test_prep.py | 379 ++++++++++++++ tests/test_staggered.py | 71 +-- tests/test_triple_diff.py | 24 +- tests/test_trop.py | 63 +-- 11 files changed, 1101 insertions(+), 526 deletions(-) diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py index aebf32a5..7143b960 100644 --- a/diff_diff/__init__.py +++ b/diff_diff/__init__.py @@ -71,6 +71,11 @@ balance_panel, create_event_time, generate_did_data, + generate_ddd_data, + generate_event_study_data, + generate_factor_data, + generate_panel_data, + generate_staggered_data, make_post_indicator, make_treatment_indicator, rank_control_units, @@ -190,6 +195,11 @@ "validate_did_data", "summarize_did_data", "generate_did_data", + "generate_staggered_data", + "generate_factor_data", + "generate_ddd_data", + "generate_panel_data", + "generate_event_study_data", "create_event_time", "aggregate_to_cohorts", "rank_control_units", diff --git a/diff_diff/prep.py b/diff_diff/prep.py index b276eb49..d7f2dddf 100644 --- a/diff_diff/prep.py +++ b/diff_diff/prep.py @@ -1336,3 +1336,658 @@ def _suggest_treatment_candidates( # Return top candidates result = result.nlargest(n_candidates, 'treatment_candidate_score') return result.reset_index(drop=True) + + +def generate_staggered_data( + n_units: int = 100, + n_periods: int = 10, + cohort_periods: Optional[List[int]] = None, + never_treated_frac: float = 0.3, + treatment_effect: float = 2.0, + dynamic_effects: bool = True, + effect_growth: float = 0.1, + unit_fe_sd: float = 2.0, + time_trend: float = 0.1, + noise_sd: float = 0.5, + seed: Optional[int] = None, +) -> pd.DataFrame: + """ + Generate synthetic data for staggered adoption DiD 
analysis. + + Creates panel data where different units receive treatment at different + times (staggered rollout). Useful for testing CallawaySantAnna, + SunAbraham, and other staggered DiD estimators. + + Parameters + ---------- + n_units : int, default=100 + Total number of units in the panel. + n_periods : int, default=10 + Number of time periods. + cohort_periods : list of int, optional + Periods when treatment cohorts are first treated (each must satisfy 1 <= period < n_periods). + If None: [3, 5, 7] when n_periods >= 8, else [n_periods // 3, 2 * n_periods // 3]. + never_treated_frac : float, default=0.3 + Fraction of units that are never treated (cohort 0). + treatment_effect : float, default=2.0 + Base treatment effect at time of treatment. + dynamic_effects : bool, default=True + If True, treatment effects grow over time since treatment. + effect_growth : float, default=0.1 + Proportional per-period growth in treatment effect (if dynamic_effects=True). + Effect t periods after first treatment: treatment_effect * (1 + effect_growth * t). + unit_fe_sd : float, default=2.0 + Standard deviation of unit fixed effects. + time_trend : float, default=0.1 + Linear time trend coefficient. + noise_sd : float, default=0.5 + Standard deviation of idiosyncratic noise. + seed : int, optional + Random seed for reproducibility. 
+ + Returns + ------- + pd.DataFrame + Synthetic staggered adoption data with columns: + - unit: Unit identifier + - period: Time period + - outcome: Outcome variable + - first_treat: First treatment period (0 = never treated) + - treated: Binary indicator (1 if treated at this observation) + - treat: Binary unit-level ever-treated indicator + - true_effect: The true treatment effect for this observation + + Examples + -------- + Generate staggered adoption data and count units per cohort: + + >>> data = generate_staggered_data(n_units=100, n_periods=10, seed=42) + >>> data.groupby('unit')['first_treat'].first().value_counts().sort_index() + 0 30 + 3 24 + 5 23 + 7 23 + Name: first_treat, dtype: int64 + + Use with Callaway-Sant'Anna estimator: + + >>> from diff_diff import CallawaySantAnna + >>> cs = CallawaySantAnna() + >>> results = cs.fit(data, outcome='outcome', unit='unit', + ... time='period', first_treat='first_treat') + >>> results.overall_att > 0 + True + """ + rng = np.random.default_rng(seed) + + # Default cohort periods if not specified + if cohort_periods is None: + cohort_periods = [3, 5, 7] if n_periods >= 8 else [n_periods // 3, 2 * n_periods // 3] + + # Validate cohort periods + for cp in cohort_periods: + if cp < 1 or cp >= n_periods: + raise ValueError( + f"Cohort period {cp} must be between 1 and {n_periods - 1}" + ) + + # Determine number of never-treated and treated units + n_never = int(n_units * never_treated_frac) + n_treated = n_units - n_never + + # Assign treatment cohorts + first_treat = np.zeros(n_units, dtype=int) + if n_treated > 0: + cohort_assignments = rng.choice(len(cohort_periods), size=n_treated) + first_treat[n_never:] = [cohort_periods[c] for c in cohort_assignments] + + # Generate unit fixed effects + unit_fe = rng.normal(0, unit_fe_sd, n_units) + + # Build data + records = [] + for unit in range(n_units): + unit_first_treat = first_treat[unit] + is_ever_treated = unit_first_treat > 0 + + for period in range(n_periods): + # Check if treated at this observation + is_treated = 
is_ever_treated and period >= unit_first_treat + + # Base outcome: unit FE + time trend + y = 10.0 + unit_fe[unit] + time_trend * period + + # Treatment effect + effect = 0.0 + if is_treated: + time_since_treatment = period - unit_first_treat + if dynamic_effects: + effect = treatment_effect * (1 + effect_growth * time_since_treatment) + else: + effect = treatment_effect + y += effect + + # Add noise + y += rng.normal(0, noise_sd) + + records.append({ + "unit": unit, + "period": period, + "outcome": y, + "first_treat": unit_first_treat, + "treated": int(is_treated), + "treat": int(is_ever_treated), + "true_effect": effect, + }) + + return pd.DataFrame(records) + + +def generate_factor_data( + n_units: int = 50, + n_pre: int = 10, + n_post: int = 5, + n_treated: int = 10, + n_factors: int = 2, + treatment_effect: float = 2.0, + factor_strength: float = 1.0, + treated_loading_shift: float = 0.5, + unit_fe_sd: float = 1.0, + noise_sd: float = 0.5, + seed: Optional[int] = None, +) -> pd.DataFrame: + """ + Generate synthetic panel data with interactive fixed effects (factor model). + + Creates data following the DGP: + Y_it = mu + alpha_i + beta_t + Lambda_i'F_t + tau*D_it + eps_it + + where Lambda_i'F_t is the interactive fixed effects component. Useful for + testing TROP (Triply Robust Panel) and comparing with SyntheticDiD. + + Parameters + ---------- + n_units : int, default=50 + Total number of units in the panel. + n_pre : int, default=10 + Number of pre-treatment periods. + n_post : int, default=5 + Number of post-treatment periods. + n_treated : int, default=10 + Number of treated units (assigned to first n_treated unit IDs). + n_factors : int, default=2 + Number of latent factors in the interactive fixed effects. + treatment_effect : float, default=2.0 + True average treatment effect on the treated. + factor_strength : float, default=1.0 + Scaling factor for interactive fixed effects. 
+ treated_loading_shift : float, default=0.5 + Shift in factor loadings for treated units (creates confounding). + unit_fe_sd : float, default=1.0 + Standard deviation of unit fixed effects. + noise_sd : float, default=0.5 + Standard deviation of idiosyncratic noise. + seed : int, optional + Random seed for reproducibility. + + Returns + ------- + pd.DataFrame + Synthetic factor model data with columns: + - unit: Unit identifier + - period: Time period + - outcome: Outcome variable + - treated: Binary indicator (1 if treated at this observation) + - treat: Binary unit-level ever-treated indicator + - true_effect: The true treatment effect for this observation + + Examples + -------- + Generate data with factor structure: + + >>> data = generate_factor_data(n_units=50, n_factors=2, seed=42) + >>> data.shape + (750, 6) + + Use with TROP estimator: + + >>> from diff_diff import TROP + >>> trop = TROP(n_bootstrap=50, seed=42) + >>> results = trop.fit(data, outcome='outcome', treatment='treated', + ... unit='unit', time='period', + ... post_periods=list(range(10, 15))) + + Notes + ----- + The treated units have systematically different factor loadings + (shifted by `treated_loading_shift`), which creates confounding + that standard DiD cannot address but TROP can handle. 
+ """ + rng = np.random.default_rng(seed) + + n_control = n_units - n_treated + n_periods = n_pre + n_post + + if n_treated > n_units: + raise ValueError(f"n_treated ({n_treated}) cannot exceed n_units ({n_units})") + if n_treated < 1: + raise ValueError("n_treated must be at least 1") + + # Generate factors F: (n_periods, n_factors) + F = rng.normal(0, 1, (n_periods, n_factors)) + + # Generate loadings Lambda: (n_factors, n_units) + # Treated units have shifted loadings (creates confounding) + Lambda = rng.normal(0, 1, (n_factors, n_units)) + Lambda[:, :n_treated] += treated_loading_shift + + # Unit fixed effects (treated units have higher baseline) + alpha = rng.normal(0, unit_fe_sd, n_units) + alpha[:n_treated] += 1.0 + + # Time fixed effects (linear trend) + beta = np.linspace(0, 2, n_periods) + + # Generate outcomes + records = [] + for i in range(n_units): + is_ever_treated = i < n_treated + + for t in range(n_periods): + post = t >= n_pre + + # Base outcome + y = 10.0 + alpha[i] + beta[t] + + # Interactive fixed effects: Lambda_i' F_t + y += factor_strength * (Lambda[:, i] @ F[t, :]) + + # Treatment effect + effect = 0.0 + if is_ever_treated and post: + effect = treatment_effect + y += effect + + # Add noise + y += rng.normal(0, noise_sd) + + records.append({ + "unit": i, + "period": t, + "outcome": y, + "treated": int(is_ever_treated and post), + "treat": int(is_ever_treated), + "true_effect": effect, + }) + + return pd.DataFrame(records) + + +def generate_ddd_data( + n_per_cell: int = 100, + treatment_effect: float = 2.0, + group_effect: float = 2.0, + partition_effect: float = 1.0, + time_effect: float = 0.5, + noise_sd: float = 1.0, + add_covariates: bool = False, + seed: Optional[int] = None, +) -> pd.DataFrame: + """ + Generate synthetic data for Triple Difference (DDD) analysis. + + Creates data following the DGP: + Y = mu + G + P + T + G*P + G*T + P*T + tau*G*P*T + eps + + where G=group, P=partition, T=time. 
The treatment effect (tau) only + applies to units that are in the treated group (G=1), eligible + partition (P=1), and post-treatment period (T=1). + + Parameters + ---------- + n_per_cell : int, default=100 + Number of observations per cell (8 cells total: 2x2x2). + treatment_effect : float, default=2.0 + True average treatment effect on the treated (G=1, P=1, T=1). + group_effect : float, default=2.0 + Main effect of being in treated group. + partition_effect : float, default=1.0 + Main effect of being in eligible partition. + time_effect : float, default=0.5 + Main effect of post-treatment period. + noise_sd : float, default=1.0 + Standard deviation of idiosyncratic noise. + add_covariates : bool, default=False + If True, adds age and education covariates that affect outcome. + seed : int, optional + Random seed for reproducibility. + + Returns + ------- + pd.DataFrame + Synthetic DDD data with columns: + - outcome: Outcome variable + - group: Group indicator (0=control, 1=treated) + - partition: Partition indicator (0=ineligible, 1=eligible) + - time: Time indicator (0=pre, 1=post) + - unit_id: Unique unit identifier + - true_effect: The true treatment effect for this observation + - age: Age covariate (if add_covariates=True) + - education: Education covariate (if add_covariates=True) + + Examples + -------- + Generate DDD data: + + >>> data = generate_ddd_data(n_per_cell=100, treatment_effect=3.0, seed=42) + >>> data.shape + (800, 6) + >>> data.groupby(['group', 'partition', 'time']).size() + group partition time + 0 0 0 100 + 1 100 + 1 0 100 + 1 100 + 1 0 0 100 + 1 100 + 1 0 100 + 1 100 + dtype: int64 + + Use with TripleDifference estimator: + + >>> from diff_diff import TripleDifference + >>> ddd = TripleDifference() + >>> results = ddd.fit(data, outcome='outcome', group='group', + ... 
partition='partition', time='time') + >>> abs(results.att - 3.0) < 1.0 + True + """ + rng = np.random.default_rng(seed) + + records = [] + unit_id = 0 + + for g in [0, 1]: # group (0=control state, 1=treated state) + for p in [0, 1]: # partition (0=ineligible, 1=eligible) + for t in [0, 1]: # time (0=pre, 1=post) + for _ in range(n_per_cell): + # Base outcome with main effects + y = 50 + group_effect * g + partition_effect * p + time_effect * t + + # Second-order interactions (non-treatment) + y += 1.5 * g * p # group-partition interaction + y += 1.0 * g * t # group-time interaction (diff trends) + y += 0.5 * p * t # partition-time interaction + + # Treatment effect: ONLY for G=1, P=1, T=1 + effect = 0.0 + if g == 1 and p == 1 and t == 1: + effect = treatment_effect + y += effect + + # Covariates (always generated for consistency) + age = rng.normal(40, 10) + education = rng.choice([12, 14, 16, 18], p=[0.3, 0.3, 0.25, 0.15]) + + if add_covariates: + y += 0.1 * age + 0.5 * education + + # Add noise + y += rng.normal(0, noise_sd) + + record = { + "outcome": y, + "group": g, + "partition": p, + "time": t, + "unit_id": unit_id, + "true_effect": effect, + } + + if add_covariates: + record["age"] = age + record["education"] = education + + records.append(record) + unit_id += 1 + + return pd.DataFrame(records) + + +def generate_panel_data( + n_units: int = 100, + n_periods: int = 8, + treatment_period: int = 4, + treatment_fraction: float = 0.5, + treatment_effect: float = 5.0, + parallel_trends: bool = True, + trend_violation: float = 1.0, + unit_fe_sd: float = 2.0, + noise_sd: float = 0.5, + seed: Optional[int] = None, +) -> pd.DataFrame: + """ + Generate synthetic panel data for parallel trends testing. + + Creates panel data with optional violation of parallel trends, useful + for testing parallel trends diagnostics, placebo tests, and sensitivity + analysis methods. + + Parameters + ---------- + n_units : int, default=100 + Total number of units in the panel. 
+ n_periods : int, default=8 + Number of time periods. + treatment_period : int, default=4 + First post-treatment period (0-indexed); must satisfy 1 <= treatment_period < n_periods. + treatment_fraction : float, default=0.5 + Fraction of units that receive treatment. + treatment_effect : float, default=5.0 + True average treatment effect on the treated. + parallel_trends : bool, default=True + If True, treated and control groups share the same linear time trend. + If False, treated group has a different linear trend in all periods (steeper when trend_violation > 0). + trend_violation : float, default=1.0 + Size of the differential trend for treated group when parallel_trends=False. + Treated units have slope 1.0 + trend_violation per period; control units have slope 1.0. + unit_fe_sd : float, default=2.0 + Standard deviation of unit fixed effects. + noise_sd : float, default=0.5 + Standard deviation of idiosyncratic noise. + seed : int, optional + Random seed for reproducibility. + + Returns + ------- + pd.DataFrame + Synthetic panel data with columns: + - unit: Unit identifier + - period: Time period + - treated: Binary unit-level treatment indicator + - post: Binary post-treatment indicator + - outcome: Outcome variable + - true_effect: The true treatment effect for this observation + + Examples + -------- + Generate data with parallel trends: + + >>> data_parallel = generate_panel_data(parallel_trends=True, seed=42) + >>> from diff_diff.utils import check_parallel_trends + >>> result = check_parallel_trends(data_parallel, outcome='outcome', + ... time='period', treatment_group='treated', + ... pre_periods=[0, 1, 2, 3]) + >>> result['parallel_trends_plausible'] + True + + Generate data with trend violation: + + >>> data_violation = generate_panel_data(parallel_trends=False, seed=42) + >>> result = check_parallel_trends(data_violation, outcome='outcome', + ... time='period', treatment_group='treated', + ... 
pre_periods=[0, 1, 2, 3]) + >>> result['parallel_trends_plausible'] + False + """ + rng = np.random.default_rng(seed) + + if treatment_period < 1: + raise ValueError("treatment_period must be at least 1") + if treatment_period >= n_periods: + raise ValueError(f"treatment_period must be less than n_periods ({n_periods})") + + n_treated = int(n_units * treatment_fraction) + + records = [] + for unit in range(n_units): + is_treated = unit < n_treated + unit_fe = rng.normal(0, unit_fe_sd) + + for period in range(n_periods): + post = period >= treatment_period + + # Base time effect (common trend) + if parallel_trends: + time_effect = period * 1.0 + else: + # Different trends: treated has steeper pre-treatment trend + if is_treated: + time_effect = period * (1.0 + trend_violation) + else: + time_effect = period * 1.0 + + y = 10.0 + unit_fe + time_effect + + # Treatment effect (only for treated in post-period) + effect = 0.0 + if is_treated and post: + effect = treatment_effect + y += effect + + # Add noise + y += rng.normal(0, noise_sd) + + records.append({ + "unit": unit, + "period": period, + "treated": int(is_treated), + "post": int(post), + "outcome": y, + "true_effect": effect, + }) + + return pd.DataFrame(records) + + +def generate_event_study_data( + n_units: int = 300, + n_pre: int = 5, + n_post: int = 5, + treatment_fraction: float = 0.5, + treatment_effect: float = 5.0, + unit_fe_sd: float = 2.0, + noise_sd: float = 2.0, + seed: Optional[int] = None, +) -> pd.DataFrame: + """ + Generate synthetic data for event study analysis. + + Creates panel data with simultaneous treatment at period n_pre. + Useful for testing MultiPeriodDiD, pre-trends power analysis, + and HonestDiD sensitivity analysis. + + Parameters + ---------- + n_units : int, default=300 + Total number of units in the panel. + n_pre : int, default=5 + Number of pre-treatment periods. + n_post : int, default=5 + Number of post-treatment periods. 
+ treatment_fraction : float, default=0.5 + Fraction of units that receive treatment. + treatment_effect : float, default=5.0 + True average treatment effect on the treated. + unit_fe_sd : float, default=2.0 + Standard deviation of unit fixed effects. + noise_sd : float, default=2.0 + Standard deviation of idiosyncratic noise. + seed : int, optional + Random seed for reproducibility. + + Returns + ------- + pd.DataFrame + Synthetic event study data with columns: + - unit: Unit identifier + - period: Time period + - treated: Binary unit-level treatment indicator + - post: Binary post-treatment indicator + - outcome: Outcome variable + - event_time: Time relative to treatment (negative=pre, 0+=post) + - true_effect: The true treatment effect for this observation + + Examples + -------- + Generate event study data: + + >>> data = generate_event_study_data(n_units=300, n_pre=5, n_post=5, seed=42) + >>> data['event_time'].unique() + array([-5, -4, -3, -2, -1, 0, 1, 2, 3, 4]) + + Use with MultiPeriodDiD: + + >>> from diff_diff import MultiPeriodDiD + >>> mp_did = MultiPeriodDiD() + >>> results = mp_did.fit(data, outcome='outcome', treatment='treated', + ... 
time='period', post_periods=[5, 6, 7, 8, 9]) + + Notes + ----- + The event_time column is relative to treatment: + - Negative values: pre-treatment periods + - 0: first post-treatment period + - Positive values: subsequent post-treatment periods + """ + rng = np.random.default_rng(seed) + + n_periods = n_pre + n_post + treatment_period = n_pre + n_treated = int(n_units * treatment_fraction) + + records = [] + for unit in range(n_units): + is_treated = unit < n_treated + unit_fe = rng.normal(0, unit_fe_sd) + + for period in range(n_periods): + post = period >= treatment_period + event_time = period - treatment_period + + # Common time trend + time_effect = period * 0.5 + + y = 10.0 + unit_fe + time_effect + + # Treatment effect (only for treated in post-period) + effect = 0.0 + if is_treated and post: + effect = treatment_effect + y += effect + + # Add noise + y += rng.normal(0, noise_sd) + + records.append({ + "unit": unit, + "period": period, + "treated": int(is_treated), + "post": int(post), + "outcome": y, + "event_time": event_time, + "true_effect": effect, + }) + + return pd.DataFrame(records) diff --git a/docs/tutorials/02_staggered_did.ipynb b/docs/tutorials/02_staggered_did.ipynb index 8a4c35db..cbde2ab2 100644 --- a/docs/tutorials/02_staggered_did.ipynb +++ b/docs/tutorials/02_staggered_did.ipynb @@ -73,53 +73,7 @@ } }, "outputs": [], - "source": [ - "# Generate staggered adoption data\n", - "np.random.seed(42)\n", - "\n", - "n_units = 100\n", - "n_periods = 8\n", - "\n", - "# Define treatment cohorts\n", - "# Cohort 3: treated starting period 3\n", - "# Cohort 5: treated starting period 5\n", - "# Cohort 0: never treated\n", - "cohorts = {3: 30, 5: 30, 0: 40} # cohort: n_units\n", - "\n", - "data = []\n", - "unit_id = 0\n", - "\n", - "for cohort, n_in_cohort in cohorts.items():\n", - " for _ in range(n_in_cohort):\n", - " unit_effect = np.random.normal(0, 2)\n", - " \n", - " for period in range(n_periods):\n", - " # Outcome model\n", - " y = 10.0 + 
unit_effect + period * 0.5\n", - " \n", - " # Treatment effect (heterogeneous by cohort and time since treatment)\n", - " if cohort > 0 and period >= cohort:\n", - " time_since_treatment = period - cohort\n", - " # Effect grows over time: 2.0 + 0.5 * time_since_treatment\n", - " treatment_effect = 2.0 + 0.5 * time_since_treatment\n", - " y += treatment_effect\n", - " \n", - " y += np.random.normal(0, 0.5)\n", - " \n", - " data.append({\n", - " 'unit': unit_id,\n", - " 'period': period,\n", - " 'cohort': cohort,\n", - " 'treated': int(cohort > 0 and period >= cohort),\n", - " 'outcome': y\n", - " })\n", - " \n", - " unit_id += 1\n", - "\n", - "df = pd.DataFrame(data)\n", - "print(f\"Dataset: {len(df)} observations, {n_units} units, {n_periods} periods\")\n", - "df.head(10)" - ] + "source": "# Generate staggered adoption data using the library function\nfrom diff_diff import generate_staggered_data\n\n# Generate data with 100 units, 8 periods, two treatment cohorts (periods 3 and 5),\n# and 40% never-treated\ndf = generate_staggered_data(\n n_units=100,\n n_periods=8,\n cohort_periods=[3, 5], # Treatment cohorts at periods 3 and 5\n never_treated_frac=0.4,\n treatment_effect=2.0,\n dynamic_effects=True,\n effect_growth=0.5, # Effect = 2.0 * (1 + 0.5 * periods since treatment), i.e. +1.0 per period\n unit_fe_sd=2.0,\n noise_sd=0.5,\n seed=42\n)\n\n# Add a 'cohort' column that matches the old format (first_treat is already there)\ndf['cohort'] = df['first_treat']\n\nprint(f\"Dataset: {len(df)} observations, {df['unit'].nunique()} units, {df['period'].nunique()} periods\")\ndf.head(10)" }, { "cell_type": "code", @@ -958,4 +912,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/docs/tutorials/04_parallel_trends.ipynb b/docs/tutorials/04_parallel_trends.ipynb index 17b6aa4c..ef57e8ec 100644 --- a/docs/tutorials/04_parallel_trends.ipynb +++ b/docs/tutorials/04_parallel_trends.ipynb @@ -79,65 +79,7 @@ } }, "outputs": [], - "source": [ - "def generate_panel_data(n_units=100, 
n_periods=8, parallel=True, seed=42):\n", - " \"\"\"\n", - " Generate panel data with or without parallel trends.\n", - " \n", - " Parameters\n", - " ----------\n", - " parallel : bool\n", - " If True, treated and control have the same pre-treatment trend.\n", - " If False, treated has a steeper trend.\n", - " \"\"\"\n", - " np.random.seed(seed)\n", - " \n", - " treatment_time = n_periods // 2\n", - " \n", - " data = []\n", - " for unit in range(n_units):\n", - " is_treated = unit < n_units // 2\n", - " unit_effect = np.random.normal(0, 2)\n", - " \n", - " for period in range(n_periods):\n", - " # Base trend\n", - " if parallel:\n", - " # Same trend for both groups\n", - " time_effect = period * 1.0\n", - " else:\n", - " # Different trends\n", - " if is_treated:\n", - " time_effect = period * 2.0 # Steeper trend for treated\n", - " else:\n", - " time_effect = period * 1.0\n", - " \n", - " y = 10.0 + unit_effect + time_effect\n", - " \n", - " # Treatment effect in post-period\n", - " post = period >= treatment_time\n", - " if is_treated and post:\n", - " y += 5.0 # True ATT\n", - " \n", - " y += np.random.normal(0, 0.5)\n", - " \n", - " data.append({\n", - " 'unit': unit,\n", - " 'period': period,\n", - " 'treated': int(is_treated),\n", - " 'post': int(post),\n", - " 'outcome': y\n", - " })\n", - " \n", - " return pd.DataFrame(data)\n", - "\n", - "# Generate both datasets\n", - "df_parallel = generate_panel_data(parallel=True)\n", - "df_nonparallel = generate_panel_data(parallel=False)\n", - "\n", - "print(\"Generated two datasets:\")\n", - "print(f\" - df_parallel: Parallel trends holds\")\n", - "print(f\" - df_nonparallel: Parallel trends violated\")" - ] + "source": "# Generate panel data using the library function\nfrom diff_diff import generate_panel_data\n\n# Generate data with parallel trends\ndf_parallel = generate_panel_data(\n n_units=100,\n n_periods=8,\n treatment_period=4,\n treatment_fraction=0.5,\n treatment_effect=5.0,\n parallel_trends=True, # 
Parallel trends holds\n unit_fe_sd=2.0,\n noise_sd=0.5,\n seed=42\n)\n\n# Generate data with non-parallel trends (violation)\ndf_nonparallel = generate_panel_data(\n n_units=100,\n n_periods=8,\n treatment_period=4,\n treatment_fraction=0.5,\n treatment_effect=5.0,\n parallel_trends=False, # Treated has steeper trend\n trend_violation=1.0, # Differential trend = 1.0 per period\n unit_fe_sd=2.0,\n noise_sd=0.5,\n seed=42\n)\n\nprint(\"Generated two datasets:\")\nprint(f\" - df_parallel: Parallel trends holds\")\nprint(f\" - df_nonparallel: Parallel trends violated\")" }, { "cell_type": "markdown", @@ -856,4 +798,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/docs/tutorials/07_pretrends_power.ipynb b/docs/tutorials/07_pretrends_power.ipynb index 01965101..cf96f251 100644 --- a/docs/tutorials/07_pretrends_power.ipynb +++ b/docs/tutorials/07_pretrends_power.ipynb @@ -104,53 +104,7 @@ } }, "outputs": [], - "source": [ - "def generate_event_study_data(n_units=300, n_periods=10, true_att=5.0, seed=42):\n", - " \"\"\"\n", - " Generate panel data for event study analysis.\n", - " \n", - " - 5 pre-treatment periods (0-4)\n", - " - 5 post-treatment periods (5-9)\n", - " - Half of units are treated starting at period 5\n", - " \"\"\"\n", - " np.random.seed(seed)\n", - " treatment_time = n_periods // 2\n", - " \n", - " data = []\n", - " for unit in range(n_units):\n", - " is_treated = unit < n_units // 2\n", - " unit_effect = np.random.normal(0, 2)\n", - " \n", - " for period in range(n_periods):\n", - " # Common time trend\n", - " time_effect = period * 0.5\n", - " \n", - " y = 10.0 + unit_effect + time_effect\n", - " \n", - " # Treatment effect (only post-treatment)\n", - " post = period >= treatment_time\n", - " if is_treated and post:\n", - " y += true_att\n", - " \n", - " y += np.random.normal(0, 2)\n", - " \n", - " data.append({\n", - " 'unit': unit,\n", - " 'period': period,\n", - " 'treated': int(is_treated),\n", - " 'post': 
int(post),\n", - " 'outcome': y\n", - " })\n", - " \n", - " return pd.DataFrame(data)\n", - "\n", - "# Generate data\n", - "df = generate_event_study_data()\n", - "print(f\"Generated {len(df)} observations\")\n", - "print(f\"Units: {df['unit'].nunique()} ({df[df['treated']==1]['unit'].nunique()} treated)\")\n", - "print(f\"Periods: {df['period'].nunique()} (5 pre, 5 post)\")\n", - "print(f\"True ATT: 5.0\")" - ] + "source": "# Generate event study data using the library function\nfrom diff_diff import generate_event_study_data\n\n# Generate panel data for event study analysis:\n# - 5 pre-treatment periods (0-4)\n# - 5 post-treatment periods (5-9)\n# - Half of units are treated starting at period 5\ndf = generate_event_study_data(\n n_units=300,\n n_pre=5,\n n_post=5,\n treatment_fraction=0.5,\n treatment_effect=5.0,\n unit_fe_sd=2.0,\n noise_sd=2.0,\n seed=42\n)\n\nprint(f\"Generated {len(df)} observations\")\nprint(f\"Units: {df['unit'].nunique()} ({df[df['treated']==1]['unit'].nunique()} treated)\")\nprint(f\"Periods: {df['period'].nunique()} (5 pre, 5 post)\")\nprint(f\"True ATT: 5.0\")" }, { "cell_type": "markdown", @@ -407,21 +361,7 @@ } }, "outputs": [], - "source": [ - "# Plot the power curve\n", - "if HAS_MATPLOTLIB:\n", - " fig, ax = plt.subplots(figsize=(10, 6))\n", - " plot_pretrends_power(\n", - " curve,\n", - " ax=ax,\n", - " show_mdv=True,\n", - " target_power=0.80,\n", - " title='Pre-Trends Test Power Curve',\n", - " show=False\n", - " )\n", - " plt.tight_layout()\n", - " plt.show()" - ] + "source": "# Plot the power curve\nif HAS_MATPLOTLIB:\n fig, ax = plt.subplots(figsize=(10, 6))\n plot_pretrends_power(\n curve,\n ax=ax,\n mdv=pt_results.mdv, # Show MDV line on plot\n target_power=0.80,\n title='Pre-Trends Test Power Curve',\n show=False\n )\n plt.tight_layout()\n plt.show()" }, { "cell_type": "markdown", @@ -537,26 +477,7 @@ } }, "outputs": [], - "source": [ - "if HAS_MATPLOTLIB:\n", - " fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n", - " 
\n", - " for ax, vtype in zip(axes, ['linear', 'constant', 'last_period']):\n", - " pt_v = PreTrendsPower(violation_type=vtype)\n", - " curve_v = pt_v.power_curve(event_results, n_points=50, pre_periods=pre_treatment_periods)\n", - " \n", - " plot_pretrends_power(\n", - " curve_v,\n", - " ax=ax,\n", - " show_mdv=True,\n", - " target_power=0.80,\n", - " title=f'Violation Type: {vtype.replace(\"_\", \" \").title()}',\n", - " show=False\n", - " )\n", - " \n", - " plt.tight_layout()\n", - " plt.show()" - ] + "source": "if HAS_MATPLOTLIB:\n fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n \n for ax, vtype in zip(axes, ['linear', 'constant', 'last_period']):\n pt_v = PreTrendsPower(violation_type=vtype)\n results_v = pt_v.fit(event_results, pre_periods=pre_treatment_periods)\n curve_v = pt_v.power_curve(event_results, n_points=50, pre_periods=pre_treatment_periods)\n \n plot_pretrends_power(\n curve_v,\n ax=ax,\n mdv=results_v.mdv, # Show MDV line on plot\n target_power=0.80,\n title=f'Violation Type: {vtype.replace(\"_\", \" \").title()}',\n show=False\n )\n \n plt.tight_layout()\n plt.show()" }, { "cell_type": "markdown", diff --git a/docs/tutorials/08_triple_diff.ipynb b/docs/tutorials/08_triple_diff.ipynb index 18152db4..10e447aa 100644 --- a/docs/tutorials/08_triple_diff.ipynb +++ b/docs/tutorials/08_triple_diff.ipynb @@ -72,53 +72,7 @@ } }, "outputs": [], - "source": [ - "def generate_ddd_data(n_per_cell=200, true_att=2.0, seed=42):\n", - " \"\"\"Generate synthetic DDD data.\"\"\"\n", - " rng = np.random.default_rng(seed)\n", - " \n", - " rows = []\n", - " for g in [0, 1]: # group (0=control state, 1=treated state)\n", - " for p in [0, 1]: # partition (0=ineligible, 1=eligible)\n", - " for t in [0, 1]: # time (0=pre, 1=post)\n", - " for i in range(n_per_cell):\n", - " # Base outcome with group and partition effects\n", - " y = 50 + 5*g + 3*p + 2*t # baseline differences\n", - " \n", - " # Add second-order interactions (non-treatment)\n", - " y += 1.5*g*p # 
group-partition interaction\n", - " y += 1.0*g*t # differential trend by group (would bias DiD)\n", - " y += 0.5*p*t # differential trend by partition\n", - " \n", - " # Treatment effect: ONLY for G=1, P=1, T=1\n", - " if g == 1 and p == 1 and t == 1:\n", - " y += true_att\n", - " \n", - " # Covariates\n", - " age = rng.normal(40, 10)\n", - " education = rng.choice([12, 14, 16, 18], p=[0.3, 0.3, 0.25, 0.15])\n", - " y += 0.1*age + 0.5*education # covariate effects\n", - " \n", - " # Add noise\n", - " y += rng.normal(0, 3)\n", - " \n", - " rows.append({\n", - " 'outcome': y,\n", - " 'group': g,\n", - " 'partition': p,\n", - " 'time': t,\n", - " 'age': age,\n", - " 'education': education,\n", - " })\n", - " \n", - " return pd.DataFrame(rows)\n", - "\n", - "# Generate data\n", - "data = generate_ddd_data(n_per_cell=200, true_att=2.0)\n", - "print(f\"Dataset shape: {data.shape}\")\n", - "print(f\"\\nSample composition:\")\n", - "print(data.groupby(['group', 'partition', 'time']).size().unstack(fill_value=0))" - ] + "source": "# Generate DDD data using the library function\nfrom diff_diff import generate_ddd_data\n\n# Generate synthetic DDD data that mimics a policy setting:\n# - Enacted in some states (group=1) but not others (group=0)\n# - Affects only eligible individuals (partition=1) but not others (partition=0)\n# - Has a true treatment effect of 2.0\ndata = generate_ddd_data(\n n_per_cell=200,\n treatment_effect=2.0,\n group_effect=5.0, # Main effect of being in treated group\n partition_effect=3.0, # Main effect of being in eligible partition\n time_effect=2.0, # Main effect of post-treatment period\n noise_sd=3.0,\n add_covariates=True, # Include age and education covariates\n seed=42\n)\n\nprint(f\"Dataset shape: {data.shape}\")\nprint(f\"\\nSample composition:\")\nprint(data.groupby(['group', 'partition', 'time']).size().unstack(fill_value=0))" }, { "cell_type": "markdown", @@ -516,4 +470,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of 
file diff --git a/docs/tutorials/10_trop.ipynb b/docs/tutorials/10_trop.ipynb index 715d54a2..6e9c4149 100644 --- a/docs/tutorials/10_trop.ipynb +++ b/docs/tutorials/10_trop.ipynb @@ -65,105 +65,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "def generate_factor_dgp(\n", - " n_units=50,\n", - " n_pre=10,\n", - " n_post=5,\n", - " n_treated=10,\n", - " n_factors=2,\n", - " treatment_effect=2.0,\n", - " factor_strength=1.0,\n", - " noise_std=0.5,\n", - " seed=42\n", - "):\n", - " \"\"\"\n", - " Generate panel data with known factor structure.\n", - " \n", - " DGP: Y_it = mu + alpha_i + beta_t + L_it + tau*D_it + eps_it\n", - " \n", - " where L_it = Lambda_i'F_t is the interactive fixed effects component.\n", - " \n", - " This creates a scenario where standard DiD/SDID may be biased,\n", - " but TROP should recover the true treatment effect.\n", - " \n", - " Returns DataFrame with columns:\n", - " - 'treated': observation-level indicator (1 if treated AND post-period) - for TROP\n", - " - 'treat': unit-level ever-treated indicator (1 for all periods if unit is treated) - for SDID\n", - " \"\"\"\n", - " rng = np.random.default_rng(seed)\n", - " \n", - " n_control = n_units - n_treated\n", - " n_periods = n_pre + n_post\n", - " \n", - " # Generate factors F: (n_periods, n_factors)\n", - " F = rng.normal(0, 1, (n_periods, n_factors))\n", - " \n", - " # Generate loadings Lambda: (n_factors, n_units)\n", - " # Make treated units have correlated loadings (creates confounding)\n", - " Lambda = rng.normal(0, 1, (n_factors, n_units))\n", - " Lambda[:, :n_treated] += 0.5 # Treated units have higher loadings\n", - " \n", - " # Unit fixed effects\n", - " alpha = rng.normal(0, 1, n_units)\n", - " alpha[:n_treated] += 1.0 # Treated units have higher intercept\n", - " \n", - " # Time fixed effects\n", - " beta = np.linspace(0, 2, n_periods)\n", - " \n", - " # Generate outcomes\n", - " data = []\n", - " for i in range(n_units):\n", - " is_treated = i < 
n_treated\n", - " \n", - " for t in range(n_periods):\n", - " post = t >= n_pre\n", - " \n", - " y = 10.0 + alpha[i] + beta[t]\n", - " y += factor_strength * (Lambda[:, i] @ F[t, :]) # L_it component\n", - " \n", - " if is_treated and post:\n", - " y += treatment_effect\n", - " \n", - " y += rng.normal(0, noise_std)\n", - " \n", - " data.append({\n", - " 'unit': i,\n", - " 'period': t,\n", - " 'outcome': y,\n", - " 'treated': int(is_treated and post), # Observation-level (for TROP)\n", - " 'treat': int(is_treated) # Unit-level ever-treated (for SDID)\n", - " })\n", - " \n", - " return pd.DataFrame(data)\n", - "\n", - "\n", - "# Generate data with factor structure (reduced size for faster execution)\n", - "true_att = 2.0\n", - "n_factors = 2\n", - "n_pre = 6 # Reduced from 10\n", - "n_post = 3 # Reduced from 5\n", - "\n", - "df = generate_factor_dgp(\n", - " n_units=30, # Reduced from 50\n", - " n_pre=n_pre,\n", - " n_post=n_post,\n", - " n_treated=6, # Reduced from 10\n", - " n_factors=n_factors,\n", - " treatment_effect=true_att,\n", - " factor_strength=1.5, # Strong factor confounding\n", - " noise_std=0.5,\n", - " seed=42\n", - ")\n", - "\n", - "print(f\"Dataset: {len(df)} observations\")\n", - "print(f\"Treated units: 6\")\n", - "print(f\"Control units: 24\")\n", - "print(f\"Pre-treatment periods: {n_pre}\")\n", - "print(f\"Post-treatment periods: {n_post}\")\n", - "print(f\"True treatment effect: {true_att}\")\n", - "print(f\"True number of factors: {n_factors}\")" - ] + "source": "# Generate factor model data using the library function\nfrom diff_diff import generate_factor_data\n\n# True parameters for verification\ntrue_att = 2.0\nn_factors = 2\nn_pre = 6 # Reduced from 10 for faster execution\nn_post = 3 # Reduced from 5\n\n# Generate panel data with factor structure\n# This creates a scenario where standard DiD/SDID may be biased,\n# but TROP should recover the true treatment effect.\ndf = generate_factor_data(\n n_units=30, # Reduced from 50 for faster 
execution\n n_pre=n_pre,\n n_post=n_post,\n n_treated=6, # Reduced from 10\n n_factors=n_factors,\n treatment_effect=true_att,\n factor_strength=1.5, # Strong factor confounding\n treated_loading_shift=0.5,\n unit_fe_sd=1.0,\n noise_sd=0.5,\n seed=42\n)\n\nprint(f\"Dataset: {len(df)} observations\")\nprint(f\"Treated units: 6\")\nprint(f\"Control units: 24\")\nprint(f\"Pre-treatment periods: {n_pre}\")\nprint(f\"Post-treatment periods: {n_post}\")\nprint(f\"True treatment effect: {true_att}\")\nprint(f\"True number of factors: {n_factors}\")" }, { "cell_type": "code", @@ -522,81 +424,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Monte Carlo comparison (reduced for faster tutorial execution)\n", - "n_sims = 5 # Reduced from 20 for faster validation\n", - "trop_estimates = []\n", - "sdid_estimates = []\n", - "\n", - "print(f\"Running {n_sims} simulations...\")\n", - "\n", - "for sim in range(n_sims):\n", - " # Generate new data (includes both 'treated' and 'treat' columns)\n", - " sim_data = generate_factor_dgp(\n", - " n_units=50,\n", - " n_pre=10,\n", - " n_post=5,\n", - " n_treated=10,\n", - " n_factors=2,\n", - " treatment_effect=2.0,\n", - " factor_strength=1.5,\n", - " noise_std=0.5,\n", - " seed=100 + sim\n", - " )\n", - " \n", - " # TROP (uses observation-level 'treated')\n", - " try:\n", - " trop_m = TROP(\n", - " lambda_time_grid=[1.0],\n", - " lambda_unit_grid=[1.0],\n", - " lambda_nn_grid=[0.1],\n", - " n_bootstrap=10, \n", - " seed=42 + sim\n", - " )\n", - " trop_res = trop_m.fit(\n", - " sim_data,\n", - " outcome='outcome',\n", - " treatment='treated',\n", - " unit='unit',\n", - " time='period',\n", - " post_periods=list(range(10, 15))\n", - " )\n", - " trop_estimates.append(trop_res.att)\n", - " except Exception as e:\n", - " print(f\"TROP failed on sim {sim}: {e}\")\n", - " \n", - " # SDID (uses unit-level 'treat')\n", - " try:\n", - " sdid_m = SyntheticDiD(n_bootstrap=10, seed=42 + sim)\n", - " sdid_res = 
sdid_m.fit(\n", - " sim_data,\n", - " outcome='outcome',\n", - " treatment='treat', # Unit-level ever-treated indicator\n", - " unit='unit',\n", - " time='period',\n", - " post_periods=list(range(10, 15))\n", - " )\n", - " sdid_estimates.append(sdid_res.att)\n", - " except Exception as e:\n", - " print(f\"SDID failed on sim {sim}: {e}\")\n", - "\n", - "print(f\"\\nMonte Carlo Results (True ATT = {true_att})\")\n", - "print(\"=\"*60)\n", - "print(f\"{'Estimator':<15} {'Mean':>12} {'Bias':>12} {'RMSE':>12}\")\n", - "print(\"-\"*60)\n", - "\n", - "if trop_estimates:\n", - " trop_mean = np.mean(trop_estimates)\n", - " trop_bias = trop_mean - true_att\n", - " trop_rmse = np.sqrt(np.mean([(e - true_att)**2 for e in trop_estimates]))\n", - " print(f\"{'TROP':<15} {trop_mean:>12.4f} {trop_bias:>12.4f} {trop_rmse:>12.4f}\")\n", - "\n", - "if sdid_estimates:\n", - " sdid_mean = np.mean(sdid_estimates)\n", - " sdid_bias = sdid_mean - true_att\n", - " sdid_rmse = np.sqrt(np.mean([(e - true_att)**2 for e in sdid_estimates]))\n", - " print(f\"{'SDID':<15} {sdid_mean:>12.4f} {sdid_bias:>12.4f} {sdid_rmse:>12.4f}\")" - ] + "source": "# Monte Carlo comparison (reduced for faster tutorial execution)\nn_sims = 5 # Reduced from 20 for faster validation\ntrop_estimates = []\nsdid_estimates = []\n\nprint(f\"Running {n_sims} simulations...\")\n\nfor sim in range(n_sims):\n # Generate new data using the library function\n # (includes both 'treated' and 'treat' columns)\n sim_data = generate_factor_data(\n n_units=50,\n n_pre=10,\n n_post=5,\n n_treated=10,\n n_factors=2,\n treatment_effect=2.0,\n factor_strength=1.5,\n noise_sd=0.5,\n seed=100 + sim\n )\n \n # TROP (uses observation-level 'treated')\n try:\n trop_m = TROP(\n lambda_time_grid=[1.0],\n lambda_unit_grid=[1.0],\n lambda_nn_grid=[0.1],\n n_bootstrap=10, \n seed=42 + sim\n )\n trop_res = trop_m.fit(\n sim_data,\n outcome='outcome',\n treatment='treated',\n unit='unit',\n time='period',\n post_periods=list(range(10, 15))\n )\n 
trop_estimates.append(trop_res.att)\n except Exception as e:\n print(f\"TROP failed on sim {sim}: {e}\")\n \n # SDID (uses unit-level 'treat')\n try:\n sdid_m = SyntheticDiD(n_bootstrap=10, seed=42 + sim)\n sdid_res = sdid_m.fit(\n sim_data,\n outcome='outcome',\n treatment='treat', # Unit-level ever-treated indicator\n unit='unit',\n time='period',\n post_periods=list(range(10, 15))\n )\n sdid_estimates.append(sdid_res.att)\n except Exception as e:\n print(f\"SDID failed on sim {sim}: {e}\")\n\nprint(f\"\\nMonte Carlo Results (True ATT = {true_att})\")\nprint(\"=\"*60)\nprint(f\"{'Estimator':<15} {'Mean':>12} {'Bias':>12} {'RMSE':>12}\")\nprint(\"-\"*60)\n\nif trop_estimates:\n trop_mean = np.mean(trop_estimates)\n trop_bias = trop_mean - true_att\n trop_rmse = np.sqrt(np.mean([(e - true_att)**2 for e in trop_estimates]))\n print(f\"{'TROP':<15} {trop_mean:>12.4f} {trop_bias:>12.4f} {trop_rmse:>12.4f}\")\n\nif sdid_estimates:\n sdid_mean = np.mean(sdid_estimates)\n sdid_bias = sdid_mean - true_att\n sdid_rmse = np.sqrt(np.mean([(e - true_att)**2 for e in sdid_estimates]))\n print(f\"{'SDID':<15} {sdid_mean:>12.4f} {sdid_bias:>12.4f} {sdid_rmse:>12.4f}\")" }, { "cell_type": "code", @@ -788,4 +616,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/tests/test_prep.py b/tests/test_prep.py index 889f3532..6d5225fa 100644 --- a/tests/test_prep.py +++ b/tests/test_prep.py @@ -792,3 +792,382 @@ def test_single_control_unit(self): assert result["unit"].iloc[0] == single_control # Single control should get score of 1.0 (best possible) assert result["quality_score"].iloc[0] == 1.0 + + +class TestGenerateStaggeredData: + """Tests for generate_staggered_data function.""" + + def test_basic_generation(self): + """Test basic staggered data generation.""" + from diff_diff.prep import generate_staggered_data + + data = generate_staggered_data(n_units=50, n_periods=8, seed=42) + assert len(data) == 400 # 50 units x 8 periods + assert 
set(data.columns) == { + "unit", "period", "outcome", "first_treat", "treated", "treat", "true_effect" + } + + def test_never_treated_fraction(self): + """Test that never_treated_frac is respected.""" + from diff_diff.prep import generate_staggered_data + + data = generate_staggered_data(n_units=100, never_treated_frac=0.3, seed=42) + n_never = (data.groupby("unit")["first_treat"].first() == 0).sum() + assert n_never == 30 + + def test_cohort_periods(self): + """Test custom cohort periods.""" + from diff_diff.prep import generate_staggered_data + + data = generate_staggered_data( + n_units=100, n_periods=10, cohort_periods=[4, 6], seed=42 + ) + cohorts = data.groupby("unit")["first_treat"].first().unique() + assert set(cohorts) == {0, 4, 6} + + def test_treatment_effect_direction(self): + """Test that treatment effect is positive.""" + from diff_diff.prep import generate_staggered_data + + data = generate_staggered_data( + n_units=100, treatment_effect=3.0, noise_sd=0.1, seed=42 + ) + # Treated observations should have positive true_effect + treated_effects = data[data["treated"] == 1]["true_effect"] + assert (treated_effects > 0).all() + + def test_dynamic_effects(self): + """Test dynamic treatment effects.""" + from diff_diff.prep import generate_staggered_data + + data = generate_staggered_data( + n_units=50, n_periods=10, treatment_effect=2.0, + dynamic_effects=True, effect_growth=0.1, seed=42 + ) + # Effects should grow over time since treatment + # Check a treated unit + treated_units = data[data["treat"] == 1]["unit"].unique() + unit_data = data[data["unit"] == treated_units[0]].sort_values("period") + first_treat = unit_data["first_treat"].iloc[0] + effects = unit_data[unit_data["period"] >= first_treat]["true_effect"].values + # Effects should be increasing (with dynamic effects) + assert all(effects[i] <= effects[i + 1] for i in range(len(effects) - 1)) + + def test_reproducibility(self): + """Test seed produces reproducible data.""" + from diff_diff.prep 
import generate_staggered_data + + data1 = generate_staggered_data(seed=123) + data2 = generate_staggered_data(seed=123) + pd.testing.assert_frame_equal(data1, data2) + + def test_invalid_cohort_period(self): + """Test error on invalid cohort period.""" + from diff_diff.prep import generate_staggered_data + + with pytest.raises(ValueError, match="must be between"): + generate_staggered_data(n_periods=10, cohort_periods=[0, 5]) # 0 invalid + + with pytest.raises(ValueError, match="must be between"): + generate_staggered_data(n_periods=10, cohort_periods=[5, 10]) # 10 invalid + + +class TestGenerateFactorData: + """Tests for generate_factor_data function.""" + + def test_basic_generation(self): + """Test basic factor data generation.""" + from diff_diff.prep import generate_factor_data + + data = generate_factor_data(n_units=30, n_pre=8, n_post=4, n_treated=5, seed=42) + assert len(data) == 360 # 30 units x 12 periods + assert set(data.columns) == { + "unit", "period", "outcome", "treated", "treat", "true_effect" + } + + def test_treated_units_count(self): + """Test that n_treated is respected.""" + from diff_diff.prep import generate_factor_data + + data = generate_factor_data(n_units=50, n_treated=10, seed=42) + n_treated = data.groupby("unit")["treat"].first().sum() + assert n_treated == 10 + + def test_treatment_in_post_only(self): + """Test that treatment indicator is 1 only in post-treatment.""" + from diff_diff.prep import generate_factor_data + + data = generate_factor_data(n_pre=10, n_post=5, n_treated=10, seed=42) + # Pre-treatment observations should have treated=0 + pre_data = data[data["period"] < 10] + assert (pre_data["treated"] == 0).all() + + def test_treatment_effect_recovery(self): + """Test that treatment effect can be roughly recovered.""" + from diff_diff.prep import generate_factor_data + + true_effect = 3.0 + data = generate_factor_data( + n_units=100, n_pre=10, n_post=5, n_treated=30, + treatment_effect=true_effect, noise_sd=0.1, 
factor_strength=0.1, + seed=42 + ) + # Simple DiD on treated vs control, post vs pre + treated_post = data[(data["treat"] == 1) & (data["period"] >= 10)]["outcome"].mean() + treated_pre = data[(data["treat"] == 1) & (data["period"] < 10)]["outcome"].mean() + control_post = data[(data["treat"] == 0) & (data["period"] >= 10)]["outcome"].mean() + control_pre = data[(data["treat"] == 0) & (data["period"] < 10)]["outcome"].mean() + did_estimate = (treated_post - treated_pre) - (control_post - control_pre) + # With low noise and factor strength, should be reasonably close + assert abs(did_estimate - true_effect) < 2.0 + + def test_reproducibility(self): + """Test seed produces reproducible data.""" + from diff_diff.prep import generate_factor_data + + data1 = generate_factor_data(seed=123) + data2 = generate_factor_data(seed=123) + pd.testing.assert_frame_equal(data1, data2) + + def test_invalid_n_treated(self): + """Test error on invalid n_treated.""" + from diff_diff.prep import generate_factor_data + + with pytest.raises(ValueError, match="cannot exceed"): + generate_factor_data(n_units=10, n_treated=20) + + with pytest.raises(ValueError, match="at least 1"): + generate_factor_data(n_units=10, n_treated=0) + + +class TestGenerateDddData: + """Tests for generate_ddd_data function.""" + + def test_basic_generation(self): + """Test basic DDD data generation.""" + from diff_diff.prep import generate_ddd_data + + data = generate_ddd_data(n_per_cell=50, seed=42) + assert len(data) == 400 # 50 x 8 cells + expected_cols = {"outcome", "group", "partition", "time", "unit_id", "true_effect"} + assert expected_cols.issubset(set(data.columns)) + + def test_cell_structure(self): + """Test that all 8 cells have correct counts.""" + from diff_diff.prep import generate_ddd_data + + data = generate_ddd_data(n_per_cell=100, seed=42) + cell_counts = data.groupby(["group", "partition", "time"]).size() + assert len(cell_counts) == 8 + assert (cell_counts == 100).all() + + def 
test_treatment_effect_location(self): + """Test that true_effect is only non-zero for G=1, P=1, T=1.""" + from diff_diff.prep import generate_ddd_data + + data = generate_ddd_data(n_per_cell=50, treatment_effect=5.0, seed=42) + # Only G=1, P=1, T=1 should have non-zero true_effect + treated = data[(data["group"] == 1) & (data["partition"] == 1) & (data["time"] == 1)] + not_treated = data[~((data["group"] == 1) & (data["partition"] == 1) & (data["time"] == 1))] + + assert (treated["true_effect"] == 5.0).all() + assert (not_treated["true_effect"] == 0.0).all() + + def test_with_covariates(self): + """Test data generation with covariates.""" + from diff_diff.prep import generate_ddd_data + + data = generate_ddd_data(n_per_cell=50, add_covariates=True, seed=42) + assert "age" in data.columns + assert "education" in data.columns + + def test_without_covariates(self): + """Test data generation without covariates.""" + from diff_diff.prep import generate_ddd_data + + data = generate_ddd_data(n_per_cell=50, add_covariates=False, seed=42) + assert "age" not in data.columns + assert "education" not in data.columns + + def test_treatment_effect_recovery(self): + """Test that treatment effect can be recovered with DDD.""" + from diff_diff.prep import generate_ddd_data + + true_effect = 3.0 + data = generate_ddd_data(n_per_cell=200, treatment_effect=true_effect, noise_sd=0.5, seed=42) + + # Manual DDD calculation + y_111 = data[(data["group"] == 1) & (data["partition"] == 1) & (data["time"] == 1)]["outcome"].mean() + y_110 = data[(data["group"] == 1) & (data["partition"] == 1) & (data["time"] == 0)]["outcome"].mean() + y_101 = data[(data["group"] == 1) & (data["partition"] == 0) & (data["time"] == 1)]["outcome"].mean() + y_100 = data[(data["group"] == 1) & (data["partition"] == 0) & (data["time"] == 0)]["outcome"].mean() + y_011 = data[(data["group"] == 0) & (data["partition"] == 1) & (data["time"] == 1)]["outcome"].mean() + y_010 = data[(data["group"] == 0) & 
(data["partition"] == 1) & (data["time"] == 0)]["outcome"].mean() + y_001 = data[(data["group"] == 0) & (data["partition"] == 0) & (data["time"] == 1)]["outcome"].mean() + y_000 = data[(data["group"] == 0) & (data["partition"] == 0) & (data["time"] == 0)]["outcome"].mean() + + manual_ddd = (y_111 - y_110) - (y_101 - y_100) - (y_011 - y_010) + (y_001 - y_000) + assert abs(manual_ddd - true_effect) < 0.5 + + def test_reproducibility(self): + """Test seed produces reproducible data.""" + from diff_diff.prep import generate_ddd_data + + data1 = generate_ddd_data(seed=123) + data2 = generate_ddd_data(seed=123) + pd.testing.assert_frame_equal(data1, data2) + + +class TestGeneratePanelData: + """Tests for generate_panel_data function.""" + + def test_basic_generation(self): + """Test basic panel data generation.""" + from diff_diff.prep import generate_panel_data + + data = generate_panel_data(n_units=50, n_periods=6, seed=42) + assert len(data) == 300 # 50 units x 6 periods + assert set(data.columns) == { + "unit", "period", "treated", "post", "outcome", "true_effect" + } + + def test_treatment_fraction(self): + """Test that treatment_fraction is respected.""" + from diff_diff.prep import generate_panel_data + + data = generate_panel_data(n_units=100, treatment_fraction=0.4, seed=42) + n_treated_units = data.groupby("unit")["treated"].first().sum() + assert n_treated_units == 40 + + def test_treatment_period(self): + """Test that treatment_period is respected.""" + from diff_diff.prep import generate_panel_data + + data = generate_panel_data(n_periods=10, treatment_period=5, seed=42) + # Post should be 1 for periods >= 5 + assert (data[data["period"] < 5]["post"] == 0).all() + assert (data[data["period"] >= 5]["post"] == 1).all() + + def test_parallel_trends(self): + """Test data generation with parallel trends.""" + from diff_diff.prep import generate_panel_data + + data = generate_panel_data( + n_units=200, n_periods=8, parallel_trends=True, noise_sd=0.1, seed=42 + ) + 
# Calculate pre-treatment trends + pre_data = data[data["post"] == 0] + treated_trend = pre_data[pre_data["treated"] == 1].groupby("period")["outcome"].mean() + control_trend = pre_data[pre_data["treated"] == 0].groupby("period")["outcome"].mean() + + # Calculate slopes + treated_slope = np.polyfit(treated_trend.index, treated_trend.values, 1)[0] + control_slope = np.polyfit(control_trend.index, control_trend.values, 1)[0] + + # Slopes should be similar (parallel trends) + assert abs(treated_slope - control_slope) < 0.5 + + def test_non_parallel_trends(self): + """Test data generation with trend violation.""" + from diff_diff.prep import generate_panel_data + + data = generate_panel_data( + n_units=200, n_periods=8, parallel_trends=False, + trend_violation=1.0, noise_sd=0.1, seed=42 + ) + # Calculate pre-treatment trends + pre_data = data[data["post"] == 0] + treated_trend = pre_data[pre_data["treated"] == 1].groupby("period")["outcome"].mean() + control_trend = pre_data[pre_data["treated"] == 0].groupby("period")["outcome"].mean() + + # Calculate slopes + treated_slope = np.polyfit(treated_trend.index, treated_trend.values, 1)[0] + control_slope = np.polyfit(control_trend.index, control_trend.values, 1)[0] + + # Treated slope should be steeper (trend violation) + assert treated_slope > control_slope + 0.5 + + def test_reproducibility(self): + """Test seed produces reproducible data.""" + from diff_diff.prep import generate_panel_data + + data1 = generate_panel_data(seed=123) + data2 = generate_panel_data(seed=123) + pd.testing.assert_frame_equal(data1, data2) + + def test_invalid_treatment_period(self): + """Test error on invalid treatment_period.""" + from diff_diff.prep import generate_panel_data + + with pytest.raises(ValueError, match="at least 1"): + generate_panel_data(n_periods=10, treatment_period=0) + + with pytest.raises(ValueError, match="less than n_periods"): + generate_panel_data(n_periods=10, treatment_period=10) + + +class 
TestGenerateEventStudyData: + """Tests for generate_event_study_data function.""" + + def test_basic_generation(self): + """Test basic event study data generation.""" + from diff_diff.prep import generate_event_study_data + + data = generate_event_study_data(n_units=100, n_pre=5, n_post=5, seed=42) + assert len(data) == 1000 # 100 units x 10 periods + assert set(data.columns) == { + "unit", "period", "treated", "post", "outcome", "event_time", "true_effect" + } + + def test_event_time(self): + """Test that event_time is correctly calculated.""" + from diff_diff.prep import generate_event_study_data + + data = generate_event_study_data(n_pre=5, n_post=5, seed=42) + # Event time should range from -5 to 4 + assert data["event_time"].min() == -5 + assert data["event_time"].max() == 4 + + def test_treatment_at_correct_period(self): + """Test that treatment starts at period n_pre.""" + from diff_diff.prep import generate_event_study_data + + data = generate_event_study_data(n_pre=4, n_post=3, seed=42) + # Post should be 1 for periods >= 4 + assert (data[data["period"] < 4]["post"] == 0).all() + assert (data[data["period"] >= 4]["post"] == 1).all() + + def test_treatment_effect_recovery(self): + """Test that treatment effect can be recovered.""" + from diff_diff.prep import generate_event_study_data + + true_effect = 4.0 + data = generate_event_study_data( + n_units=500, n_pre=5, n_post=5, treatment_effect=true_effect, + noise_sd=0.5, seed=42 + ) + + # Simple DiD + treated_post = data[(data["treated"] == 1) & (data["post"] == 1)]["outcome"].mean() + treated_pre = data[(data["treated"] == 1) & (data["post"] == 0)]["outcome"].mean() + control_post = data[(data["treated"] == 0) & (data["post"] == 1)]["outcome"].mean() + control_pre = data[(data["treated"] == 0) & (data["post"] == 0)]["outcome"].mean() + did_estimate = (treated_post - treated_pre) - (control_post - control_pre) + + assert abs(did_estimate - true_effect) < 1.0 + + def test_reproducibility(self): + """Test seed 
produces reproducible data.""" + from diff_diff.prep import generate_event_study_data + + data1 = generate_event_study_data(seed=123) + data2 = generate_event_study_data(seed=123) + pd.testing.assert_frame_equal(data1, data2) + + def test_treatment_fraction(self): + """Test that treatment_fraction is respected.""" + from diff_diff.prep import generate_event_study_data + + data = generate_event_study_data(n_units=100, treatment_fraction=0.4, seed=42) + n_treated_units = data.groupby("unit")["treated"].first().sum() + assert n_treated_units == 40 diff --git a/tests/test_staggered.py b/tests/test_staggered.py index 06caf79e..f78f756a 100644 --- a/tests/test_staggered.py +++ b/tests/test_staggered.py @@ -7,6 +7,7 @@ import pytest from diff_diff import CallawaySantAnna, CallawaySantAnnaResults +from diff_diff.prep import generate_staggered_data as _generate_staggered_data def generate_staggered_data( @@ -17,58 +18,32 @@ def generate_staggered_data( never_treated_frac: float = 0.3, seed: int = 42, ) -> pd.DataFrame: - """Generate synthetic staggered adoption data.""" - np.random.seed(seed) - - # Generate unit and time identifiers - units = np.repeat(np.arange(n_units), n_periods) - times = np.tile(np.arange(n_periods), n_units) - - # Assign treatment cohorts - # Some units never treated, others treated in different periods - n_never = int(n_units * never_treated_frac) - n_treated = n_units - n_never - - # Treatment periods start from period 3 onwards - cohort_periods = np.linspace(3, n_periods - 2, n_cohorts).astype(int) - - first_treat = np.zeros(n_units) - if n_treated > 0: - cohort_assignments = np.random.choice(len(cohort_periods), size=n_treated) - first_treat[n_never:] = cohort_periods[cohort_assignments] - - first_treat_expanded = np.repeat(first_treat, n_periods) - - # Generate outcomes - # Y = unit_fe + time_fe + treatment_effect * post + noise - unit_fe = np.random.randn(n_units) * 2 - time_fe = np.linspace(0, 1, n_periods) - - unit_fe_expanded = 
np.repeat(unit_fe, n_periods) - time_fe_expanded = np.tile(time_fe, n_units) - - # Treatment indicator - post = (times >= first_treat_expanded) & (first_treat_expanded > 0) - - # Dynamic treatment effects (effect grows over time) - relative_time = times - first_treat_expanded - dynamic_effect = treatment_effect * (1 + 0.1 * np.maximum(relative_time, 0)) + """ + Generate synthetic staggered adoption data for tests. - outcomes = ( - unit_fe_expanded + - time_fe_expanded + - dynamic_effect * post + - np.random.randn(len(units)) * 0.5 + Wrapper around the library function to maintain backward compatibility + with test signatures (uses 'time' column instead of 'period'). + """ + # Compute cohort periods based on n_cohorts + cohort_periods = np.linspace(3, n_periods - 2, n_cohorts).astype(int).tolist() + + data = _generate_staggered_data( + n_units=n_units, + n_periods=n_periods, + cohort_periods=cohort_periods, + never_treated_frac=never_treated_frac, + treatment_effect=treatment_effect, + dynamic_effects=True, + effect_growth=0.1, + unit_fe_sd=2.0, + noise_sd=0.5, + seed=seed, ) - df = pd.DataFrame({ - 'unit': units, - 'time': times, - 'outcome': outcomes, - 'first_treat': first_treat_expanded.astype(int), - }) + # Rename 'period' to 'time' for backward compatibility with existing tests + data = data.rename(columns={"period": "time"}) - return df + return data class TestCallawaySantAnna: diff --git a/tests/test_triple_diff.py b/tests/test_triple_diff.py index f2670d24..4bbde03a 100644 --- a/tests/test_triple_diff.py +++ b/tests/test_triple_diff.py @@ -19,6 +19,9 @@ triple_difference, ) +# Note: The library exports generate_ddd_data in diff_diff.prep, but tests use +# a local implementation with test-specific parameter names and covariate handling. + # ============================================================================= # Fixtures for test data generation @@ -36,25 +39,8 @@ def generate_ddd_data( """ Generate synthetic DDD data with known treatment effect. 
- Parameters - ---------- - n_per_cell : int - Number of observations per cell (8 cells total). - true_att : float - True average treatment effect on the treated. - noise_sd : float - Standard deviation of outcome noise. - seed : int - Random seed for reproducibility. - add_covariates : bool - Whether to add covariates that affect the outcome. - covariate_effect : float - Effect size of covariates on outcome. - - Returns - ------- - pd.DataFrame - Synthetic DDD data with known ATT. + This is a test-specific implementation that maintains backward compatibility + with existing tests. For general use, prefer diff_diff.prep.generate_ddd_data. """ rng = np.random.default_rng(seed) diff --git a/tests/test_trop.py b/tests/test_trop.py index f1b925fe..644b5164 100644 --- a/tests/test_trop.py +++ b/tests/test_trop.py @@ -6,6 +6,7 @@ from diff_diff import SyntheticDiD from diff_diff.trop import TROP, TROPResults, trop +from diff_diff.prep import generate_factor_data def generate_factor_dgp( @@ -22,54 +23,24 @@ def generate_factor_dgp( """ Generate panel data with known factor structure. - DGP: Y_it = mu + gamma_i + delta_t + Lambda_i'F_t + tau*D_it + eps_it + Wrapper around the library function for backward compatibility with tests. 
""" - rng = np.random.default_rng(seed) - - n_control = n_units - n_treated - n_periods = n_pre + n_post - - # Generate factors F: (n_periods, n_factors) - F = rng.normal(0, 1, (n_periods, n_factors)) - - # Generate loadings Lambda: (n_factors, n_units) - Lambda = rng.normal(0, 1, (n_factors, n_units)) - Lambda[:, :n_treated] += 0.5 - - # Unit fixed effects - gamma = rng.normal(0, 1, n_units) - gamma[:n_treated] += 1.0 - - # Time fixed effects - delta = np.linspace(0, 2, n_periods) - - # Generate outcomes - data = [] - for i in range(n_units): - is_treated = i < n_treated - - for t in range(n_periods): - period = t - post = t >= n_pre - - y = 10.0 + gamma[i] + delta[t] - y += factor_strength * (Lambda[:, i] @ F[t, :]) - - # Treatment effect only for treated units in post period - treatment_indicator = 1 if (is_treated and post) else 0 - if treatment_indicator: - y += treatment_effect - - y += rng.normal(0, noise_std) - - data.append({ - "unit": i, - "period": period, - "outcome": y, - "treated": treatment_indicator, - }) + data = generate_factor_data( + n_units=n_units, + n_pre=n_pre, + n_post=n_post, + n_treated=n_treated, + n_factors=n_factors, + treatment_effect=treatment_effect, + factor_strength=factor_strength, + treated_loading_shift=0.5, + unit_fe_sd=1.0, + noise_sd=noise_std, + seed=seed, + ) - return pd.DataFrame(data) + # Return only the columns the tests expect + return data[["unit", "period", "outcome", "treated"]] @pytest.fixture From 5e527ab2fbb0a47e49ac79d20723b72118c9e115 Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 19 Jan 2026 09:40:35 -0500 Subject: [PATCH 2/2] Address PR review feedback - Add new DGP functions to CLAUDE.md Module Structure section - Restore trailing newlines to modified notebook files - Add RuntimeWarnings investigation items to TODO.md Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 7 ++++++- TODO.md | 15 +++++++++++++++ docs/tutorials/02_staggered_did.ipynb | 2 +- docs/tutorials/04_parallel_trends.ipynb | 2 +- 
docs/tutorials/07_pretrends_power.ipynb | 2 +- docs/tutorials/08_triple_diff.ipynb | 2 +- docs/tutorials/10_trop.ipynb | 2 +- 7 files changed, 26 insertions(+), 6 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 2e58c957..7bfbfac2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -195,7 +195,12 @@ pytest tests/test_rust_backend.py -v - Integrates with HonestDiD for comprehensive sensitivity analysis - **`diff_diff/prep.py`** - Data preparation utilities: - - `generate_did_data` - Create synthetic data with known treatment effect + - `generate_did_data` - Create synthetic data with known treatment effect (basic 2x2 DiD) + - `generate_staggered_data` - Staggered adoption data for CallawaySantAnna/SunAbraham + - `generate_factor_data` - Factor model data for TROP/SyntheticDiD + - `generate_ddd_data` - Triple Difference (DDD) design data + - `generate_panel_data` - Panel data with optional parallel trends violations + - `generate_event_study_data` - Event study data with simultaneous treatment - `make_treatment_indicator`, `make_post_indicator` - Create binary indicators - `wide_to_long`, `balance_panel` - Panel data reshaping - `validate_did_data`, `summarize_did_data` - Data validation and summary diff --git a/TODO.md b/TODO.md index cee5c945..3e479276 100644 --- a/TODO.md +++ b/TODO.md @@ -92,6 +92,21 @@ Enhancements for `honest_did.py`: --- +## RuntimeWarnings in Linear Algebra Operations + +Pre-existing RuntimeWarnings in matrix operations that should be investigated: + +- [ ] `linalg.py:162` - "divide by zero", "overflow", "invalid value" in fitted value computation + - Occurs during `X @ coefficients` when coefficients contain extreme values + - Seen in test_prep.py during treatment effect recovery tests +- [ ] `triple_diff.py:307,323` - Similar warnings in propensity score computation + - Occurs in IPW and DR estimation methods with covariates + - Related to logistic regression overflow in edge cases + +**Note**: These warnings do not affect correctness of results 
but should be handled gracefully (e.g., with `np.errstate` context managers or input validation). + +--- + ## Rust Backend Optimizations Deferred from PR #58 code review (completed in v2.0.3): diff --git a/docs/tutorials/02_staggered_did.ipynb b/docs/tutorials/02_staggered_did.ipynb index cbde2ab2..ff75190d 100644 --- a/docs/tutorials/02_staggered_did.ipynb +++ b/docs/tutorials/02_staggered_did.ipynb @@ -912,4 +912,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/docs/tutorials/04_parallel_trends.ipynb b/docs/tutorials/04_parallel_trends.ipynb index ef57e8ec..149e08a3 100644 --- a/docs/tutorials/04_parallel_trends.ipynb +++ b/docs/tutorials/04_parallel_trends.ipynb @@ -798,4 +798,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/docs/tutorials/07_pretrends_power.ipynb b/docs/tutorials/07_pretrends_power.ipynb index cf96f251..6802dbc8 100644 --- a/docs/tutorials/07_pretrends_power.ipynb +++ b/docs/tutorials/07_pretrends_power.ipynb @@ -768,4 +768,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/docs/tutorials/08_triple_diff.ipynb b/docs/tutorials/08_triple_diff.ipynb index 10e447aa..b798922e 100644 --- a/docs/tutorials/08_triple_diff.ipynb +++ b/docs/tutorials/08_triple_diff.ipynb @@ -470,4 +470,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/docs/tutorials/10_trop.ipynb b/docs/tutorials/10_trop.ipynb index 6e9c4149..dd357a86 100644 --- a/docs/tutorials/10_trop.ipynb +++ b/docs/tutorials/10_trop.ipynb @@ -616,4 +616,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}