From 5513acfd8dceb917c765fe13d7f51167a46ae615 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 24 Jan 2026 18:42:03 -0500
Subject: [PATCH 1/4] Fix plot_event_study reference_period normalization

The reference_period parameter was documented as normalizing effects
to 0 but only applied visual styling. This commit adds actual
normalization logic that subtracts the reference effect from all
effects when reference_period is specified.

Changes:
- Add normalization logic in plot_event_study() after critical_value
  calculation: subtracts reference effect from all effects
- Only normalizes if reference_period exists in data and its effect is
  finite
- Add 3 tests verifying normalization behavior, no-normalization case,
  and NaN reference handling
- Fix Python 3.9 compatibility issue in tutorial notebook (backslash
  in f-string expression)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 diff_diff/visualization.py            |   6 ++
 docs/tutorials/02_staggered_did.ipynb | 141 +++++++++++++++++++++++++-
 tests/test_visualization.py           | 102 +++++++++++++++++++
 3 files changed, 244 insertions(+), 5 deletions(-)

diff --git a/diff_diff/visualization.py b/diff_diff/visualization.py
index 6584b51a..b9e3f364 100644
--- a/diff_diff/visualization.py
+++ b/diff_diff/visualization.py
@@ -192,6 +192,12 @@ def plot_event_study(
     # Compute confidence intervals
     critical_value = scipy_stats.norm.ppf(1 - alpha / 2)
 
+    # Normalize effects to reference period if specified
+    if reference_period is not None and reference_period in effects:
+        ref_effect = effects[reference_period]
+        if np.isfinite(ref_effect):
+            effects = {p: e - ref_effect for p, e in effects.items()}
+
     plot_data = []
     for period in periods:
         effect = effects.get(period, np.nan)
diff --git a/docs/tutorials/02_staggered_did.ipynb b/docs/tutorials/02_staggered_did.ipynb
index 62d913d4..4657608c 100644
--- a/docs/tutorials/02_staggered_did.ipynb
+++ b/docs/tutorials/02_staggered_did.ipynb
@@ -3,7 +3,31 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "# Staggered Difference-in-Differences\n\nThis notebook demonstrates how to handle **staggered treatment adoption** using modern DiD estimators. In staggered DiD settings:\n\n- Different units get treated at different times\n- Traditional TWFE can give biased estimates due to \"forbidden comparisons\"\n- Modern estimators compute group-time specific effects and aggregate them properly\n\nWe'll cover:\n1. Understanding staggered adoption\n2. The problem with TWFE (and Goodman-Bacon decomposition)\n3. The Callaway-Sant'Anna estimator\n4. Group-time effects ATT(g,t)\n5. Aggregating effects (simple, group, event-study)\n6. Bootstrap inference for valid standard errors\n7. Visualization\n8. Pre-treatment effects and parallel trends testing\n9. Different control group options\n10. Handling anticipation effects\n11. Adding covariates\n12. Comparing with MultiPeriodDiD\n13. Sun-Abraham interaction-weighted estimator\n14. Comparing CS and SA as a robustness check"
+   "source": [
+    "# Staggered Difference-in-Differences\n",
+    "\n",
+    "This notebook demonstrates how to handle **staggered treatment adoption** using modern DiD estimators. In staggered DiD settings:\n",
+    "\n",
+    "- Different units get treated at different times\n",
+    "- Traditional TWFE can give biased estimates due to \"forbidden comparisons\"\n",
+    "- Modern estimators compute group-time specific effects and aggregate them properly\n",
+    "\n",
+    "We'll cover:\n",
+    "1. Understanding staggered adoption\n",
+    "2. The problem with TWFE (and Goodman-Bacon decomposition)\n",
+    "3. The Callaway-Sant'Anna estimator\n",
+    "4. Group-time effects ATT(g,t)\n",
+    "5. Aggregating effects (simple, group, event-study)\n",
+    "6. Bootstrap inference for valid standard errors\n",
+    "7. Visualization\n",
+    "8. Pre-treatment effects and parallel trends testing\n",
+    "9. Different control group options\n",
+    "10. Handling anticipation effects\n",
+    "11. Adding covariates\n",
+    "12. Comparing with MultiPeriodDiD\n",
+    "13. Sun-Abraham interaction-weighted estimator\n",
+    "14. Comparing CS and SA as a robustness check"
+   ]
   },
   {
    "cell_type": "code",
@@ -810,19 +834,126 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "## 14. Comparing CS and SA as a Robustness Check\n\nRunning both estimators provides a useful robustness check. When they agree, results are more credible.\n\n### Understanding Pre-Period Differences\n\nYou may notice that **post-treatment effects align closely** between CS and SA, but **pre-treatment effects can differ in magnitude and significance**. This is expected methodological behavior, not a bug.\n\n**Why the difference?**\n\n1. **Callaway-Sant'Anna with `base_period=\"varying\"` (default)**:\n   - Pre-treatment effects use **consecutive period comparisons** (period t vs period t-1)\n   - Each pre-period coefficient represents a one-period change\n   - These smaller incremental changes often yield lower t-statistics\n\n2. **Sun-Abraham**:\n   - Uses a **fixed reference period** (e=-1 when anticipation=0, or e=-1-anticipation otherwise)\n   - All coefficients are deviations from this single reference\n   - Pre-period coefficients show cumulative difference from the reference\n\n**To make CS pre-periods more comparable to SA**, use `base_period=\"universal\"`:\n\n```python\ncs_universal = CallawaySantAnna(base_period=\"universal\")\n```\n\nThis makes CS compare all periods to g-1 (like SA), producing more similar pre-treatment estimates."
+   "source": [
+    "## 14. Comparing CS and SA as a Robustness Check\n",
+    "\n",
+    "Running both estimators provides a useful robustness check. When they agree, results are more credible.\n",
+    "\n",
+    "### Understanding Pre-Period Differences\n",
+    "\n",
+    "You may notice that **post-treatment effects align closely** between CS and SA, but **pre-treatment effects can differ in magnitude and significance**. This is expected methodological behavior, not a bug.\n",
+    "\n",
+    "**Why the difference?**\n",
+    "\n",
+    "1. **Callaway-Sant'Anna with `base_period=\"varying\"` (default)**:\n",
+    "   - Pre-treatment effects use **consecutive period comparisons** (period t vs period t-1)\n",
+    "   - Each pre-period coefficient represents a one-period change\n",
+    "   - These smaller incremental changes often yield lower t-statistics\n",
+    "\n",
+    "2. **Sun-Abraham**:\n",
+    "   - Uses a **fixed reference period** (e=-1 when anticipation=0, or e=-1-anticipation otherwise)\n",
+    "   - All coefficients are deviations from this single reference\n",
+    "   - Pre-period coefficients show cumulative difference from the reference\n",
+    "\n",
+    "**To make CS pre-periods more comparable to SA**, use `base_period=\"universal\"`:\n",
+    "\n",
+    "```python\n",
+    "cs_universal = CallawaySantAnna(base_period=\"universal\")\n",
+    "```\n",
+    "\n",
+    "This makes CS compare all periods to g-1 (like SA), producing more similar pre-treatment estimates."
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "# Compare overall ATT from both estimators\nprint(\"Robustness Check: CS vs SA\")\nprint(\"=\" * 60)\nprint(f\"{'Estimator':<30} {'Overall ATT':>12} {'SE':>10}\")\nprint(\"-\" * 60)\nprint(f\"{'Callaway-Sant\\\\'Anna (varying)':<30} {results_cs.overall_att:>12.4f} {results_cs.overall_se:>10.4f}\")\nprint(f\"{'Sun-Abraham':<30} {results_sa.overall_att:>12.4f} {results_sa.overall_se:>10.4f}\")\n\n# Also fit CS with universal base period for comparison\ncs_universal = CallawaySantAnna(control_group=\"never_treated\", base_period=\"universal\")\nresults_cs_univ = cs_universal.fit(\n    df, outcome=\"outcome\", unit=\"unit\",\n    time=\"period\", first_treat=\"first_treat\",\n    aggregate=\"event_study\"\n)\n\n# Compare event study effects\nprint(\"\\n\\nEvent Study Comparison:\")\nprint(\"Note: Pre-periods differ due to base period methodology (see explanation above)\")\nprint(f\"{'Rel. Time':>10} {'CS (vary)':>12} {'CS (univ)':>12} {'SA':>10} {'Note':>20}\")\nprint(\"-\" * 70)\n\nfor rel_time in sorted(results_sa.event_study_effects.keys()):\n    sa_eff = results_sa.event_study_effects[rel_time]['effect']\n    cs_vary = results_cs.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n    cs_univ = results_cs_univ.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n    \n    note = \"pre (differs)\" if rel_time < 0 else \"post (matches)\"\n    print(f\"{rel_time:>10} {cs_vary:>12.4f} {cs_univ:>12.4f} {sa_eff:>10.4f} {note:>20}\")\n\nprint(\"\\nPost-treatment effects should be similar across all methods\")\nprint(\"Pre-treatment differences are expected due to base period methodology\")"
+   "source": [
+    "# Compare overall ATT from both estimators\n",
+    "cs_label = \"Callaway-Sant'Anna (varying)\"\n",
+    "print(\"Robustness Check: CS vs SA\")\n",
+    "print(\"=\" * 60)\n",
+    "print(f\"{'Estimator':<30} {'Overall ATT':>12} {'SE':>10}\")\n",
+    "print(\"-\" * 60)\n",
+    "print(f\"{cs_label:<30} {results_cs.overall_att:>12.4f} {results_cs.overall_se:>10.4f}\")\n",
+    "print(f\"{'Sun-Abraham':<30} {results_sa.overall_att:>12.4f} {results_sa.overall_se:>10.4f}\")\n",
+    "\n",
+    "# Also fit CS with universal base period for comparison\n",
+    "cs_universal = CallawaySantAnna(control_group=\"never_treated\", base_period=\"universal\")\n",
+    "results_cs_univ = cs_universal.fit(\n",
+    "    df, outcome=\"outcome\", unit=\"unit\",\n",
+    "    time=\"period\", first_treat=\"first_treat\",\n",
+    "    aggregate=\"event_study\"\n",
+    ")\n",
+    "\n",
+    "# Compare event study effects\n",
+    "print(\"\\n\\nEvent Study Comparison:\")\n",
+    "print(\"Note: Pre-periods differ due to base period methodology (see explanation above)\")\n",
+    "print(f\"{'Rel. Time':>10} {'CS (vary)':>12} {'CS (univ)':>12} {'SA':>10} {'Note':>20}\")\n",
+    "print(\"-\" * 70)\n",
+    "\n",
+    "for rel_time in sorted(results_sa.event_study_effects.keys()):\n",
+    "    sa_eff = results_sa.event_study_effects[rel_time]['effect']\n",
+    "    cs_vary = results_cs.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n",
+    "    cs_univ = results_cs_univ.event_study_effects.get(rel_time, {}).get('effect', np.nan)\n",
+    "    \n",
+    "    note = \"pre (differs)\" if rel_time < 0 else \"post (matches)\"\n",
+    "    print(f\"{rel_time:>10} {cs_vary:>12.4f} {cs_univ:>12.4f} {sa_eff:>10.4f} {note:>20}\")\n",
+    "\n",
+    "print(\"\\nPost-treatment effects should be similar across all methods\")\n",
+    "print(\"Pre-treatment differences are expected due to base period methodology\")"
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "## Summary\n\nKey takeaways:\n\n1. **TWFE can be biased** with staggered adoption and heterogeneous effects\n2. **Goodman-Bacon decomposition** reveals *why* TWFE fails by showing:\n   - The implicit 2x2 comparisons and their weights\n   - How much weight falls on \"forbidden comparisons\" (already-treated as controls)\n3. **Callaway-Sant'Anna** properly handles staggered adoption by:\n   - Computing group-time specific effects ATT(g,t)\n   - Only using valid comparison groups\n   - Properly aggregating effects\n4. **Sun-Abraham** provides an alternative approach using:\n   - Interaction-weighted regression with cohort x relative-time indicators\n   - Different weighting scheme than CS\n   - More efficient under homogeneous effects\n5. **Run both CS and SA** as a robustness check—when they agree, results are more credible\n6. **Aggregation options**:\n   - `\"simple\"`: Overall ATT\n   - `\"group\"`: ATT by cohort\n   - `\"event\"`: ATT by event time (for event-study plots)\n7. **Bootstrap inference** provides valid standard errors and confidence intervals:\n   - Use `n_bootstrap` parameter to enable multiplier bootstrap\n   - Choose weight type: `'rademacher'`, `'mammen'`, or `'webb'`\n   - Bootstrap results include SEs, CIs, and p-values for all aggregations\n8. **Pre-treatment effects** provide parallel trends diagnostics:\n   - Use `base_period=\"varying\"` for consecutive period comparisons\n   - Pre-treatment ATT(g,t) should be near zero\n   - 95% CIs including zero is consistent with parallel trends\n   - See Tutorial 07 for pre-trends power analysis (Roth 2022)\n9. **Control group choices** affect efficiency and assumptions:\n   - `\"never_treated\"`: Stronger parallel trends assumption\n   - `\"not_yet_treated\"`: Weaker assumption, uses more data\n10. 
**CS vs SA pre-period differences are expected**:\n    - Post-treatment effects should be similar (robustness check)\n    - Pre-treatment effects differ due to base period methodology\n    - CS (varying): consecutive comparisons → one-period changes\n    - SA: fixed reference (e=-1-anticipation) → cumulative deviations\n    - Use `base_period=\"universal\"` in CS for comparable pre-periods\n\nFor more details, see:\n- Callaway, B., & Sant'Anna, P. H. (2021). Difference-in-differences with multiple time periods. *Journal of Econometrics*.\n- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies with heterogeneous treatment effects. *Journal of Econometrics*.\n- Goodman-Bacon, A. (2021). Difference-in-differences with variation in treatment timing. *Journal of Econometrics*."
+   "source": [
+    "## Summary\n",
+    "\n",
+    "Key takeaways:\n",
+    "\n",
+    "1. **TWFE can be biased** with staggered adoption and heterogeneous effects\n",
+    "2. **Goodman-Bacon decomposition** reveals *why* TWFE fails by showing:\n",
+    "   - The implicit 2x2 comparisons and their weights\n",
+    "   - How much weight falls on \"forbidden comparisons\" (already-treated as controls)\n",
+    "3. **Callaway-Sant'Anna** properly handles staggered adoption by:\n",
+    "   - Computing group-time specific effects ATT(g,t)\n",
+    "   - Only using valid comparison groups\n",
+    "   - Properly aggregating effects\n",
+    "4. **Sun-Abraham** provides an alternative approach using:\n",
+    "   - Interaction-weighted regression with cohort x relative-time indicators\n",
+    "   - Different weighting scheme than CS\n",
+    "   - More efficient under homogeneous effects\n",
+    "5. **Run both CS and SA** as a robustness check—when they agree, results are more credible\n",
+    "6. **Aggregation options**:\n",
+    "   - `\"simple\"`: Overall ATT\n",
+    "   - `\"group\"`: ATT by cohort\n",
+    "   - `\"event\"`: ATT by event time (for event-study plots)\n",
+    "7. **Bootstrap inference** provides valid standard errors and confidence intervals:\n",
+    "   - Use `n_bootstrap` parameter to enable multiplier bootstrap\n",
+    "   - Choose weight type: `'rademacher'`, `'mammen'`, or `'webb'`\n",
+    "   - Bootstrap results include SEs, CIs, and p-values for all aggregations\n",
+    "8. **Pre-treatment effects** provide parallel trends diagnostics:\n",
+    "   - Use `base_period=\"varying\"` for consecutive period comparisons\n",
+    "   - Pre-treatment ATT(g,t) should be near zero\n",
+    "   - 95% CIs including zero is consistent with parallel trends\n",
+    "   - See Tutorial 07 for pre-trends power analysis (Roth 2022)\n",
+    "9. **Control group choices** affect efficiency and assumptions:\n",
+    "   - `\"never_treated\"`: Stronger parallel trends assumption\n",
+    "   - `\"not_yet_treated\"`: Weaker assumption, uses more data\n",
+    "10. **CS vs SA pre-period differences are expected**:\n",
+    "    - Post-treatment effects should be similar (robustness check)\n",
+    "    - Pre-treatment effects differ due to base period methodology\n",
+    "    - CS (varying): consecutive comparisons → one-period changes\n",
+    "    - SA: fixed reference (e=-1-anticipation) → cumulative deviations\n",
+    "    - Use `base_period=\"universal\"` in CS for comparable pre-periods\n",
+    "\n",
+    "For more details, see:\n",
+    "- Callaway, B., & Sant'Anna, P. H. (2021). Difference-in-differences with multiple time periods. *Journal of Econometrics*.\n",
+    "- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies with heterogeneous treatment effects. *Journal of Econometrics*.\n",
+    "- Goodman-Bacon, A. (2021). Difference-in-differences with variation in treatment timing. *Journal of Econometrics*."
+   ]
   }
  ],
  "metadata": {
@@ -832,4 +963,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/tests/test_visualization.py b/tests/test_visualization.py
index 4f78254b..44b47bed 100644
--- a/tests/test_visualization.py
+++ b/tests/test_visualization.py
@@ -318,6 +318,108 @@ def test_plot_cs_with_anticipation(self):
 
         plt.close()
 
+    def test_plot_event_study_reference_period_normalization(self):
+        """Test that reference_period actually normalizes effects to 0.
+
+        When reference_period is specified, the effect at that period should
+        be subtracted from all effects, so the reference period becomes exactly 0.
+        """
+        pytest.importorskip("matplotlib")
+        import matplotlib.pyplot as plt
+
+        # Create data where reference period (period=0) has effect=0.3
+        df = pd.DataFrame({
+            'period': [-2, -1, 0, 1, 2],
+            'effect': [0.1, 0.2, 0.3, 0.5, 0.6],  # ref at 0 has effect 0.3
+            'se': [0.1, 0.1, 0.1, 0.1, 0.1]
+        })
+
+        ax = plot_event_study(df, reference_period=0, show=False)
+
+        # Find plotted y-values by extracting data from Line2D objects
+        # The point estimates are plotted as individual markers
+        y_values = []
+        for child in ax.get_children():
+            # Line2D objects with single points are our markers
+            if hasattr(child, 'get_ydata'):
+                ydata = child.get_ydata()
+                if len(ydata) == 1:
+                    y_values.append(float(ydata[0]))
+
+        # After normalization:
+        # - Original effects: [0.1, 0.2, 0.3, 0.5, 0.6]
+        # - Reference effect: 0.3
+        # - Normalized: [-0.2, -0.1, 0.0, 0.2, 0.3]
+        expected_normalized = [-0.2, -0.1, 0.0, 0.2, 0.3]
+
+        # Check that reference period (0) is at y=0
+        assert 0.0 in y_values or any(abs(y) < 0.01 for y in y_values), \
+            f"Reference period should be at y=0, got y_values={y_values}"
+
+        # Verify all expected normalized values are present
+        for expected in expected_normalized:
+            assert any(abs(y - expected) < 0.01 for y in y_values), \
+                f"Expected normalized value {expected} not found in {y_values}"
+
+        plt.close()
+
+    def test_plot_event_study_no_normalization_without_reference(self):
+        """Test that effects are NOT normalized when reference_period is None."""
+        pytest.importorskip("matplotlib")
+        import matplotlib.pyplot as plt
+
+        df = pd.DataFrame({
+            'period': [-1, 0, 1],
+            'effect': [0.1, 0.3, 0.5],
+            'se': [0.1, 0.1, 0.1]
+        })
+
+        ax = plot_event_study(df, reference_period=None, show=False)
+
+        # Extract y-values
+        y_values = []
+        for child in ax.get_children():
+            if hasattr(child, 'get_ydata'):
+                ydata = child.get_ydata()
+                if len(ydata) == 1:
+                    y_values.append(float(ydata[0]))
+
+        # Without normalization, original values should be preserved
+        for expected in [0.1, 0.3, 0.5]:
+            assert any(abs(y - expected) < 0.01 for y in y_values), \
+                f"Original value {expected} not found in {y_values}"
+
+        plt.close()
+
+    def test_plot_event_study_normalization_with_nan_reference(self):
+        """Test that normalization is skipped when reference effect is NaN."""
+        pytest.importorskip("matplotlib")
+        import matplotlib.pyplot as plt
+
+        df = pd.DataFrame({
+            'period': [-1, 0, 1],
+            'effect': [0.1, np.nan, 0.5],  # Reference period has NaN effect
+            'se': [0.1, 0.1, 0.1]
+        })
+
+        # This should not raise and should skip normalization
+        ax = plot_event_study(df, reference_period=0, show=False)
+
+        # Extract y-values (NaN effect is skipped in plotting)
+        y_values = []
+        for child in ax.get_children():
+            if hasattr(child, 'get_ydata'):
+                ydata = child.get_ydata()
+                if len(ydata) == 1:
+                    y_values.append(float(ydata[0]))
+
+        # Original non-NaN values should be preserved (not normalized)
+        for expected in [0.1, 0.5]:
+            assert any(abs(y - expected) < 0.01 for y in y_values), \
+                f"Original value {expected} not found in {y_values}"
+
+        plt.close()
+
 
 class TestPlotEventStudyIntegration:
     """Integration tests for event study plotting."""

From bbe7637f0f3b1f6f07acf6e0f40b3177e496cfc0 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 24 Jan 2026 19:03:10 -0500
Subject: [PATCH 2/4] Set reference period SE to NaN during normalization

When normalizing effects to a reference period, the reference period's
SE is now set to NaN, since the normalized reference effect is an
identifying constraint rather than an estimated quantity. This follows
the fixest (R) convention where the omitted category has no associated
uncertainty.

Changes:
- visualization.py: Set reference SE to NaN when normalizing
- test_visualization.py: Verify reference period has no error bars
- REGISTRY.md: Document event study plotting normalization behavior

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 diff_diff/visualization.py   |  3 +++
 docs/methodology/REGISTRY.md | 32 +++++++++++++++++++++++++++
 tests/test_visualization.py  | 42 +++++++++++++++++++++++++++++++++---
 3 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/diff_diff/visualization.py b/diff_diff/visualization.py
index b9e3f364..ab735137 100644
--- a/diff_diff/visualization.py
+++ b/diff_diff/visualization.py
@@ -197,6 +197,9 @@ def plot_event_study(
         ref_effect = effects[reference_period]
         if np.isfinite(ref_effect):
             effects = {p: e - ref_effect for p, e in effects.items()}
+            # Set reference SE to NaN (it's now a constraint, not an estimate)
+            # This follows fixest convention where the omitted category has no SE/CI
+            se = {p: (np.nan if p == reference_period else s) for p, s in se.items()}
 
     plot_data = []
     for period in periods:
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index 6d6c1b61..94c913c9 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -728,6 +728,38 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE²
 
 ---
 
+# Visualization
+
+## Event Study Plotting (`plot_event_study`)
+
+**Reference Period Normalization**
+
+When `reference_period` is specified:
+- Point estimates are normalized: `effect_normalized = effect - effect_ref`
+- Reference period SE is set to NaN (it's now a constraint, not an estimate)
+- Other periods' SEs are unchanged (they represent uncertainty relative to the constraint)
+- CIs are recomputed from normalized effects and original SEs
+
+This follows the `fixest` (R) convention where the omitted/reference category is an identifying
+constraint with no associated uncertainty. This differs from the `did` (R) package which does
+not normalize and reports full inference for all periods including the reference.
+
+**Rationale**: When normalizing to a reference period, we're treating that period as an
+identifying constraint (effect ≡ 0 by definition). The variance of a constant is zero,
+but since it's a constraint rather than an estimated quantity, we report NaN rather than 0.
+
+**Edge Cases:**
+- If `reference_period` not in data: No normalization applied
+- If reference effect is NaN: No normalization applied
+- Reference period CI becomes (NaN, NaN) after normalization
+- Reference period is plotted with hollow marker but no error bars
+
+**Reference implementation(s):**
+- R: `fixest::coefplot()` with reference category shown at 0 with no CI
+- R: `did::ggdid()` does not normalize; shows full inference for all periods
+
+---
+
 # Cross-Reference: Standard Errors Summary
 
 | Estimator | Default SE | Alternatives |
diff --git a/tests/test_visualization.py b/tests/test_visualization.py
index 44b47bed..03113923 100644
--- a/tests/test_visualization.py
+++ b/tests/test_visualization.py
@@ -319,10 +319,15 @@ def test_plot_cs_with_anticipation(self):
         plt.close()
 
     def test_plot_event_study_reference_period_normalization(self):
-        """Test that reference_period actually normalizes effects to 0.
+        """Test that reference_period normalizes effects and sets reference SE to NaN.
 
-        When reference_period is specified, the effect at that period should
-        be subtracted from all effects, so the reference period becomes exactly 0.
+        When reference_period is specified:
+        1. The effect at that period is subtracted from all effects (ref period = 0)
+        2. The SE at the reference period is set to NaN (it's a constraint, not an estimate)
+        3. Other periods retain their original SEs and have error bars
+
+        This follows the fixest (R) convention where the omitted/reference category
+        has no associated uncertainty (it's an identifying constraint).
         """
         pytest.importorskip("matplotlib")
         import matplotlib.pyplot as plt
@@ -361,6 +366,37 @@ def test_plot_event_study_reference_period_normalization(self):
             assert any(abs(y - expected) < 0.01 for y in y_values), \
                 f"Expected normalized value {expected} not found in {y_values}"
 
+        # Verify error bars: reference period (y=0) should have NO error bars
+        # while other periods should have error bars
+        # Error bars are drawn via ax.errorbar, which creates ErrorbarContainer or Line2D
+        # The error bar x-coordinates tell us which periods have error bars
+
+        # Find the errorbar data (the line segments that form error bars)
+        errorbar_x_coords = set()
+        for child in ax.get_children():
+            # ErrorbarContainer's children include LineCollection for the caps/stems
+            if hasattr(child, 'get_segments'):
+                segments = child.get_segments()
+                for seg in segments:
+                    # Each segment is [[x1, y1], [x2, y2]]
+                    if len(seg) >= 2:
+                        # x-coordinate of error bar (both points have same x)
+                        errorbar_x_coords.add(round(seg[0][0], 1))
+
+        # x-coordinates: period -2 -> x=0, -1 -> x=1, 0 -> x=2, 1 -> x=3, 2 -> x=4
+        # The reference period (period=0) is at x=2
+        reference_x = 2  # period 0 is at x-coordinate 2
+
+        # Reference period should NOT have error bars (x=2 should not be in errorbar_x_coords)
+        assert reference_x not in errorbar_x_coords, \
+            f"Reference period should have no error bars but found error bar at x={reference_x}"
+
+        # Other periods SHOULD have error bars
+        # At least some of x=0, x=1, x=3, x=4 should have error bars
+        non_ref_x_coords = {0, 1, 3, 4}
+        assert len(errorbar_x_coords & non_ref_x_coords) >= 2, \
+            f"Non-reference periods should have error bars, found: {errorbar_x_coords}"
+
         plt.close()
 
     def test_plot_event_study_no_normalization_without_reference(self):

From 60c04cb2bda01c6b3f8711456b278bf6d97798ad Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 24 Jan 2026 19:25:13 -0500
Subject: [PATCH 3/4] Only normalize when reference_period is explicitly
 specified

- Add reference_inferred flag to _extract_plot_data return value
- Track whether reference_period was user-provided vs auto-detected
- Auto-inferred reference periods get hollow marker styling only, no normalization
- Explicit reference_period=X triggers normalization and sets ref SE to NaN
- Add tests for both auto-inferred (no normalization) and explicit (normalizes)
- Update REGISTRY.md to document the explicit vs inferred behavior

This prevents unintended normalization when the reference period isn't a
true identifying constraint (e.g., CallawaySantAnna with
base_period="varying").

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 diff_diff/visualization.py   |  39 +++++++---
 docs/methodology/REGISTRY.md |  34 ++++++---
 tests/test_visualization.py  | 136 +++++++++++++++++++++++++++++++++++
 3 files changed, 190 insertions(+), 19 deletions(-)

diff --git a/diff_diff/visualization.py b/diff_diff/visualization.py
index ab735137..72f2cd4b 100644
--- a/diff_diff/visualization.py
+++ b/diff_diff/visualization.py
@@ -170,10 +170,18 @@ def plot_event_study(
 
     from scipy import stats as scipy_stats
 
+    # Track if reference_period was explicitly provided by user
+    reference_period_explicit = reference_period is not None
+
     # Extract data from results if provided
     if results is not None:
-        effects, se, periods, pre_periods, post_periods, reference_period = \
-            _extract_plot_data(results, periods, pre_periods, post_periods, reference_period)
+        extracted = _extract_plot_data(
+            results, periods, pre_periods, post_periods, reference_period
+        )
+        effects, se, periods, pre_periods, post_periods, reference_period, reference_inferred = extracted
+        # If reference was inferred from results, it was NOT explicitly provided
+        if reference_inferred:
+            reference_period_explicit = False
     elif effects is None or se is None:
         raise ValueError(
             "Must provide either 'results' or both 'effects' and 'se'"
@@ -192,8 +200,12 @@ def plot_event_study(
     # Compute confidence intervals
     critical_value = scipy_stats.norm.ppf(1 - alpha / 2)
 
-    # Normalize effects to reference period if specified
-    if reference_period is not None and reference_period in effects:
+    # Normalize effects to reference period ONLY if explicitly specified by user
+    # Auto-inferred reference periods (from CallawaySantAnna) just get hollow marker styling,
+    # NO normalization. This prevents unintended normalization when the reference period
+    # isn't a true identifying constraint (e.g., CallawaySantAnna with base_period="varying").
+    if (reference_period is not None and reference_period in effects and
+            reference_period_explicit):
         ref_effect = effects[reference_period]
         if np.isfinite(ref_effect):
             effects = {p: e - ref_effect for p, e in effects.items()}
@@ -313,14 +325,17 @@ def _extract_plot_data(
     pre_periods: Optional[List[Any]],
     post_periods: Optional[List[Any]],
     reference_period: Optional[Any],
-) -> Tuple[Dict, Dict, List, List, List, Any]:
+) -> Tuple[Dict, Dict, List, List, List, Any, bool]:
     """
     Extract plotting data from various result types.
 
     Returns
     -------
     tuple
-        (effects, se, periods, pre_periods, post_periods, reference_period)
+        (effects, se, periods, pre_periods, post_periods, reference_period, reference_inferred)
+
+        reference_inferred is True if reference_period was auto-detected from results
+        rather than explicitly provided by the user.
     """
     # Handle DataFrame input
     if isinstance(results, pd.DataFrame):
@@ -337,7 +352,8 @@ def _extract_plot_data(
         if periods is None:
             periods = list(results['period'])
 
-        return effects, se, periods, pre_periods, post_periods, reference_period
+        # DataFrame input: reference_period was already set by caller, never inferred here
+        return effects, se, periods, pre_periods, post_periods, reference_period, False
 
     # Handle MultiPeriodDiDResults
     if hasattr(results, 'period_effects'):
@@ -357,7 +373,8 @@ def _extract_plot_data(
         if periods is None:
             periods = post_periods
 
-        return effects, se, periods, pre_periods, post_periods, reference_period
+        # MultiPeriodDiDResults: reference_period was already set by caller, never inferred here
+        return effects, se, periods, pre_periods, post_periods, reference_period, False
 
     # Handle CallawaySantAnnaResults (event study aggregation)
     if hasattr(results, 'event_study_effects') and results.event_study_effects is not None:
@@ -371,8 +388,12 @@ def _extract_plot_data(
         if periods is None:
             periods = sorted(effects.keys())
 
+        # Track if reference_period was explicitly provided vs auto-inferred
+        reference_inferred = False
+
         # Reference period is typically -1 for event study
         if reference_period is None:
+            reference_inferred = True  # We're about to infer it
             # Detect reference period from n_groups=0 marker (normalization constraint)
             # This handles anticipation > 0 where reference is at e = -1 - anticipation
             for period, effect_data in results.event_study_effects.items():
@@ -389,7 +410,7 @@ def _extract_plot_data(
         if post_periods is None:
             post_periods = [p for p in periods if p >= 0]
 
-        return effects, se, periods, pre_periods, post_periods, reference_period
+        return effects, se, periods, pre_periods, post_periods, reference_period, reference_inferred
 
     raise TypeError(
         f"Cannot extract plot data from {type(results).__name__}. "
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index 94c913c9..015ba986 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -734,25 +734,39 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE²
 
 **Reference Period Normalization**
 
-When `reference_period` is specified:
-- Point estimates are normalized: `effect_normalized = effect - effect_ref`
-- Reference period SE is set to NaN (it's now a constraint, not an estimate)
-- Other periods' SEs are unchanged (they represent uncertainty relative to the constraint)
-- CIs are recomputed from normalized effects and original SEs
+Normalization only occurs when `reference_period` is **explicitly specified** by the user:
 
-This follows the `fixest` (R) convention where the omitted/reference category is an identifying
-constraint with no associated uncertainty. This differs from the `did` (R) package which does
-not normalize and reports full inference for all periods including the reference.
+- **Explicit `reference_period=X`**: Normalizes effects (subtracts ref effect), sets ref SE to NaN
+  - Point estimates: `effect_normalized = effect - effect_ref`
+  - Reference period SE → NaN (it's now a constraint, not an estimate)
+  - Other periods' SEs unchanged (uncertainty relative to the constraint)
+  - CIs recomputed from normalized effects and original SEs
+
+- **Auto-inferred reference** (from CallawaySantAnna results): Hollow marker styling only, no normalization
+  - Original effects are plotted unchanged
+  - Reference period shown with hollow marker for visual indication
+  - All periods retain their original SEs and error bars
+
+This design prevents unintended normalization when the reference period isn't a true
+identifying constraint (e.g., CallawaySantAnna with `base_period="varying"` where different
+cohorts use different comparison periods).
+
+The explicit-only normalization follows the `fixest` (R) convention where the omitted/reference
+category is an identifying constraint with no associated uncertainty. Auto-inferred references
+follow the `did` (R) package convention, which does not normalize and reports full inference.
 
 **Rationale**: When normalizing to a reference period, we're treating that period as an
 identifying constraint (effect ≡ 0 by definition). The variance of a constant is zero,
 but since it's a constraint rather than an estimated quantity, we report NaN rather than 0.
+Auto-inferred references may not represent true identifying constraints, so normalization
+should be a deliberate user choice.
 
 **Edge Cases:**
 - If `reference_period` not in data: No normalization applied
 - If reference effect is NaN: No normalization applied
-- Reference period CI becomes (NaN, NaN) after normalization
-- Reference period is plotted with hollow marker but no error bars
+- Reference period CI becomes (NaN, NaN) after normalization (explicit only)
+- Reference period is plotted with hollow marker (both explicit and auto-inferred)
+- Reference period error bars: removed for explicit, retained for auto-inferred
 
 **Reference implementation(s):**
 - R: `fixest::coefplot()` with reference category shown at 0 with no CI
diff --git a/tests/test_visualization.py b/tests/test_visualization.py
index 03113923..9a7ce783 100644
--- a/tests/test_visualization.py
+++ b/tests/test_visualization.py
@@ -456,6 +456,142 @@ def test_plot_event_study_normalization_with_nan_reference(self):
 
         plt.close()
 
+    def test_plot_cs_results_no_auto_normalization(self):
+        """Test that auto-inferred reference period does NOT normalize effects.
+
+        When CallawaySantAnna results auto-infer reference_period=-1 (or from n_groups=0),
+        effects should NOT be normalized (just hollow marker styling).
+        Only explicit reference_period=X should trigger normalization.
+        """
+        pytest.importorskip("matplotlib")
+        import matplotlib.pyplot as plt
+
+        # Generate staggered data and fit CallawaySantAnna with event study
+        data = generate_staggered_data()
+        cs = CallawaySantAnna()
+        results = cs.fit(
+            data,
+            outcome='outcome',
+            unit='unit',
+            time='time',
+            first_treat='first_treat',
+            aggregate='event_study'
+        )
+
+        # Get original effects from results (before any normalization)
+        original_effects = {
+            period: effect_data['effect']
+            for period, effect_data in results.event_study_effects.items()
+        }
+
+        # Plot WITHOUT explicitly specifying reference_period
+        # This should auto-infer reference but NOT normalize
+        ax = plot_event_study(results, show=False)
+
+        # Extract plotted y-values
+        y_values = []
+        for child in ax.get_children():
+            if hasattr(child, 'get_ydata'):
+                ydata = child.get_ydata()
+                if len(ydata) == 1:
+                    y_values.append(float(ydata[0]))
+
+        # Verify that the original (non-normalized) effects are plotted
+        # Check that at least some non-zero effects are preserved
+        non_zero_originals = [e for e in original_effects.values() if abs(e) > 0.01]
+        assert len(non_zero_originals) > 0, "Should have non-zero original effects"
+
+        # The key check: effects should NOT all be relative to some reference
+        # If normalized, reference would be at 0 and others shifted accordingly
+        # Since NOT normalized, we should see the original effect values
+        for period, orig_effect in original_effects.items():
+            if np.isfinite(orig_effect):
+                # Check that original value is present (not normalized)
+                assert any(abs(y - orig_effect) < 0.05 for y in y_values), \
+                    f"Original effect {orig_effect:.3f} for period {period} " \
+                    f"should be plotted without normalization. Found y_values: {y_values}"
+
+        plt.close()
+
+    def test_plot_cs_results_explicit_reference_normalizes(self):
+        """Test that explicit reference_period normalizes CallawaySantAnna results.
+
+        When user explicitly passes reference_period=X to plot_event_study,
+        it should normalize effects (subtract ref effect) and set ref SE to NaN.
+        """
+        pytest.importorskip("matplotlib")
+        import matplotlib.pyplot as plt
+
+        # Generate staggered data and fit CallawaySantAnna with event study
+        data = generate_staggered_data()
+        cs = CallawaySantAnna()
+        results = cs.fit(
+            data,
+            outcome='outcome',
+            unit='unit',
+            time='time',
+            first_treat='first_treat',
+            aggregate='event_study'
+        )
+
+        # Get original effects from results
+        original_effects = {
+            period: effect_data['effect']
+            for period, effect_data in results.event_study_effects.items()
+        }
+
+        # Choose reference period (typically -1)
+        ref_period = -1
+        ref_effect = original_effects.get(ref_period, 0.0)
+
+        # Compute expected normalized effects
+        expected_normalized = {
+            period: effect - ref_effect
+            for period, effect in original_effects.items()
+        }
+
+        # Plot WITH explicit reference_period - this SHOULD normalize
+        ax = plot_event_study(results, reference_period=ref_period, show=False)
+
+        # Extract plotted y-values
+        y_values = []
+        for child in ax.get_children():
+            if hasattr(child, 'get_ydata'):
+                ydata = child.get_ydata()
+                if len(ydata) == 1:
+                    y_values.append(float(ydata[0]))
+
+        # The reference period should now be at y=0 (normalized)
+        assert any(abs(y) < 0.01 for y in y_values), \
+            f"Reference period should be normalized to y=0, got y_values={y_values}"
+
+        # Verify normalized values are present
+        for period, norm_effect in expected_normalized.items():
+            if np.isfinite(norm_effect):
+                assert any(abs(y - norm_effect) < 0.05 for y in y_values), \
+                    f"Normalized effect {norm_effect:.3f} for period {period} " \
+                    f"not found in {y_values}"
+
+        # Verify reference period has no error bars (SE was set to NaN)
+        # Find error bar x-coordinates
+        periods_in_plot = sorted(original_effects.keys())
+        ref_x_idx = periods_in_plot.index(ref_period) if ref_period in periods_in_plot else None
+
+        if ref_x_idx is not None:
+            errorbar_x_coords = set()
+            for child in ax.get_children():
+                if hasattr(child, 'get_segments'):
+                    segments = child.get_segments()
+                    for seg in segments:
+                        if len(seg) >= 2:
+                            errorbar_x_coords.add(round(seg[0][0], 1))
+
+            # Reference period should NOT have error bars
+            assert ref_x_idx not in errorbar_x_coords, \
+                f"Reference period at x={ref_x_idx} should have no error bars"
+
+        plt.close()
+
 
 class TestPlotEventStudyIntegration:
     """Integration tests for event study plotting."""

From 710f2c1219f2312253d4edf9a8780bbe20acc51a Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 24 Jan 2026 19:49:29 -0500
Subject: [PATCH 4/4] Update docs and reuse fixture per PR review feedback

- Clarify reference_period docstring: explicit triggers normalization,
  auto-inferred only applies hollow marker styling
- Update Notes section to distinguish explicit vs inferred behavior
- Refactor two tests to reuse cs_results fixture instead of re-fitting

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 diff_diff/visualization.py  | 11 +++++++----
 tests/test_visualization.py | 30 ++++++------------------------
 2 files changed, 13 insertions(+), 28 deletions(-)

diff --git a/diff_diff/visualization.py b/diff_diff/visualization.py
index 72f2cd4b..3ba3a809 100644
--- a/diff_diff/visualization.py
+++ b/diff_diff/visualization.py
@@ -73,8 +73,10 @@ def plot_event_study(
     periods : list, optional
         List of periods to plot. If None, uses all periods from results.
     reference_period : any, optional
-        The reference period (normalized to effect=0). Will be shown as a
-        hollow marker. If None, tries to infer from results.
+        The reference period to highlight. When explicitly provided, effects
+        are normalized (ref effect subtracted) and ref SE is set to NaN.
+        If None, the reference period is inferred from results when possible;
+        an inferred reference gets hollow marker styling only (no normalization).
     pre_periods : list, optional
         List of pre-treatment periods. Used for shading.
     post_periods : list, optional
@@ -151,8 +153,9 @@ def plot_event_study(
        trends holds. Large pre-treatment effects suggest the assumption may
        be violated.
 
-    2. **Reference period**: Usually the last pre-treatment period (t=-1),
-       normalized to zero. This is the omitted category.
+    2. **Reference period**: Usually the last pre-treatment period (t=-1).
+       When explicitly specified via ``reference_period``, all effects are shifted
+       so this period's effect is zero. When auto-inferred, hollow marker only.
 
     3. **Post-treatment periods**: The treatment effects of interest. These
        show how the outcome evolved after treatment.
diff --git a/tests/test_visualization.py b/tests/test_visualization.py
index 9a7ce783..4a3ae3b7 100644
--- a/tests/test_visualization.py
+++ b/tests/test_visualization.py
@@ -456,7 +456,7 @@ def test_plot_event_study_normalization_with_nan_reference(self):
 
         plt.close()
 
-    def test_plot_cs_results_no_auto_normalization(self):
+    def test_plot_cs_results_no_auto_normalization(self, cs_results):
         """Test that auto-inferred reference period does NOT normalize effects.
 
         When CallawaySantAnna results auto-infer reference_period=-1 (or from n_groups=0),
@@ -466,17 +466,8 @@ def test_plot_cs_results_no_auto_normalization(self):
         pytest.importorskip("matplotlib")
         import matplotlib.pyplot as plt
 
-        # Generate staggered data and fit CallawaySantAnna with event study
-        data = generate_staggered_data()
-        cs = CallawaySantAnna()
-        results = cs.fit(
-            data,
-            outcome='outcome',
-            unit='unit',
-            time='time',
-            first_treat='first_treat',
-            aggregate='event_study'
-        )
+        # Use fixture instead of re-fitting
+        results = cs_results
 
         # Get original effects from results (before any normalization)
         original_effects = {
@@ -513,7 +504,7 @@ def test_plot_cs_results_no_auto_normalization(self):
 
         plt.close()
 
-    def test_plot_cs_results_explicit_reference_normalizes(self):
+    def test_plot_cs_results_explicit_reference_normalizes(self, cs_results):
         """Test that explicit reference_period normalizes CallawaySantAnna results.
 
         When user explicitly passes reference_period=X to plot_event_study,
@@ -522,17 +513,8 @@ def test_plot_cs_results_explicit_reference_normalizes(self):
         pytest.importorskip("matplotlib")
         import matplotlib.pyplot as plt
 
-        # Generate staggered data and fit CallawaySantAnna with event study
-        data = generate_staggered_data()
-        cs = CallawaySantAnna()
-        results = cs.fit(
-            data,
-            outcome='outcome',
-            unit='unit',
-            time='time',
-            first_treat='first_treat',
-            aggregate='event_study'
-        )
+        # Use fixture instead of re-fitting
+        results = cs_results
 
         # Get original effects from results
         original_effects = {