Address AI review: tighten fixture guard, derive truth, add provenance

igerber · claude · igerber · commit a4b388178e6f · 2026-03-21T10:25:18.000-04:00
P2: Assert exact cohort counts (656/252/176/163/65) and wave support
since the CSV fixture is deterministic — approximate tolerances could
mask fixture drift.

P3: Derive _TRUE_ES_AVG_COMPUSTAT programmatically from DGP parameters
instead of hard-coding, so changes to the DGP definition propagate
automatically.

P3: Add tests/data/README.md documenting the HRS fixture source,
sample selection steps, and expected counts for future audit/rebuild.

Co-Authored-By: Claude Opus 4.6 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/tests/data/README.md b/tests/data/README.md
@@ -0,0 +1,34 @@
+# Test Data Fixtures
+
+## hrs_edid_validation.csv
+
+**Source:** Dobkin, C., Finkelstein, A., Kluender, R., & Notowidigdo, M. J. (2018).
+"The Economic Consequences of Hospital Admissions." *American Economic Review*, 108(2), 308-352.
+Replication kit: https://www.openicpsr.org/openicpsr/project/116186/version/V1/view
+
+**Sample selection:** Follows Sun & Abraham (2021), as used by Chen, Sant'Anna & Xie (2025)
+Section 6:
+
+1. Read `HRS_long.dta` from the Dobkin et al. replication kit
+2. Keep waves 7-11, retain only individuals present in all 5 waves
+3. Filter to ever-hospitalized individuals with `first_hosp >= 8`
+4. Filter to ages 50-59 at hospitalization (`age_hosp`)
+5. Drop wave 11 (no valid comparison group)
+6. Recode `first_hosp == 11` as never-treated (`inf`)
+
+**Expected counts:**
+
+| Column | Values |
+|--------|--------|
+| Total individuals | 656 |
+| Waves | 7, 8, 9, 10 |
+| Rows | 2,624 |
+| G=8 | 252 |
+| G=9 | 176 |
+| G=10 | 163 |
+| G=inf | 65 |
+
+**Columns:** `unit` (hhidpn), `time` (wave), `outcome` (oop_spend, 2005 dollars), `first_treat` (first_hosp)
+
+**Regeneration:** Requires the Dobkin et al. replication kit (`.gitignore`d as `replication_data/`).
+The extraction logic is documented in the plan file and was executed as a one-time preprocessing step.
diff --git a/tests/test_efficient_did_validation.py b/tests/test_efficient_did_validation.py
@@ -142,10 +142,28 @@ def _compute_es_avg(result):
     return np.mean(list(es.values()))
 
 
-# Ground truth ES_avg for Compustat DGP (see plan for derivation)
-_TRUE_ES_AVG_COMPUSTAT = np.mean(
-    [0.1235, 0.247, 0.3705, 0.494, 0.770, 0.924, 1.078]
-)
+# Ground truth derived from DGP parameters (not hard-coded)
+_ATT_COEFS = {5: 0.154, 8: 0.093}  # ATT(g,t) = coef * (t - g + 1) for t >= g
+_N_PERIODS = 11
+
+
+def _true_es_avg_from_dgp():
+    """Derive ES_avg from DGP treatment effect parameters."""
+    max_e = {g: _N_PERIODS - g for g in _ATT_COEFS}
+    all_e = range(0, max(max_e.values()) + 1)
+    es_values = []
+    for e in all_e:
+        contributing = [
+            coef * (e + 1)
+            for g, coef in _ATT_COEFS.items()
+            if e <= max_e[g]
+        ]
+        if contributing:
+            es_values.append(np.mean(contributing))
+    return np.mean(es_values)
+
+
+_TRUE_ES_AVG_COMPUSTAT = _true_es_avg_from_dgp()
 
 
 def _true_overall_att_compustat():
@@ -226,24 +244,26 @@ class TestHRSReplication:
     """Validate EDiD against Table 6 of Chen, Sant'Anna & Xie (2025)."""
 
     def test_sample_selection_yields_expected_counts(self, hrs_data):
+        # Fixture is deterministic — assert exact counts
         n_units = hrs_data["unit"].nunique()
-        assert abs(n_units - 652) <= 10, f"Expected ~652 units, got {n_units}"
+        assert n_units == 656, f"Expected 656 units, got {n_units}"
 
         groups = hrs_data.groupby("unit")["first_treat"].first()
 
-        # Check 4 groups exist
         finite_groups = sorted(g for g in groups.unique() if np.isfinite(g))
         assert finite_groups == [8, 9, 10], f"Expected groups [8,9,10], got {finite_groups}"
         assert any(np.isinf(g) for g in groups.unique()), "Missing never-treated group"
 
-        # Check approximate sizes
-        for g, expected in [(8, 252), (9, 176), (10, 163)]:
+        expected_sizes = {8: 252, 9: 176, 10: 163}
+        for g, expected in expected_sizes.items():
             actual = (groups == g).sum()
-            assert abs(actual - expected) <= 15, (
-                f"G={g}: expected ~{expected}, got {actual}"
-            )
+            assert actual == expected, f"G={g}: expected {expected}, got {actual}"
         n_inf = groups.apply(np.isinf).sum()
-        assert abs(n_inf - 65) <= 10, f"G=inf: expected ~65, got {n_inf}"
+        assert n_inf == 65, f"G=inf: expected 65, got {n_inf}"
+
+        assert sorted(hrs_data["time"].unique()) == [7, 8, 9, 10], (
+            f"Expected waves [7,8,9,10], got {sorted(hrs_data['time'].unique())}"
+        )
 
     def test_group_time_effects_match_table6(self, edid_hrs_result):
         for (g, t), (expected_effect, _) in TABLE6_EDID.items():