From 63e16ea29011a00e959da75baea97550481c8cd0 Mon Sep 17 00:00:00 2001 From: Orpheus Lummis Date: Mon, 2 Mar 2026 18:37:32 +0000 Subject: [PATCH] Add calibration analysis: ECE, reliability diagrams, Brier decomposition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ForecastBench reports Brier score, BSS, Peer Score, and oracle equivalence but no calibration metrics. Calibration — does P=0.7 mean 70%? — is the most safety-relevant property of a forecasting system. This adds it as a parallel analysis step computed nightly alongside the existing leaderboard. Pipeline (src/leaderboard/main.py): - compute_calibration_metrics(): per-model ECE, Brier decomposition (reliability, resolution, uncertainty per Murphy 1973), sharpness - compute_calibration_curve_data(): per-(model, bin) data for reliability diagrams - write_calibration_data(): CSV + JSON to public release bucket - Wired into make_leaderboard() after oracle removal Website: - /calibration/ page with D3 reliability diagram and metrics table - Baseline/tournament toggle, model checkboxes, tooltips - entrypoint.sh copies calibration files from bucket to assets/data/ - Nav entry between Explore and Datasets Note: Houtan added Plotly-based calibration plots in March 2024 (7729a9d) but they were server-side only and removed during the codebase restructure. This surfaces calibration as a first-class website feature with proper metrics (ECE, Brier decomposition) and interactive visualization. Zero changes to existing scoring functions, bootstrap, or leaderboard output. 
def compute_calibration_metrics(
    df: pd.DataFrame,
    n_bins: int = 10,
) -> pd.DataFrame:
    """Compute calibration metrics for each model.

    Metrics computed per model:
    - ECE (Expected Calibration Error): weighted mean of |forecast_mean - observed_rate| per bin
    - Brier decomposition (Murphy 1973): reliability, resolution, uncertainty
    - Sharpness: standard deviation of forecasts

    Args:
        df (pd.DataFrame): Resolved forecasts with 'forecast', 'resolved_to',
            'model_pk', 'organization', 'model_organization', 'model' columns.
        n_bins (int): Number of equal-width bins on [0, 1].

    Returns:
        pd.DataFrame: One row per model with columns: model_pk, organization, model_organization,
            model, ece, reliability, resolution, uncertainty, sharpness, n_forecasts.
    """
    bin_edges = np.linspace(0, 1, n_bins + 1)
    df = df.copy()
    # right=True assigns forecast == 0 to index 0; the clip folds it into bin 1
    # so every forecast lands in one of the n_bins bins.
    df["bin"] = np.digitize(df["forecast"], bin_edges, right=True).clip(1, n_bins)

    # Base rate is pooled across all models in df, so 'uncertainty' is identical
    # for every row (it describes the shared question set, not the model).
    overall_base_rate = df["resolved_to"].mean()
    overall_uncertainty = overall_base_rate * (1 - overall_base_rate)

    rows = []
    for (model_pk, org, model_org, model), grp in df.groupby(
        ["model_pk", "organization", "model_organization", "model"]
    ):
        n_total = len(grp)
        ece = 0.0
        reliability = 0.0
        resolution = 0.0

        for _, bin_grp in grp.groupby("bin"):
            n_k = len(bin_grp)
            weight = n_k / n_total
            forecast_mean = bin_grp["forecast"].mean()
            observed_rate = bin_grp["resolved_to"].mean()
            ece += weight * abs(forecast_mean - observed_rate)
            reliability += weight * (forecast_mean - observed_rate) ** 2
            resolution += weight * (observed_rate - overall_base_rate) ** 2

        # pandas .std() uses ddof=1 and is NaN for a single forecast; report 0.0
        # instead so the published CSV never carries NaN into the website's
        # numeric parse (JS coerces columns with unary +).
        sharpness = grp["forecast"].std()
        if pd.isna(sharpness):
            sharpness = 0.0

        rows.append(
            {
                "model_pk": model_pk,
                "organization": org,
                "model_organization": model_org,
                "model": model,
                "ece": round(ece, 6),
                "reliability": round(reliability, 6),
                "resolution": round(resolution, 6),
                "uncertainty": round(overall_uncertainty, 6),
                "sharpness": round(sharpness, 6),
                "n_forecasts": n_total,
            }
        )

    return pd.DataFrame(rows)
def compute_calibration_curve_data(
    df: pd.DataFrame,
    n_bins: int = 10,
) -> pd.DataFrame:
    """Build per-(model, bin) points for reliability diagrams.

    Forecasts are bucketed into equal-width probability bins; each bucket
    contributes one point comparing the mean forecast to the observed
    resolution rate.

    Args:
        df (pd.DataFrame): Resolved forecasts with 'forecast', 'resolved_to',
            'model_pk', 'organization', 'model' columns.
        n_bins (int): Number of equal-width bins on [0, 1].

    Returns:
        pd.DataFrame: Per-(model, bin) rows with columns: model_pk, organization, model,
            bin_midpoint, forecast_mean, resolution_rate, n_bin.
    """
    edges = np.linspace(0, 1, n_bins + 1)
    midpoints = (edges[1:] + edges[:-1]) / 2
    binned = df.copy()
    # right=True puts forecast == 0 at index 0; clip folds it into bin 1.
    binned["bin"] = np.digitize(binned["forecast"], edges, right=True).clip(1, n_bins)

    records = []
    for (pk, organization, model_name), model_grp in binned.groupby(
        ["model_pk", "organization", "model"]
    ):
        for b, bucket in model_grp.groupby("bin"):
            records.append(
                {
                    "model_pk": pk,
                    "organization": organization,
                    "model": model_name,
                    "bin_midpoint": round(midpoints[b - 1], 3),
                    "forecast_mean": round(bucket["forecast"].mean(), 4),
                    "resolution_rate": round(bucket["resolved_to"].mean(), 4),
                    "n_bin": len(bucket),
                }
            )

    return pd.DataFrame(records)
def write_calibration_data(
    df_calibration_metrics: pd.DataFrame,
    df_calibration_curves: pd.DataFrame,
    leaderboard_type: "LeaderboardType",
) -> None:
    """Write calibration metrics CSV and curve data JSON to the public bucket.

    Args:
        df_calibration_metrics (pd.DataFrame): One row per model with calibration metrics.
        df_calibration_curves (pd.DataFrame): Per-(model, bin) curve data for reliability diagrams.
        leaderboard_type (LeaderboardType): baseline or tournament.

    Returns:
        None.
    """
    # Files land next to the leaderboard CSVs so the website entrypoint can
    # copy them from the same bucket folder.
    bucket_dir = data_utils.get_mounted_bucket(bucket=env.PUBLIC_RELEASE_BUCKET)
    subdir = "leaderboards/csv"
    os.makedirs(f"{bucket_dir}/{subdir}", exist_ok=True)

    # Metrics table as CSV.
    metrics_path = f"{bucket_dir}/{subdir}/calibration_metrics_{leaderboard_type.value}.csv"
    df_calibration_metrics.to_csv(metrics_path, index=False)

    # Reliability-diagram points as JSON records for the D3 chart.
    curves_path = f"{bucket_dir}/{subdir}/calibration_curves_{leaderboard_type.value}.json"
    df_calibration_curves.to_json(curves_path, orient="records", indent=2)

    logger.info(f"Wrote calibration data for {leaderboard_type.value} leaderboard.")
files into entries list. diff --git a/src/www.forecastbench.org/_data/navigation.yml b/src/www.forecastbench.org/_data/navigation.yml index 0f80e684..a50da864 100644 --- a/src/www.forecastbench.org/_data/navigation.yml +++ b/src/www.forecastbench.org/_data/navigation.yml @@ -11,6 +11,9 @@ main: - title: "Explore" url: "/explore/" icon: "fa-solid fa-lightbulb" + - title: "Calibration" + url: "/calibration/" + icon: "fa-solid fa-bullseye" - title: "Datasets" url: "/datasets/" icon: "fa-solid fa-layer-group" diff --git a/src/www.forecastbench.org/assets/css/custom.scss b/src/www.forecastbench.org/assets/css/custom.scss index 31765f16..45952b1e 100644 --- a/src/www.forecastbench.org/assets/css/custom.scss +++ b/src/www.forecastbench.org/assets/css/custom.scss @@ -1560,3 +1560,45 @@ input[type="checkbox"]:checked + .toggle-slider:before { .fb-footer__divider { opacity: 0.6; } + +/* ── Calibration page ── */ + +#reliability-diagram { + width: 100%; + max-width: 580px; + margin: 1.5rem auto; +} + +#reliability-diagram svg { + display: block; +} + +.calibration-table { + width: 100%; + border-collapse: collapse; + font-size: 0.85rem; + margin: 1rem 0; +} + +.calibration-table th, +.calibration-table td { + padding: 0.45rem 0.75rem; + text-align: right; + border-bottom: 1px solid #e5e7eb; +} + +.calibration-table th { + font-weight: 600; + background: #f9fafb; + position: sticky; + top: 0; +} + +.calibration-table td:first-child, +.calibration-table th:first-child { + text-align: left; +} + +.calibration-table tbody tr:hover { + background: #f1f5f9; +} diff --git a/src/www.forecastbench.org/assets/js/calibration_chart.js b/src/www.forecastbench.org/assets/js/calibration_chart.js new file mode 100644 index 00000000..e9bd9064 --- /dev/null +++ b/src/www.forecastbench.org/assets/js/calibration_chart.js @@ -0,0 +1,282 @@ +(function () { + const CURVES_PATH_BASELINE = '/assets/data/calibration_curves_baseline.json'; + const CURVES_PATH_TOURNAMENT = 
'/assets/data/calibration_curves_tournament.json'; + const METRICS_PATH_BASELINE = '/assets/data/calibration_metrics_baseline.csv'; + const METRICS_PATH_TOURNAMENT = '/assets/data/calibration_metrics_tournament.csv'; + + const MAX_DEFAULT_MODELS = 5; + const MARGIN = { top: 20, right: 30, bottom: 50, left: 55 }; + const SIZE = 500; + + const colorScale = d3.scaleOrdinal(d3.schemeTableau10); + const tip = d3.select('#tooltip'); + + let curveData = null; + let metricsData = null; + let selectedModels = new Set(); + + function getLeaderboardType() { + return document.querySelector('input[name="lbSelect"]:checked').value; + } + + function getCurvesPath() { + return getLeaderboardType() === 'tournament' ? CURVES_PATH_TOURNAMENT : CURVES_PATH_BASELINE; + } + + function getMetricsPath() { + return getLeaderboardType() === 'tournament' ? METRICS_PATH_TOURNAMENT : METRICS_PATH_BASELINE; + } + + function modelLabel(d) { + const org = d.organization || ''; + const model = d.model || d.model_pk || ''; + if (org && org !== model) return org + ' / ' + model; + return model; + } + + function loadData() { + Promise.all([ + fetch(getCurvesPath()).then(r => r.json()), + d3.csv(getMetricsPath()), + ]).then(([curves, metrics]) => { + curveData = curves; + metricsData = metrics.map(d => ({ + ...d, + ece: +d.ece, + reliability: +d.reliability, + resolution: +d.resolution, + uncertainty: +d.uncertainty, + sharpness: +d.sharpness, + n_forecasts: +d.n_forecasts, + })); + + // Sort by ECE ascending, pick top N as default + metricsData.sort((a, b) => a.ece - b.ece); + const defaultModels = metricsData.slice(0, MAX_DEFAULT_MODELS).map(d => d.model_pk); + selectedModels = new Set(defaultModels); + + buildModelCheckboxes(); + renderChart(); + renderTable(); + }).catch(err => { + console.error('Failed to load calibration data:', err); + d3.select('#reliability-diagram').html( + '

Calibration data not yet available. ' + + 'Run the leaderboard pipeline to generate calibration artifacts.

' + ); + }); + } + + function buildModelCheckboxes() { + const container = d3.select('#model-checkboxes'); + container.html(''); + metricsData.forEach(d => { + const id = 'model_' + d.model_pk.replace(/[^a-zA-Z0-9]/g, '_'); + const div = container.append('div').attr('class', 'tag-option'); + div.append('input') + .attr('type', 'checkbox') + .attr('id', id) + .attr('value', d.model_pk) + .property('checked', selectedModels.has(d.model_pk)) + .on('change', function () { + if (this.checked) { + selectedModels.add(d.model_pk); + } else { + selectedModels.delete(d.model_pk); + } + renderChart(); + renderTable(); + }); + div.append('label') + .attr('for', id) + .text(modelLabel(d)); + }); + } + + function renderChart() { + const container = d3.select('#reliability-diagram'); + container.html(''); + + const width = SIZE; + const height = SIZE; + + const svg = container.append('svg') + .attr('viewBox', `0 0 ${width + MARGIN.left + MARGIN.right} ${height + MARGIN.top + MARGIN.bottom}`) + .attr('preserveAspectRatio', 'xMidYMid meet') + .style('max-width', (width + MARGIN.left + MARGIN.right) + 'px') + .style('width', '100%'); + + const g = svg.append('g') + .attr('transform', `translate(${MARGIN.left},${MARGIN.top})`); + + const x = d3.scaleLinear().domain([0, 1]).range([0, width]); + const y = d3.scaleLinear().domain([0, 1]).range([height, 0]); + + // Axes + g.append('g') + .attr('transform', `translate(0,${height})`) + .call(d3.axisBottom(x).ticks(10)) + .append('text') + .attr('x', width / 2) + .attr('y', 40) + .attr('fill', 'currentColor') + .attr('text-anchor', 'middle') + .style('font-size', '13px') + .text('Forecast Probability'); + + g.append('g') + .call(d3.axisLeft(y).ticks(10)) + .append('text') + .attr('transform', 'rotate(-90)') + .attr('x', -height / 2) + .attr('y', -42) + .attr('fill', 'currentColor') + .attr('text-anchor', 'middle') + .style('font-size', '13px') + .text('Observed Frequency'); + + // Perfect calibration diagonal + g.append('line') + 
.attr('x1', x(0)).attr('y1', y(0)) + .attr('x2', x(1)).attr('y2', y(1)) + .attr('stroke', '#888') + .attr('stroke-dasharray', '6,4') + .attr('stroke-width', 1.5) + .attr('opacity', 0.7); + + // Filter curves for selected models + const filteredCurves = curveData.filter(d => selectedModels.has(d.model_pk)); + + // Group by model + const byModel = d3.group(filteredCurves, d => d.model_pk); + + // Size scale for circles + const allN = filteredCurves.map(d => d.n_bin); + const maxN = d3.max(allN) || 1; + const rScale = d3.scaleSqrt().domain([0, maxN]).range([3, 14]); + + let colorIdx = 0; + const modelColors = new Map(); + for (const mpk of selectedModels) { + modelColors.set(mpk, colorScale(colorIdx++)); + } + + // Draw lines and circles for each model + for (const [modelPk, points] of byModel) { + const color = modelColors.get(modelPk) || '#999'; + const sorted = [...points].sort((a, b) => a.bin_midpoint - b.bin_midpoint); + + // Line + const line = d3.line() + .x(d => x(d.forecast_mean)) + .y(d => y(d.resolution_rate)); + + g.append('path') + .datum(sorted) + .attr('d', line) + .attr('fill', 'none') + .attr('stroke', color) + .attr('stroke-width', 2) + .attr('opacity', 0.8); + + // Circles + g.selectAll(null) + .data(sorted) + .enter() + .append('circle') + .attr('cx', d => x(d.forecast_mean)) + .attr('cy', d => y(d.resolution_rate)) + .attr('r', d => rScale(d.n_bin)) + .attr('fill', color) + .attr('fill-opacity', 0.7) + .attr('stroke', color) + .attr('stroke-width', 1) + .on('mouseover', function (event, d) { + d3.select(this).attr('fill-opacity', 1).attr('stroke-width', 2); + tip.style('opacity', 1) + .html( + `${modelLabel(d)}
` + + `Bin midpoint: ${d.bin_midpoint}
` + + `Forecast mean: ${d3.format('.3f')(d.forecast_mean)}
` + + `Observed freq: ${d3.format('.3f')(d.resolution_rate)}
` + + `N: ${d.n_bin}` + ) + .style('left', (event.pageX + 12) + 'px') + .style('top', (event.pageY - 20) + 'px'); + }) + .on('mouseout', function () { + d3.select(this).attr('fill-opacity', 0.7).attr('stroke-width', 1); + tip.style('opacity', 0); + }); + } + + // Legend + const legend = g.append('g') + .attr('transform', `translate(${width - 180}, 10)`); + + let ly = 0; + for (const [modelPk, color] of modelColors) { + const meta = metricsData.find(d => d.model_pk === modelPk); + const label = meta ? modelLabel(meta) : modelPk; + const row = legend.append('g').attr('transform', `translate(0,${ly})`); + row.append('rect') + .attr('width', 12).attr('height', 12) + .attr('fill', color).attr('rx', 2); + row.append('text') + .attr('x', 16).attr('y', 10) + .style('font-size', '11px') + .attr('fill', 'currentColor') + .text(label.length > 22 ? label.slice(0, 20) + '...' : label); + ly += 18; + } + } + + function renderTable() { + const container = d3.select('#metrics-table-container'); + container.html(''); + + const filtered = metricsData.filter(d => selectedModels.has(d.model_pk)); + if (filtered.length === 0) { + container.append('p').style('color', '#888').text('Select models above to see metrics.'); + return; + } + + const table = container.append('table').attr('class', 'calibration-table'); + const thead = table.append('thead'); + const tbody = table.append('tbody'); + + const columns = [ + { key: 'label', label: 'Model' }, + { key: 'ece', label: 'ECE' }, + { key: 'reliability', label: 'Reliability' }, + { key: 'resolution', label: 'Resolution' }, + { key: 'uncertainty', label: 'Uncertainty' }, + { key: 'sharpness', label: 'Sharpness' }, + { key: 'n_forecasts', label: 'N' }, + ]; + + thead.append('tr').selectAll('th') + .data(columns) + .enter() + .append('th') + .text(d => d.label); + + filtered.forEach(d => { + const row = tbody.append('tr'); + columns.forEach(col => { + const val = col.key === 'label' ? modelLabel(d) + : col.key === 'n_forecasts' ? 
d3.format(',')(d[col.key]) + : d3.format('.4f')(d[col.key]); + row.append('td').text(val); + }); + }); + } + + // Event listeners + document.querySelectorAll('input[name="lbSelect"]').forEach(el => { + el.addEventListener('change', loadData); + }); + + // Initial load + loadData(); +})(); diff --git a/src/www.forecastbench.org/calibration/index.md b/src/www.forecastbench.org/calibration/index.md new file mode 100644 index 00000000..9f1c2706 --- /dev/null +++ b/src/www.forecastbench.org/calibration/index.md @@ -0,0 +1,55 @@ +--- +layout: splash +title: "Calibration" +permalink: /calibration/ +custom_css: /assets/css/custom.scss +after_footer_scripts: + - https://cdn.jsdelivr.net/npm/d3@7 + - /assets/js/calibration_chart.js +--- + +
+

Model Calibration Analysis

+

A well-calibrated forecaster's predicted probabilities match observed frequencies: when it says 70%, the event should occur ~70% of the time. + This page shows reliability diagrams and calibration metrics for each model on ForecastBench.

+
    +
  • Points on the diagonal indicate perfect calibration.
  • Points above the diagonal indicate underconfidence (events happen more often than predicted).
  • Points below the diagonal indicate overconfidence (events happen less often than predicted).
  • Circle size reflects the number of forecasts in each probability bin.
+ +
+
+
+
Leaderboard
+
+
+ + +
+
+ + +
+
+
+
+
Models
+
+
+
+
+
+ +

Calibration Metrics

+

+ ECE (Expected Calibration Error): weighted mean absolute gap between predicted probability and observed frequency. Lower is better. + Reliability: weighted mean squared gap (Brier decomposition). Lower is better. + Resolution: how much observed frequencies vary across bins. Higher is better. + Uncertainty: base-rate variance (same for all models). Sharpness: spread of forecast probabilities. +

+
+
+ +
diff --git a/src/www.forecastbench.org/entrypoint.sh.template b/src/www.forecastbench.org/entrypoint.sh.template index 3ed112ae..51bc4062 100644 --- a/src/www.forecastbench.org/entrypoint.sh.template +++ b/src/www.forecastbench.org/entrypoint.sh.template @@ -44,6 +44,13 @@ mkdir -p "$ASSETS_MNT/$DATA_DIR" cp -a "$PUBLIC_RELEASE_MNT/leaderboards/csv/sota_graph_baseline.csv" "$ASSETS_MNT/$DATA_DIR/" cp -a "$PUBLIC_RELEASE_MNT/leaderboards/csv/sota_graph_tournament.csv" "$ASSETS_MNT/$DATA_DIR/" +# calibration data +for f in calibration_metrics_baseline.csv calibration_metrics_tournament.csv \ + calibration_curves_baseline.json calibration_curves_tournament.json; do + [ -f "$PUBLIC_RELEASE_MNT/leaderboards/csv/$f" ] && \ + cp -a "$PUBLIC_RELEASE_MNT/leaderboards/csv/$f" "$ASSETS_MNT/$DATA_DIR/" +done + # archived forecast sets FORECAST_SETS_MNT="$MOUNT_POINT/FORECAST_SETS_BUCKET" FORECAST_SETS_DIR="assets/data/forecast-sets" diff --git a/tests/generate_test_calibration_data.py b/tests/generate_test_calibration_data.py new file mode 100644 index 00000000..3fea432b --- /dev/null +++ b/tests/generate_test_calibration_data.py @@ -0,0 +1,157 @@ +"""Generate test calibration data files for local website verification. + +Run: python tests/generate_test_calibration_data.py +Then: cd src/www.forecastbench.org && bundle exec jekyll serve + +This writes to src/www.forecastbench.org/assets/data/ so the calibration page can render. +The generated data uses synthetic forecasts with known calibration properties. 
N_BINS = 10


def generate_forecasts(model_type, rng, n=2000):
    """Generate synthetic forecasts with known calibration properties."""
    # Ground-truth event probabilities; outcomes are drawn from these, so a
    # forecaster that reports them verbatim is perfectly calibrated.
    truth = rng.uniform(0.05, 0.95, n)

    if model_type == "well_calibrated":
        preds = truth
    elif model_type == "overconfident":
        # Exaggerate: push probabilities away from 0.5 toward the extremes.
        preds = np.where(truth > 0.5, truth + (1 - truth) * 0.4, truth * 0.6)
    elif model_type == "underconfident":
        # Hedge: shrink probabilities halfway toward 0.5.
        preds = 0.5 + (truth - 0.5) * 0.5
    elif model_type == "sharp":
        # Mostly near-0 or near-1 probabilities.
        preds = np.where(
            truth > 0.5, 0.85 + rng.uniform(0, 0.14, n), 0.01 + rng.uniform(0, 0.14, n)
        )
    else:
        preds = truth

    preds = np.clip(preds, 0.01, 0.99)
    # Outcomes follow the true base rate (well-calibrated ground truth).
    outcomes = (rng.uniform(0, 1, n) < truth).astype(float)
    return preds, outcomes


def compute_metrics_and_curves(all_data):
    """Compute calibration metrics and curve data from synthetic forecasts."""
    edges = np.linspace(0, 1, N_BINS + 1)
    midpoints = (edges[:-1] + edges[1:]) / 2

    metrics_rows = []
    curve_rows = []

    # Uncertainty term is shared: base rate pooled over all synthetic models.
    base_rate = np.concatenate([entry["outcomes"] for entry in all_data]).mean()
    uncertainty = base_rate * (1 - base_rate)

    for entry in all_data:
        preds = entry["forecasts"]
        outs = entry["outcomes"]
        bin_ids = np.digitize(preds, edges, right=True).clip(1, N_BINS)
        total = len(preds)

        ece = 0.0
        reliability = 0.0
        resolution = 0.0

        for b in range(1, N_BINS + 1):
            in_bin = bin_ids == b
            if not in_bin.any():
                continue
            count = in_bin.sum()
            w = count / total
            f_mean = preds[in_bin].mean()
            o_rate = outs[in_bin].mean()
            ece += w * abs(f_mean - o_rate)
            reliability += w * (f_mean - o_rate) ** 2
            resolution += w * (o_rate - base_rate) ** 2

            curve_rows.append({
                "model_pk": entry["model_pk"],
                "organization": entry["organization"],
                "model": entry["model"],
                "bin_midpoint": round(float(midpoints[b - 1]), 3),
                "forecast_mean": round(float(f_mean), 4),
                "resolution_rate": round(float(o_rate), 4),
                "n_bin": int(count),
            })

        metrics_rows.append({
            "model_pk": entry["model_pk"],
            "organization": entry["organization"],
            "model_organization": entry["organization"],
            "model": entry["model"],
            "ece": round(ece, 6),
            "reliability": round(reliability, 6),
            "resolution": round(resolution, 6),
            "uncertainty": round(uncertainty, 6),
            # numpy .std() (ddof=0), matching the original script.
            "sharpness": round(float(preds.std()), 6),
            "n_forecasts": total,
        })

    return pd.DataFrame(metrics_rows), curve_rows
"""Tests for calibration metrics: ECE, Brier decomposition, sharpness.

These tests duplicate the calibration logic from src/leaderboard/main.py to avoid importing
the full main module (which pulls in pyfixest, joblib, etc.). The functions under test are
compute_calibration_metrics() and compute_calibration_curve_data().
"""

import numpy as np
import pandas as pd

# ── Inline copies of the functions under test ──────────────────────────────────
# Keep in sync with src/leaderboard/main.py


def compute_calibration_metrics(df, n_bins=10):
    """Compute calibration metrics per model."""
    edges = np.linspace(0, 1, n_bins + 1)
    data = df.copy()
    data["bin"] = np.digitize(data["forecast"], edges, right=True).clip(1, n_bins)

    # Base rate pooled across all models; uncertainty is shared by every row.
    base_rate = data["resolved_to"].mean()
    uncertainty = base_rate * (1 - base_rate)

    records = []
    keys = ["model_pk", "organization", "model_organization", "model"]
    for (pk, org, model_org, model_name), model_grp in data.groupby(keys):
        total = len(model_grp)
        ece = 0.0
        reliability = 0.0
        resolution = 0.0

        for _, bucket in model_grp.groupby("bin"):
            w = len(bucket) / total
            f_mean = bucket["forecast"].mean()
            o_rate = bucket["resolved_to"].mean()
            ece += w * abs(f_mean - o_rate)
            reliability += w * (f_mean - o_rate) ** 2
            resolution += w * (o_rate - base_rate) ** 2

        records.append(
            {
                "model_pk": pk,
                "organization": org,
                "model_organization": model_org,
                "model": model_name,
                "ece": round(ece, 6),
                "reliability": round(reliability, 6),
                "resolution": round(resolution, 6),
                "uncertainty": round(uncertainty, 6),
                "sharpness": round(model_grp["forecast"].std(), 6),
                "n_forecasts": total,
            }
        )

    return pd.DataFrame(records)


def compute_calibration_curve_data(df, n_bins=10):
    """Compute per-bin calibration curve data."""
    edges = np.linspace(0, 1, n_bins + 1)
    midpoints = (edges[:-1] + edges[1:]) / 2
    data = df.copy()
    data["bin"] = np.digitize(data["forecast"], edges, right=True).clip(1, n_bins)

    records = []
    for (pk, org, model_name), model_grp in data.groupby(
        ["model_pk", "organization", "model"]
    ):
        for b, bucket in model_grp.groupby("bin"):
            records.append(
                {
                    "model_pk": pk,
                    "organization": org,
                    "model": model_name,
                    "bin_midpoint": round(midpoints[b - 1], 3),
                    "forecast_mean": round(bucket["forecast"].mean(), 4),
                    "resolution_rate": round(bucket["resolved_to"].mean(), 4),
                    "n_bin": len(bucket),
                }
            )

    return pd.DataFrame(records)


# ── Helpers ────────────────────────────────────────────────────────────────────


def _make_forecasts(
    forecasts, outcomes, model_pk="test_model", organization="TestOrg", model="test"
):
    """Build a minimal DataFrame matching the expected schema."""
    briers = [(p - y) ** 2 for p, y in zip(forecasts, outcomes)]
    return pd.DataFrame(
        {
            "forecast": forecasts,
            "resolved_to": outcomes,
            "brier_score": briers,
            "model_pk": model_pk,
            "organization": organization,
            "model_organization": organization,
            "model": model,
        }
    )


# ── Tests ──────────────────────────────────────────────────────────────────────


class TestBrierDecomposition:
    """Verify reliability - resolution + uncertainty ≈ mean(brier_score)."""

    def test_decomposition_identity_synthetic(self):
        """With known data, the Brier decomposition identity should hold."""
        rng = np.random.default_rng(42)
        forecasts = rng.uniform(0, 1, 2000)
        outcomes = (rng.uniform(0, 1, 2000) < forecasts).astype(float)
        df = _make_forecasts(forecasts.tolist(), outcomes.tolist())

        metrics = compute_calibration_metrics(df, n_bins=10)
        assert len(metrics) == 1

        row = metrics.iloc[0]
        mean_brier = df["brier_score"].mean()
        decomp = row["reliability"] - row["resolution"] + row["uncertainty"]
        assert abs(decomp - mean_brier) < 0.01, (
            f"Brier decomposition failed: {row['reliability']:.4f} - {row['resolution']:.4f} "
            f"+ {row['uncertainty']:.4f} = {decomp:.4f} vs mean_brier = {mean_brier:.4f}"
        )

    def test_perfect_calibration_has_zero_ece(self):
        """A perfectly calibrated forecaster should have ECE ≈ 0."""
        rng = np.random.default_rng(123)
        forecasts, outcomes = [], []
        for p in (0.1, 0.3, 0.5, 0.7, 0.9):
            forecasts.extend([p] * 500)
            outcomes.extend(rng.binomial(1, p, 500).tolist())

        metrics = compute_calibration_metrics(_make_forecasts(forecasts, outcomes), n_bins=10)
        row = metrics.iloc[0]
        assert row["ece"] < 0.03, f"ECE too high for well-calibrated forecaster: {row['ece']:.4f}"

    def test_overconfident_forecaster_has_high_ece(self):
        """A forecaster who always says 0.95 when base rate is 0.5 should have high ECE."""
        rng = np.random.default_rng(456)
        outcomes = rng.binomial(1, 0.5, 1000).astype(float).tolist()
        df = _make_forecasts([0.95] * 1000, outcomes)

        row = compute_calibration_metrics(df, n_bins=10).iloc[0]
        assert (
            row["ece"] > 0.3
        ), f"ECE should be high for overconfident forecaster: {row['ece']:.4f}"

    def test_multiple_models(self):
        """Metrics should return one row per model."""
        rng = np.random.default_rng(789)
        frames = []
        for i in range(3):
            preds = rng.uniform(0, 1, 500)
            outs = (rng.uniform(0, 1, 500) < preds).astype(float)
            frames.append(
                _make_forecasts(
                    preds.tolist(),
                    outs.tolist(),
                    model_pk=f"model_{i}",
                    model=f"model_{i}",
                )
            )

        metrics = compute_calibration_metrics(pd.concat(frames, ignore_index=True), n_bins=10)
        assert len(metrics) == 3


class TestCalibrationCurves:
    """Test compute_calibration_curve_data."""

    def test_curve_data_shape(self):
        """Curve data should have at most n_bins rows per model."""
        rng = np.random.default_rng(101)
        preds = rng.uniform(0, 1, 1000)
        outs = (rng.uniform(0, 1, 1000) < preds).astype(float)

        curves = compute_calibration_curve_data(
            _make_forecasts(preds.tolist(), outs.tolist()), n_bins=10
        )
        assert len(curves) <= 10
        assert set(curves.columns) == {
            "model_pk",
            "organization",
            "model",
            "bin_midpoint",
            "forecast_mean",
            "resolution_rate",
            "n_bin",
        }

    def test_bin_counts_sum_to_total(self):
        """Sum of n_bin across bins should equal total forecasts."""
        rng = np.random.default_rng(202)
        preds = rng.uniform(0, 1, 800)
        outs = (rng.uniform(0, 1, 800) < preds).astype(float)

        curves = compute_calibration_curve_data(
            _make_forecasts(preds.tolist(), outs.tolist()), n_bins=10
        )
        assert curves["n_bin"].sum() == 800