diff --git a/src/leaderboard/main.py b/src/leaderboard/main.py
index 303a0dc3..6b0be234 100644
--- a/src/leaderboard/main.py
+++ b/src/leaderboard/main.py
@@ -1595,6 +1595,139 @@ def brier_skill_score(df: pd.DataFrame) -> pd.DataFrame:
return df[orig_cols + ["brier_skill_score"]]
+def compute_calibration_metrics(
+ df: pd.DataFrame,
+ n_bins: int = 10,
+) -> pd.DataFrame:
+ """Compute calibration metrics for each model.
+
+ Metrics computed per model:
+ - ECE (Expected Calibration Error): weighted mean of |forecast_mean - resolution_rate| per bin
+ - Brier decomposition (Murphy 1973): reliability, resolution, uncertainty
+ - Sharpness: standard deviation of forecasts
+
+ Args:
+ df (pd.DataFrame): Resolved forecasts with 'forecast', 'resolved_to',
+ 'model_pk', 'organization', 'model_organization', 'model' columns.
+ n_bins (int): Number of equal-width bins on [0, 1].
+
+ Returns:
+ pd.DataFrame: One row per model with columns: model_pk, organization, model_organization,
+ model, ece, reliability, resolution, uncertainty, sharpness, n_forecasts.
+ """
+ bin_edges = np.linspace(0, 1, n_bins + 1)
+ df = df.copy()
+ df["bin"] = np.digitize(df["forecast"], bin_edges, right=True).clip(1, n_bins)
+
+ overall_base_rate = df["resolved_to"].mean()
+ overall_uncertainty = overall_base_rate * (1 - overall_base_rate)
+
+ rows = []
+ for (model_pk, org, model_org, model), grp in df.groupby(
+ ["model_pk", "organization", "model_organization", "model"]
+ ):
+ n_total = len(grp)
+ ece = 0.0
+ reliability = 0.0
+ resolution = 0.0
+
+ for _, bin_grp in grp.groupby("bin"):
+ n_k = len(bin_grp)
+ weight = n_k / n_total
+ forecast_mean = bin_grp["forecast"].mean()
+ observed_rate = bin_grp["resolved_to"].mean()
+ ece += weight * abs(forecast_mean - observed_rate)
+ reliability += weight * (forecast_mean - observed_rate) ** 2
+ resolution += weight * (observed_rate - overall_base_rate) ** 2
+
+ sharpness = grp["forecast"].std()
+
+ rows.append(
+ {
+ "model_pk": model_pk,
+ "organization": org,
+ "model_organization": model_org,
+ "model": model,
+ "ece": round(ece, 6),
+ "reliability": round(reliability, 6),
+ "resolution": round(resolution, 6),
+ "uncertainty": round(overall_uncertainty, 6),
+ "sharpness": round(sharpness, 6),
+ "n_forecasts": n_total,
+ }
+ )
+
+ return pd.DataFrame(rows)
+
+
+def compute_calibration_curve_data(
+ df: pd.DataFrame,
+ n_bins: int = 10,
+) -> pd.DataFrame:
+ """Compute per-bin calibration curve data for reliability diagrams.
+
+ Args:
+ df (pd.DataFrame): Resolved forecasts with 'forecast', 'resolved_to',
+ 'model_pk', 'organization', 'model' columns.
+ n_bins (int): Number of equal-width bins on [0, 1].
+
+ Returns:
+ pd.DataFrame: Per-(model, bin) rows with columns: model_pk, organization, model,
+ bin_midpoint, forecast_mean, resolution_rate, n_bin.
+ """
+ bin_edges = np.linspace(0, 1, n_bins + 1)
+ bin_midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2
+ df = df.copy()
+ df["bin"] = np.digitize(df["forecast"], bin_edges, right=True).clip(1, n_bins)
+
+ rows = []
+ for (model_pk, org, model), grp in df.groupby(["model_pk", "organization", "model"]):
+ for bin_idx, bin_grp in grp.groupby("bin"):
+ rows.append(
+ {
+ "model_pk": model_pk,
+ "organization": org,
+ "model": model,
+ "bin_midpoint": round(bin_midpoints[bin_idx - 1], 3),
+ "forecast_mean": round(bin_grp["forecast"].mean(), 4),
+ "resolution_rate": round(bin_grp["resolved_to"].mean(), 4),
+ "n_bin": len(bin_grp),
+ }
+ )
+
+ return pd.DataFrame(rows)
+
+
+def write_calibration_data(
+ df_calibration_metrics: pd.DataFrame,
+ df_calibration_curves: pd.DataFrame,
+ leaderboard_type: "LeaderboardType",
+) -> None:
+ """Write calibration metrics CSV and curve data JSON to the public bucket.
+
+ Args:
+ df_calibration_metrics (pd.DataFrame): One row per model with calibration metrics.
+ df_calibration_curves (pd.DataFrame): Per-(model, bin) curve data for reliability diagrams.
+ leaderboard_type (LeaderboardType): baseline or tournament.
+
+ Returns:
+ None.
+ """
+ directory = data_utils.get_mounted_bucket(bucket=env.PUBLIC_RELEASE_BUCKET)
+ destination_folder = "leaderboards/csv"
+ os.makedirs(f"{directory}/{destination_folder}", exist_ok=True)
+
+ # Write metrics CSV
+ metrics_filename = f"{destination_folder}/calibration_metrics_{leaderboard_type.value}.csv"
+ df_calibration_metrics.to_csv(f"{directory}/{metrics_filename}", index=False)
+
+ # Write curves JSON
+ curves_filename = f"{destination_folder}/calibration_curves_{leaderboard_type.value}.json"
+ df_calibration_curves.to_json(f"{directory}/{curves_filename}", orient="records", indent=2)
+
+ logger.info(f"Wrote calibration data for {leaderboard_type.value} leaderboard.")
+
+
def score_models(
df: pd.DataFrame,
scoring_funcs: List[Callable[[pd.DataFrame], pd.DataFrame]],
@@ -2633,6 +2766,11 @@ def make_leaderboard(
df = remove_x_pct_oracles(df=df)
df_leaderboard = remove_x_pct_oracles(df=df_leaderboard)
+ # Calibration analysis (oracles already removed from df above)
+ df_resolved = df[df["resolved"].astype(bool)].copy()
+ df_calibration_metrics = compute_calibration_metrics(df=df_resolved)
+ df_calibration_curves = compute_calibration_curve_data(df=df_resolved)
+
# Get simulated scores
df_simulated_scores_dataset, df_simulated_scores_market, df_simulated_scores_overall = (
generate_simulated_leaderboards(
@@ -2712,6 +2850,20 @@ def make_leaderboard(
leaderboard_type=leaderboard_type,
)
+ # Write calibration data for this leaderboard type
+ lt_model_pks = set(df_leaderboard_lt["model_pk"].unique())
+ df_cal_lt = df_calibration_metrics[
+ df_calibration_metrics["model_pk"].isin(lt_model_pks)
+ ]
+ df_curves_lt = df_calibration_curves[
+ df_calibration_curves["model_pk"].isin(lt_model_pks)
+ ]
+ write_calibration_data(
+ df_calibration_metrics=df_cal_lt,
+ df_calibration_curves=df_curves_lt,
+ leaderboard_type=leaderboard_type,
+ )
+
def download_and_compile_processed_forecast_files(bucket: str) -> List[pd.DataFrame]:
"""Download and compile processed forecast files into entries list.
diff --git a/src/www.forecastbench.org/_data/navigation.yml b/src/www.forecastbench.org/_data/navigation.yml
index 0f80e684..a50da864 100644
--- a/src/www.forecastbench.org/_data/navigation.yml
+++ b/src/www.forecastbench.org/_data/navigation.yml
@@ -11,6 +11,9 @@ main:
- title: "Explore"
url: "/explore/"
icon: "fa-solid fa-lightbulb"
+ - title: "Calibration"
+ url: "/calibration/"
+ icon: "fa-solid fa-bullseye"
- title: "Datasets"
url: "/datasets/"
icon: "fa-solid fa-layer-group"
diff --git a/src/www.forecastbench.org/assets/css/custom.scss b/src/www.forecastbench.org/assets/css/custom.scss
index 31765f16..45952b1e 100644
--- a/src/www.forecastbench.org/assets/css/custom.scss
+++ b/src/www.forecastbench.org/assets/css/custom.scss
@@ -1560,3 +1560,45 @@ input[type="checkbox"]:checked + .toggle-slider:before {
.fb-footer__divider {
opacity: 0.6;
}
+
+/* ── Calibration page ── */
+
+#reliability-diagram {
+ width: 100%;
+ max-width: 580px;
+ margin: 1.5rem auto;
+}
+
+#reliability-diagram svg {
+ display: block;
+}
+
+.calibration-table {
+ width: 100%;
+ border-collapse: collapse;
+ font-size: 0.85rem;
+ margin: 1rem 0;
+}
+
+.calibration-table th,
+.calibration-table td {
+ padding: 0.45rem 0.75rem;
+ text-align: right;
+ border-bottom: 1px solid #e5e7eb;
+}
+
+.calibration-table th {
+ font-weight: 600;
+ background: #f9fafb;
+ position: sticky;
+ top: 0;
+}
+
+.calibration-table td:first-child,
+.calibration-table th:first-child {
+ text-align: left;
+}
+
+.calibration-table tbody tr:hover {
+ background: #f1f5f9;
+}
diff --git a/src/www.forecastbench.org/assets/js/calibration_chart.js b/src/www.forecastbench.org/assets/js/calibration_chart.js
new file mode 100644
index 00000000..e9bd9064
--- /dev/null
+++ b/src/www.forecastbench.org/assets/js/calibration_chart.js
@@ -0,0 +1,282 @@
+(function () {
+ const CURVES_PATH_BASELINE = '/assets/data/calibration_curves_baseline.json';
+ const CURVES_PATH_TOURNAMENT = '/assets/data/calibration_curves_tournament.json';
+ const METRICS_PATH_BASELINE = '/assets/data/calibration_metrics_baseline.csv';
+ const METRICS_PATH_TOURNAMENT = '/assets/data/calibration_metrics_tournament.csv';
+
+ const MAX_DEFAULT_MODELS = 5;
+ const MARGIN = { top: 20, right: 30, bottom: 50, left: 55 };
+ const SIZE = 500;
+
+ const colorScale = d3.scaleOrdinal(d3.schemeTableau10);
+ const tip = d3.select('#tooltip');
+
+ let curveData = null;
+ let metricsData = null;
+ let selectedModels = new Set();
+
+ function getLeaderboardType() {
+ return document.querySelector('input[name="lbSelect"]:checked').value;
+ }
+
+ function getCurvesPath() {
+ return getLeaderboardType() === 'tournament' ? CURVES_PATH_TOURNAMENT : CURVES_PATH_BASELINE;
+ }
+
+ function getMetricsPath() {
+ return getLeaderboardType() === 'tournament' ? METRICS_PATH_TOURNAMENT : METRICS_PATH_BASELINE;
+ }
+
+ function modelLabel(d) {
+ const org = d.organization || '';
+ const model = d.model || d.model_pk || '';
+ if (org && org !== model) return org + ' / ' + model;
+ return model;
+ }
+
+ function loadData() {
+ Promise.all([
+ fetch(getCurvesPath()).then(r => r.json()),
+ d3.csv(getMetricsPath()),
+ ]).then(([curves, metrics]) => {
+ curveData = curves;
+ metricsData = metrics.map(d => ({
+ ...d,
+ ece: +d.ece,
+ reliability: +d.reliability,
+ resolution: +d.resolution,
+ uncertainty: +d.uncertainty,
+ sharpness: +d.sharpness,
+ n_forecasts: +d.n_forecasts,
+ }));
+
+ // Sort by ECE ascending, pick top N as default
+ metricsData.sort((a, b) => a.ece - b.ece);
+ const defaultModels = metricsData.slice(0, MAX_DEFAULT_MODELS).map(d => d.model_pk);
+ selectedModels = new Set(defaultModels);
+
+ buildModelCheckboxes();
+ renderChart();
+ renderTable();
+ }).catch(err => {
+ console.error('Failed to load calibration data:', err);
+      d3.select('#reliability-diagram').html(
+        '<p style="color:#888;">Calibration data not yet available. ' +
+        'Run the leaderboard pipeline to generate calibration artifacts.</p>'
+      );
+ });
+ }
+
+ function buildModelCheckboxes() {
+ const container = d3.select('#model-checkboxes');
+ container.html('');
+ metricsData.forEach(d => {
+ const id = 'model_' + d.model_pk.replace(/[^a-zA-Z0-9]/g, '_');
+ const div = container.append('div').attr('class', 'tag-option');
+ div.append('input')
+ .attr('type', 'checkbox')
+ .attr('id', id)
+ .attr('value', d.model_pk)
+ .property('checked', selectedModels.has(d.model_pk))
+ .on('change', function () {
+ if (this.checked) {
+ selectedModels.add(d.model_pk);
+ } else {
+ selectedModels.delete(d.model_pk);
+ }
+ renderChart();
+ renderTable();
+ });
+ div.append('label')
+ .attr('for', id)
+ .text(modelLabel(d));
+ });
+ }
+
+ function renderChart() {
+ const container = d3.select('#reliability-diagram');
+ container.html('');
+
+ const width = SIZE;
+ const height = SIZE;
+
+ const svg = container.append('svg')
+ .attr('viewBox', `0 0 ${width + MARGIN.left + MARGIN.right} ${height + MARGIN.top + MARGIN.bottom}`)
+ .attr('preserveAspectRatio', 'xMidYMid meet')
+ .style('max-width', (width + MARGIN.left + MARGIN.right) + 'px')
+ .style('width', '100%');
+
+ const g = svg.append('g')
+ .attr('transform', `translate(${MARGIN.left},${MARGIN.top})`);
+
+ const x = d3.scaleLinear().domain([0, 1]).range([0, width]);
+ const y = d3.scaleLinear().domain([0, 1]).range([height, 0]);
+
+ // Axes
+ g.append('g')
+ .attr('transform', `translate(0,${height})`)
+ .call(d3.axisBottom(x).ticks(10))
+ .append('text')
+ .attr('x', width / 2)
+ .attr('y', 40)
+ .attr('fill', 'currentColor')
+ .attr('text-anchor', 'middle')
+ .style('font-size', '13px')
+ .text('Forecast Probability');
+
+ g.append('g')
+ .call(d3.axisLeft(y).ticks(10))
+ .append('text')
+ .attr('transform', 'rotate(-90)')
+ .attr('x', -height / 2)
+ .attr('y', -42)
+ .attr('fill', 'currentColor')
+ .attr('text-anchor', 'middle')
+ .style('font-size', '13px')
+ .text('Observed Frequency');
+
+ // Perfect calibration diagonal
+ g.append('line')
+ .attr('x1', x(0)).attr('y1', y(0))
+ .attr('x2', x(1)).attr('y2', y(1))
+ .attr('stroke', '#888')
+ .attr('stroke-dasharray', '6,4')
+ .attr('stroke-width', 1.5)
+ .attr('opacity', 0.7);
+
+ // Filter curves for selected models
+ const filteredCurves = curveData.filter(d => selectedModels.has(d.model_pk));
+
+ // Group by model
+ const byModel = d3.group(filteredCurves, d => d.model_pk);
+
+ // Size scale for circles
+ const allN = filteredCurves.map(d => d.n_bin);
+ const maxN = d3.max(allN) || 1;
+ const rScale = d3.scaleSqrt().domain([0, maxN]).range([3, 14]);
+
+ let colorIdx = 0;
+ const modelColors = new Map();
+ for (const mpk of selectedModels) {
+ modelColors.set(mpk, colorScale(colorIdx++));
+ }
+
+ // Draw lines and circles for each model
+ for (const [modelPk, points] of byModel) {
+ const color = modelColors.get(modelPk) || '#999';
+ const sorted = [...points].sort((a, b) => a.bin_midpoint - b.bin_midpoint);
+
+ // Line
+ const line = d3.line()
+ .x(d => x(d.forecast_mean))
+ .y(d => y(d.resolution_rate));
+
+ g.append('path')
+ .datum(sorted)
+ .attr('d', line)
+ .attr('fill', 'none')
+ .attr('stroke', color)
+ .attr('stroke-width', 2)
+ .attr('opacity', 0.8);
+
+ // Circles
+ g.selectAll(null)
+ .data(sorted)
+ .enter()
+ .append('circle')
+ .attr('cx', d => x(d.forecast_mean))
+ .attr('cy', d => y(d.resolution_rate))
+ .attr('r', d => rScale(d.n_bin))
+ .attr('fill', color)
+ .attr('fill-opacity', 0.7)
+ .attr('stroke', color)
+ .attr('stroke-width', 1)
+ .on('mouseover', function (event, d) {
+ d3.select(this).attr('fill-opacity', 1).attr('stroke-width', 2);
+ tip.style('opacity', 1)
+            .html(
+              `<strong>${modelLabel(d)}</strong><br/>` +
+              `Bin midpoint: ${d.bin_midpoint}<br/>` +
+              `Forecast mean: ${d3.format('.3f')(d.forecast_mean)}<br/>` +
+              `Observed freq: ${d3.format('.3f')(d.resolution_rate)}<br/>` +
+              `N: ${d.n_bin}`
+            )
+ .style('left', (event.pageX + 12) + 'px')
+ .style('top', (event.pageY - 20) + 'px');
+ })
+ .on('mouseout', function () {
+ d3.select(this).attr('fill-opacity', 0.7).attr('stroke-width', 1);
+ tip.style('opacity', 0);
+ });
+ }
+
+ // Legend
+ const legend = g.append('g')
+ .attr('transform', `translate(${width - 180}, 10)`);
+
+ let ly = 0;
+ for (const [modelPk, color] of modelColors) {
+ const meta = metricsData.find(d => d.model_pk === modelPk);
+ const label = meta ? modelLabel(meta) : modelPk;
+ const row = legend.append('g').attr('transform', `translate(0,${ly})`);
+ row.append('rect')
+ .attr('width', 12).attr('height', 12)
+ .attr('fill', color).attr('rx', 2);
+ row.append('text')
+ .attr('x', 16).attr('y', 10)
+ .style('font-size', '11px')
+ .attr('fill', 'currentColor')
+ .text(label.length > 22 ? label.slice(0, 20) + '...' : label);
+ ly += 18;
+ }
+ }
+
+ function renderTable() {
+ const container = d3.select('#metrics-table-container');
+ container.html('');
+
+ const filtered = metricsData.filter(d => selectedModels.has(d.model_pk));
+ if (filtered.length === 0) {
+ container.append('p').style('color', '#888').text('Select models above to see metrics.');
+ return;
+ }
+
+ const table = container.append('table').attr('class', 'calibration-table');
+ const thead = table.append('thead');
+ const tbody = table.append('tbody');
+
+ const columns = [
+ { key: 'label', label: 'Model' },
+ { key: 'ece', label: 'ECE' },
+ { key: 'reliability', label: 'Reliability' },
+ { key: 'resolution', label: 'Resolution' },
+ { key: 'uncertainty', label: 'Uncertainty' },
+ { key: 'sharpness', label: 'Sharpness' },
+ { key: 'n_forecasts', label: 'N' },
+ ];
+
+ thead.append('tr').selectAll('th')
+ .data(columns)
+ .enter()
+ .append('th')
+ .text(d => d.label);
+
+ filtered.forEach(d => {
+ const row = tbody.append('tr');
+ columns.forEach(col => {
+ const val = col.key === 'label' ? modelLabel(d)
+ : col.key === 'n_forecasts' ? d3.format(',')(d[col.key])
+ : d3.format('.4f')(d[col.key]);
+ row.append('td').text(val);
+ });
+ });
+ }
+
+ // Event listeners
+ document.querySelectorAll('input[name="lbSelect"]').forEach(el => {
+ el.addEventListener('change', loadData);
+ });
+
+ // Initial load
+ loadData();
+})();
diff --git a/src/www.forecastbench.org/calibration/index.md b/src/www.forecastbench.org/calibration/index.md
new file mode 100644
index 00000000..9f1c2706
--- /dev/null
+++ b/src/www.forecastbench.org/calibration/index.md
@@ -0,0 +1,55 @@
+---
+layout: splash
+title: "Calibration"
+permalink: /calibration/
+custom_css: /assets/css/custom.scss
+after_footer_scripts:
+ - https://cdn.jsdelivr.net/npm/d3@7
+ - /assets/js/calibration_chart.js
+---
+
+<div class="calibration-page">
+  <h1>Model Calibration Analysis</h1>
+  <p>
+    A well-calibrated forecaster's predicted probabilities match observed frequencies: when it says 70%, the event should occur ~70% of the time.
+    This page shows reliability diagrams and calibration metrics for each model on ForecastBench.
+  </p>
+  <ul>
+    <li>Points on the diagonal indicate perfect calibration.</li>
+    <li>Points above the diagonal indicate underconfidence (events happen more often than predicted).</li>
+    <li>Points below the diagonal indicate overconfidence (events happen less often than predicted).</li>
+    <li>Circle size reflects the number of forecasts in each probability bin.</li>
+  </ul>
+
+  <div class="lb-select">
+    <label><input type="radio" name="lbSelect" value="baseline" checked> Baseline</label>
+    <label><input type="radio" name="lbSelect" value="tournament"> Tournament</label>
+  </div>
+
+  <div id="model-checkboxes" class="tag-options"></div>
+
+  <div id="reliability-diagram"></div>
+
+  <div id="tooltip" class="chart-tooltip" style="position: absolute; opacity: 0;"></div>
+
+  <h2>Calibration Metrics</h2>
+  <p>
+    <strong>ECE (Expected Calibration Error)</strong>: weighted mean absolute gap between predicted probability and observed frequency. Lower is better.
+    <strong>Reliability</strong>: weighted mean squared gap (Brier decomposition). Lower is better.
+    <strong>Resolution</strong>: how much observed frequencies vary across bins. Higher is better.
+    <strong>Uncertainty</strong>: base-rate variance (same for all models). <strong>Sharpness</strong>: spread of forecast probabilities.
+  </p>
+
+  <div id="metrics-table-container"></div>
+</div>
diff --git a/src/www.forecastbench.org/entrypoint.sh.template b/src/www.forecastbench.org/entrypoint.sh.template
index 3ed112ae..51bc4062 100644
--- a/src/www.forecastbench.org/entrypoint.sh.template
+++ b/src/www.forecastbench.org/entrypoint.sh.template
@@ -44,6 +44,13 @@ mkdir -p "$ASSETS_MNT/$DATA_DIR"
cp -a "$PUBLIC_RELEASE_MNT/leaderboards/csv/sota_graph_baseline.csv" "$ASSETS_MNT/$DATA_DIR/"
cp -a "$PUBLIC_RELEASE_MNT/leaderboards/csv/sota_graph_tournament.csv" "$ASSETS_MNT/$DATA_DIR/"
+# calibration data
+for f in calibration_metrics_baseline.csv calibration_metrics_tournament.csv \
+ calibration_curves_baseline.json calibration_curves_tournament.json; do
+ [ -f "$PUBLIC_RELEASE_MNT/leaderboards/csv/$f" ] && \
+ cp -a "$PUBLIC_RELEASE_MNT/leaderboards/csv/$f" "$ASSETS_MNT/$DATA_DIR/"
+done
+
# archived forecast sets
FORECAST_SETS_MNT="$MOUNT_POINT/FORECAST_SETS_BUCKET"
FORECAST_SETS_DIR="assets/data/forecast-sets"
diff --git a/tests/generate_test_calibration_data.py b/tests/generate_test_calibration_data.py
new file mode 100644
index 00000000..3fea432b
--- /dev/null
+++ b/tests/generate_test_calibration_data.py
@@ -0,0 +1,157 @@
+"""Generate test calibration data files for local website verification.
+
+Run: python tests/generate_test_calibration_data.py
+Then: cd src/www.forecastbench.org && bundle exec jekyll serve
+
+This writes to src/www.forecastbench.org/assets/data/ so the calibration page can render.
+The generated data uses synthetic forecasts with known calibration properties.
+"""
+
+import json
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+
+ASSET_DIR = os.path.join(
+ os.path.dirname(__file__),
+ "..",
+ "src",
+ "www.forecastbench.org",
+ "assets",
+ "data",
+)
+
+MODELS = [
+ ("well_calibrated", "Synthetic", "Well-Calibrated Model"),
+ ("overconfident", "Synthetic", "Overconfident Model"),
+ ("underconfident", "Synthetic", "Underconfident Model"),
+ ("sharp", "Synthetic", "Sharp Model"),
+]
+
+N_BINS = 10
+
+
+def generate_forecasts(model_type, rng, n=2000):
+ """Generate synthetic forecasts with known calibration properties."""
+ base = rng.uniform(0.05, 0.95, n)
+
+ if model_type == "well_calibrated":
+ forecasts = base
+ elif model_type == "overconfident":
+ # Push toward extremes
+ forecasts = np.where(base > 0.5, base + (1 - base) * 0.4, base * 0.6)
+ elif model_type == "underconfident":
+ # Push toward 0.5
+ forecasts = 0.5 + (base - 0.5) * 0.5
+ elif model_type == "sharp":
+ # Mostly extreme probabilities
+ forecasts = np.where(base > 0.5, 0.85 + rng.uniform(0, 0.14, n), 0.01 + rng.uniform(0, 0.14, n))
+ else:
+ forecasts = base
+
+ forecasts = np.clip(forecasts, 0.01, 0.99)
+ # Outcomes follow true base rate (well-calibrated ground truth)
+ outcomes = (rng.uniform(0, 1, n) < base).astype(float)
+ return forecasts, outcomes
+
+
+def compute_metrics_and_curves(all_data):
+ """Compute calibration metrics and curve data from synthetic forecasts."""
+ bin_edges = np.linspace(0, 1, N_BINS + 1)
+ bin_midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2
+
+ metrics_rows = []
+ curve_rows = []
+
+ overall_base_rate = np.concatenate([d["outcomes"] for d in all_data]).mean()
+ overall_uncertainty = overall_base_rate * (1 - overall_base_rate)
+
+ for d in all_data:
+ forecasts = d["forecasts"]
+ outcomes = d["outcomes"]
+ bins = np.digitize(forecasts, bin_edges, right=True).clip(1, N_BINS)
+ n_total = len(forecasts)
+
+ ece = 0.0
+ reliability = 0.0
+ resolution = 0.0
+
+ for b in range(1, N_BINS + 1):
+ mask = bins == b
+ if not mask.any():
+ continue
+ n_k = mask.sum()
+ weight = n_k / n_total
+ f_mean = forecasts[mask].mean()
+ o_rate = outcomes[mask].mean()
+ ece += weight * abs(f_mean - o_rate)
+ reliability += weight * (f_mean - o_rate) ** 2
+ resolution += weight * (o_rate - overall_base_rate) ** 2
+
+ curve_rows.append({
+ "model_pk": d["model_pk"],
+ "organization": d["organization"],
+ "model": d["model"],
+ "bin_midpoint": round(float(bin_midpoints[b - 1]), 3),
+ "forecast_mean": round(float(f_mean), 4),
+ "resolution_rate": round(float(o_rate), 4),
+ "n_bin": int(n_k),
+ })
+
+ metrics_rows.append({
+ "model_pk": d["model_pk"],
+ "organization": d["organization"],
+ "model_organization": d["organization"],
+ "model": d["model"],
+ "ece": round(ece, 6),
+ "reliability": round(reliability, 6),
+ "resolution": round(resolution, 6),
+ "uncertainty": round(overall_uncertainty, 6),
+ "sharpness": round(float(forecasts.std()), 6),
+ "n_forecasts": n_total,
+ })
+
+ return pd.DataFrame(metrics_rows), curve_rows
+
+
+def main():
+ rng = np.random.default_rng(42)
+ all_data = []
+
+ for model_pk, org, model_name in MODELS:
+ forecasts, outcomes = generate_forecasts(model_pk, rng)
+ all_data.append({
+ "model_pk": model_pk,
+ "organization": org,
+ "model": model_name,
+ "forecasts": forecasts,
+ "outcomes": outcomes,
+ })
+
+ df_metrics, curve_rows = compute_metrics_and_curves(all_data)
+
+ os.makedirs(ASSET_DIR, exist_ok=True)
+
+ for lb_type in ["baseline", "tournament"]:
+ df_metrics.to_csv(
+ os.path.join(ASSET_DIR, f"calibration_metrics_{lb_type}.csv"),
+ index=False,
+ )
+ with open(os.path.join(ASSET_DIR, f"calibration_curves_{lb_type}.json"), "w") as f:
+ json.dump(curve_rows, f, indent=2)
+
+ print(f"Wrote calibration data to {os.path.abspath(ASSET_DIR)}")
+ print(f" Metrics: {len(df_metrics)} models")
+ print(f" Curves: {len(curve_rows)} bins")
+ print()
+ print("Verify decomposition identity (reliability - resolution + uncertainty ≈ mean Brier):")
+ for d, row in zip(all_data, df_metrics.itertuples()):
+ brier = ((d["forecasts"] - d["outcomes"]) ** 2).mean()
+ decomp = row.reliability - row.resolution + row.uncertainty
+ print(f" {row.model:25s} decomp={decomp:.4f} brier={brier:.4f} diff={abs(decomp-brier):.6f}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/test_calibration.py b/tests/test_calibration.py
new file mode 100644
index 00000000..d31f69db
--- /dev/null
+++ b/tests/test_calibration.py
@@ -0,0 +1,221 @@
+"""Tests for calibration metrics: ECE, Brier decomposition, sharpness.
+
+These tests duplicate the calibration logic from src/leaderboard/main.py to avoid importing
+the full main module (which pulls in pyfixest, joblib, etc.). The functions under test are
+compute_calibration_metrics() and compute_calibration_curve_data().
+"""
+
+import numpy as np
+import pandas as pd
+
+# ── Inline copies of the functions under test ──────────────────────────────────
+# Keep in sync with src/leaderboard/main.py
+
+
+def compute_calibration_metrics(df, n_bins=10):
+ """Compute calibration metrics per model."""
+ bin_edges = np.linspace(0, 1, n_bins + 1)
+ df = df.copy()
+ df["bin"] = np.digitize(df["forecast"], bin_edges, right=True).clip(1, n_bins)
+
+ overall_base_rate = df["resolved_to"].mean()
+ overall_uncertainty = overall_base_rate * (1 - overall_base_rate)
+
+ rows = []
+ for (model_pk, org, model_org, model), grp in df.groupby(
+ ["model_pk", "organization", "model_organization", "model"]
+ ):
+ n_total = len(grp)
+ ece = 0.0
+ reliability = 0.0
+ resolution = 0.0
+
+ for _, bin_grp in grp.groupby("bin"):
+ n_k = len(bin_grp)
+ weight = n_k / n_total
+ forecast_mean = bin_grp["forecast"].mean()
+ observed_rate = bin_grp["resolved_to"].mean()
+ ece += weight * abs(forecast_mean - observed_rate)
+ reliability += weight * (forecast_mean - observed_rate) ** 2
+ resolution += weight * (observed_rate - overall_base_rate) ** 2
+
+ sharpness = grp["forecast"].std()
+
+ rows.append(
+ {
+ "model_pk": model_pk,
+ "organization": org,
+ "model_organization": model_org,
+ "model": model,
+ "ece": round(ece, 6),
+ "reliability": round(reliability, 6),
+ "resolution": round(resolution, 6),
+ "uncertainty": round(overall_uncertainty, 6),
+ "sharpness": round(sharpness, 6),
+ "n_forecasts": n_total,
+ }
+ )
+
+ return pd.DataFrame(rows)
+
+
+def compute_calibration_curve_data(df, n_bins=10):
+ """Compute per-bin calibration curve data."""
+ bin_edges = np.linspace(0, 1, n_bins + 1)
+ bin_midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2
+ df = df.copy()
+ df["bin"] = np.digitize(df["forecast"], bin_edges, right=True).clip(1, n_bins)
+
+ rows = []
+ for (model_pk, org, model), grp in df.groupby(["model_pk", "organization", "model"]):
+ for bin_idx, bin_grp in grp.groupby("bin"):
+ rows.append(
+ {
+ "model_pk": model_pk,
+ "organization": org,
+ "model": model,
+ "bin_midpoint": round(bin_midpoints[bin_idx - 1], 3),
+ "forecast_mean": round(bin_grp["forecast"].mean(), 4),
+ "resolution_rate": round(bin_grp["resolved_to"].mean(), 4),
+ "n_bin": len(bin_grp),
+ }
+ )
+
+ return pd.DataFrame(rows)
+
+
+# ── Helpers ────────────────────────────────────────────────────────────────────
+
+
+def _make_forecasts(
+ forecasts, outcomes, model_pk="test_model", organization="TestOrg", model="test"
+):
+ """Build a minimal DataFrame matching the expected schema."""
+ return pd.DataFrame(
+ {
+ "forecast": forecasts,
+ "resolved_to": outcomes,
+ "brier_score": [(f - o) ** 2 for f, o in zip(forecasts, outcomes)],
+ "model_pk": model_pk,
+ "organization": organization,
+ "model_organization": organization,
+ "model": model,
+ }
+ )
+
+
+# ── Tests ──────────────────────────────────────────────────────────────────────
+
+
+class TestBrierDecomposition:
+ """Verify reliability - resolution + uncertainty ≈ mean(brier_score)."""
+
+ def test_decomposition_identity_synthetic(self):
+ """With known data, the Brier decomposition identity should hold."""
+ rng = np.random.default_rng(42)
+ n = 2000
+ forecasts = rng.uniform(0, 1, n)
+ outcomes = (rng.uniform(0, 1, n) < forecasts).astype(float)
+
+ df = _make_forecasts(forecasts.tolist(), outcomes.tolist())
+ metrics = compute_calibration_metrics(df, n_bins=10)
+
+ assert len(metrics) == 1
+ row = metrics.iloc[0]
+
+ decomp = row["reliability"] - row["resolution"] + row["uncertainty"]
+ mean_brier = df["brier_score"].mean()
+
+ assert abs(decomp - mean_brier) < 0.01, (
+ f"Brier decomposition failed: {row['reliability']:.4f} - {row['resolution']:.4f} "
+ f"+ {row['uncertainty']:.4f} = {decomp:.4f} vs mean_brier = {mean_brier:.4f}"
+ )
+
+ def test_perfect_calibration_has_zero_ece(self):
+ """A perfectly calibrated forecaster should have ECE ≈ 0."""
+ rng = np.random.default_rng(123)
+ forecasts = []
+ outcomes = []
+ for p in [0.1, 0.3, 0.5, 0.7, 0.9]:
+ n_per_bin = 500
+ forecasts.extend([p] * n_per_bin)
+ outcomes.extend(rng.binomial(1, p, n_per_bin).tolist())
+
+ df = _make_forecasts(forecasts, outcomes)
+ metrics = compute_calibration_metrics(df, n_bins=10)
+
+ row = metrics.iloc[0]
+ assert row["ece"] < 0.03, f"ECE too high for well-calibrated forecaster: {row['ece']:.4f}"
+
+ def test_overconfident_forecaster_has_high_ece(self):
+ """A forecaster who always says 0.95 when base rate is 0.5 should have high ECE."""
+ rng = np.random.default_rng(456)
+ n = 1000
+ forecasts = [0.95] * n
+ outcomes = rng.binomial(1, 0.5, n).astype(float).tolist()
+
+ df = _make_forecasts(forecasts, outcomes)
+ metrics = compute_calibration_metrics(df, n_bins=10)
+
+ row = metrics.iloc[0]
+ assert (
+ row["ece"] > 0.3
+ ), f"ECE should be high for overconfident forecaster: {row['ece']:.4f}"
+
+ def test_multiple_models(self):
+ """Metrics should return one row per model."""
+ rng = np.random.default_rng(789)
+ dfs = []
+ for i in range(3):
+ n = 500
+ forecasts = rng.uniform(0, 1, n)
+ outcomes = (rng.uniform(0, 1, n) < forecasts).astype(float)
+ dfs.append(
+ _make_forecasts(
+ forecasts.tolist(),
+ outcomes.tolist(),
+ model_pk=f"model_{i}",
+ model=f"model_{i}",
+ )
+ )
+
+ df = pd.concat(dfs, ignore_index=True)
+ metrics = compute_calibration_metrics(df, n_bins=10)
+ assert len(metrics) == 3
+
+
+class TestCalibrationCurves:
+ """Test compute_calibration_curve_data."""
+
+ def test_curve_data_shape(self):
+ """Curve data should have at most n_bins rows per model."""
+ rng = np.random.default_rng(101)
+ n = 1000
+ forecasts = rng.uniform(0, 1, n)
+ outcomes = (rng.uniform(0, 1, n) < forecasts).astype(float)
+
+ df = _make_forecasts(forecasts.tolist(), outcomes.tolist())
+ curves = compute_calibration_curve_data(df, n_bins=10)
+
+ assert len(curves) <= 10
+ assert set(curves.columns) == {
+ "model_pk",
+ "organization",
+ "model",
+ "bin_midpoint",
+ "forecast_mean",
+ "resolution_rate",
+ "n_bin",
+ }
+
+ def test_bin_counts_sum_to_total(self):
+ """Sum of n_bin across bins should equal total forecasts."""
+ rng = np.random.default_rng(202)
+ n = 800
+ forecasts = rng.uniform(0, 1, n)
+ outcomes = (rng.uniform(0, 1, n) < forecasts).astype(float)
+
+ df = _make_forecasts(forecasts.tolist(), outcomes.tolist())
+ curves = compute_calibration_curve_data(df, n_bins=10)
+
+ assert curves["n_bin"].sum() == n