From 63e16ea29011a00e959da75baea97550481c8cd0 Mon Sep 17 00:00:00 2001 From: Orpheus Lummis Date: Mon, 2 Mar 2026 18:37:32 +0000 Subject: [PATCH] Add calibration analysis: ECE, reliability diagrams, Brier decomposition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ForecastBench reports Brier score, BSS, Peer Score, and oracle equivalence but no calibration metrics. Calibration — does P=0.7 mean 70%? — is the most safety-relevant property of a forecasting system. This adds it as a parallel analysis step computed nightly alongside the existing leaderboard. Pipeline (src/leaderboard/main.py): - compute_calibration_metrics(): per-model ECE, Brier decomposition (reliability, resolution, uncertainty per Murphy 1973), sharpness - compute_calibration_curve_data(): per-(model, bin) data for reliability diagrams - write_calibration_data(): CSV + JSON to public release bucket - Wired into make_leaderboard() after oracle removal Website: - /calibration/ page with D3 reliability diagram and metrics table - Baseline/tournament toggle, model checkboxes, tooltips - entrypoint.sh copies calibration files from bucket to assets/data/ - Nav entry between Explore and Datasets Note: Houtan added Plotly-based calibration plots in March 2024 (7729a9d) but they were server-side only and removed during the codebase restructure. This surfaces calibration as a first-class website feature with proper metrics (ECE, Brier decomposition) and interactive visualization. Zero changes to existing scoring functions, bootstrap, or leaderboard output. 
def compute_calibration_metrics(
    df: pd.DataFrame,
    n_bins: int = 10,
) -> pd.DataFrame:
    """Compute calibration metrics for each model.

    Metrics computed per model:
    - ECE (Expected Calibration Error): weighted mean of |forecast_mean - observed_rate| per bin
    - Brier decomposition (Murphy 1973): reliability, resolution, uncertainty
    - Sharpness: standard deviation of forecasts

    Args:
        df (pd.DataFrame): Resolved forecasts with 'forecast', 'resolved_to',
            'model_pk', 'organization', 'model_organization', 'model' columns.
        n_bins (int): Number of equal-width bins on [0, 1].

    Returns:
        pd.DataFrame: One row per model with columns: model_pk, organization, model_organization,
            model, ece, reliability, resolution, uncertainty, sharpness, n_forecasts.
    """
    bin_edges = np.linspace(0, 1, n_bins + 1)
    df = df.copy()
    # right=True assigns forecast == 0 to index 0; the clip folds it into bin 1
    # so every forecast lands in one of the n_bins bins.
    df["bin"] = np.digitize(df["forecast"], bin_edges, right=True).clip(1, n_bins)

    # Base rate is pooled across all models in df, so 'uncertainty' is identical
    # for every row (it describes the shared question set, not the model).
    overall_base_rate = df["resolved_to"].mean()
    overall_uncertainty = overall_base_rate * (1 - overall_base_rate)

    rows = []
    for (model_pk, org, model_org, model), grp in df.groupby(
        ["model_pk", "organization", "model_organization", "model"]
    ):
        n_total = len(grp)
        ece = 0.0
        reliability = 0.0
        resolution = 0.0

        for _, bin_grp in grp.groupby("bin"):
            n_k = len(bin_grp)
            weight = n_k / n_total
            forecast_mean = bin_grp["forecast"].mean()
            observed_rate = bin_grp["resolved_to"].mean()
            ece += weight * abs(forecast_mean - observed_rate)
            reliability += weight * (forecast_mean - observed_rate) ** 2
            resolution += weight * (observed_rate - overall_base_rate) ** 2

        # pandas .std() uses ddof=1 and is NaN for a single forecast; report 0.0
        # instead so the published CSV never carries NaN into the website's
        # numeric parse (JS coerces columns with unary +).
        sharpness = grp["forecast"].std()
        if pd.isna(sharpness):
            sharpness = 0.0

        rows.append(
            {
                "model_pk": model_pk,
                "organization": org,
                "model_organization": model_org,
                "model": model,
                "ece": round(ece, 6),
                "reliability": round(reliability, 6),
                "resolution": round(resolution, 6),
                "uncertainty": round(overall_uncertainty, 6),
                "sharpness": round(sharpness, 6),
                "n_forecasts": n_total,
            }
        )

    return pd.DataFrame(rows)
def compute_calibration_curve_data(
    df: pd.DataFrame,
    n_bins: int = 10,
) -> pd.DataFrame:
    """Build per-(model, bin) points for reliability diagrams.

    Forecasts are bucketed into equal-width probability bins; each bucket
    contributes one point comparing the mean forecast to the observed
    resolution rate.

    Args:
        df (pd.DataFrame): Resolved forecasts with 'forecast', 'resolved_to',
            'model_pk', 'organization', 'model' columns.
        n_bins (int): Number of equal-width bins on [0, 1].

    Returns:
        pd.DataFrame: Per-(model, bin) rows with columns: model_pk, organization, model,
            bin_midpoint, forecast_mean, resolution_rate, n_bin.
    """
    edges = np.linspace(0, 1, n_bins + 1)
    midpoints = (edges[1:] + edges[:-1]) / 2
    binned = df.copy()
    # right=True puts forecast == 0 at index 0; clip folds it into bin 1.
    binned["bin"] = np.digitize(binned["forecast"], edges, right=True).clip(1, n_bins)

    records = []
    for (pk, organization, model_name), model_grp in binned.groupby(
        ["model_pk", "organization", "model"]
    ):
        for b, bucket in model_grp.groupby("bin"):
            records.append(
                {
                    "model_pk": pk,
                    "organization": organization,
                    "model": model_name,
                    "bin_midpoint": round(midpoints[b - 1], 3),
                    "forecast_mean": round(bucket["forecast"].mean(), 4),
                    "resolution_rate": round(bucket["resolved_to"].mean(), 4),
                    "n_bin": len(bucket),
                }
            )

    return pd.DataFrame(records)
def write_calibration_data(
    df_calibration_metrics: pd.DataFrame,
    df_calibration_curves: pd.DataFrame,
    leaderboard_type: "LeaderboardType",
) -> None:
    """Write calibration metrics CSV and curve data JSON to the public bucket.

    Args:
        df_calibration_metrics (pd.DataFrame): One row per model with calibration metrics.
        df_calibration_curves (pd.DataFrame): Per-(model, bin) curve data for reliability diagrams.
        leaderboard_type (LeaderboardType): baseline or tournament.

    Returns:
        None.
    """
    # Files land next to the leaderboard CSVs so the website entrypoint can
    # copy them from the same bucket folder.
    bucket_dir = data_utils.get_mounted_bucket(bucket=env.PUBLIC_RELEASE_BUCKET)
    subdir = "leaderboards/csv"
    os.makedirs(f"{bucket_dir}/{subdir}", exist_ok=True)

    # Metrics table as CSV.
    metrics_path = f"{bucket_dir}/{subdir}/calibration_metrics_{leaderboard_type.value}.csv"
    df_calibration_metrics.to_csv(metrics_path, index=False)

    # Reliability-diagram points as JSON records for the D3 chart.
    curves_path = f"{bucket_dir}/{subdir}/calibration_curves_{leaderboard_type.value}.json"
    df_calibration_curves.to_json(curves_path, orient="records", indent=2)

    logger.info(f"Wrote calibration data for {leaderboard_type.value} leaderboard.")
files into entries list. diff --git a/src/www.forecastbench.org/_data/navigation.yml b/src/www.forecastbench.org/_data/navigation.yml index 0f80e684..a50da864 100644 --- a/src/www.forecastbench.org/_data/navigation.yml +++ b/src/www.forecastbench.org/_data/navigation.yml @@ -11,6 +11,9 @@ main: - title: "Explore" url: "/explore/" icon: "fa-solid fa-lightbulb" + - title: "Calibration" + url: "/calibration/" + icon: "fa-solid fa-bullseye" - title: "Datasets" url: "/datasets/" icon: "fa-solid fa-layer-group" diff --git a/src/www.forecastbench.org/assets/css/custom.scss b/src/www.forecastbench.org/assets/css/custom.scss index 31765f16..45952b1e 100644 --- a/src/www.forecastbench.org/assets/css/custom.scss +++ b/src/www.forecastbench.org/assets/css/custom.scss @@ -1560,3 +1560,45 @@ input[type="checkbox"]:checked + .toggle-slider:before { .fb-footer__divider { opacity: 0.6; } + +/* ── Calibration page ── */ + +#reliability-diagram { + width: 100%; + max-width: 580px; + margin: 1.5rem auto; +} + +#reliability-diagram svg { + display: block; +} + +.calibration-table { + width: 100%; + border-collapse: collapse; + font-size: 0.85rem; + margin: 1rem 0; +} + +.calibration-table th, +.calibration-table td { + padding: 0.45rem 0.75rem; + text-align: right; + border-bottom: 1px solid #e5e7eb; +} + +.calibration-table th { + font-weight: 600; + background: #f9fafb; + position: sticky; + top: 0; +} + +.calibration-table td:first-child, +.calibration-table th:first-child { + text-align: left; +} + +.calibration-table tbody tr:hover { + background: #f1f5f9; +} diff --git a/src/www.forecastbench.org/assets/js/calibration_chart.js b/src/www.forecastbench.org/assets/js/calibration_chart.js new file mode 100644 index 00000000..e9bd9064 --- /dev/null +++ b/src/www.forecastbench.org/assets/js/calibration_chart.js @@ -0,0 +1,282 @@ +(function () { + const CURVES_PATH_BASELINE = '/assets/data/calibration_curves_baseline.json'; + const CURVES_PATH_TOURNAMENT = 
'/assets/data/calibration_curves_tournament.json'; + const METRICS_PATH_BASELINE = '/assets/data/calibration_metrics_baseline.csv'; + const METRICS_PATH_TOURNAMENT = '/assets/data/calibration_metrics_tournament.csv'; + + const MAX_DEFAULT_MODELS = 5; + const MARGIN = { top: 20, right: 30, bottom: 50, left: 55 }; + const SIZE = 500; + + const colorScale = d3.scaleOrdinal(d3.schemeTableau10); + const tip = d3.select('#tooltip'); + + let curveData = null; + let metricsData = null; + let selectedModels = new Set(); + + function getLeaderboardType() { + return document.querySelector('input[name="lbSelect"]:checked').value; + } + + function getCurvesPath() { + return getLeaderboardType() === 'tournament' ? CURVES_PATH_TOURNAMENT : CURVES_PATH_BASELINE; + } + + function getMetricsPath() { + return getLeaderboardType() === 'tournament' ? METRICS_PATH_TOURNAMENT : METRICS_PATH_BASELINE; + } + + function modelLabel(d) { + const org = d.organization || ''; + const model = d.model || d.model_pk || ''; + if (org && org !== model) return org + ' / ' + model; + return model; + } + + function loadData() { + Promise.all([ + fetch(getCurvesPath()).then(r => r.json()), + d3.csv(getMetricsPath()), + ]).then(([curves, metrics]) => { + curveData = curves; + metricsData = metrics.map(d => ({ + ...d, + ece: +d.ece, + reliability: +d.reliability, + resolution: +d.resolution, + uncertainty: +d.uncertainty, + sharpness: +d.sharpness, + n_forecasts: +d.n_forecasts, + })); + + // Sort by ECE ascending, pick top N as default + metricsData.sort((a, b) => a.ece - b.ece); + const defaultModels = metricsData.slice(0, MAX_DEFAULT_MODELS).map(d => d.model_pk); + selectedModels = new Set(defaultModels); + + buildModelCheckboxes(); + renderChart(); + renderTable(); + }).catch(err => { + console.error('Failed to load calibration data:', err); + d3.select('#reliability-diagram').html( + '

Calibration data not yet available. ' + + 'Run the leaderboard pipeline to generate calibration artifacts.

' + ); + }); + } + + function buildModelCheckboxes() { + const container = d3.select('#model-checkboxes'); + container.html(''); + metricsData.forEach(d => { + const id = 'model_' + d.model_pk.replace(/[^a-zA-Z0-9]/g, '_'); + const div = container.append('div').attr('class', 'tag-option'); + div.append('input') + .attr('type', 'checkbox') + .attr('id', id) + .attr('value', d.model_pk) + .property('checked', selectedModels.has(d.model_pk)) + .on('change', function () { + if (this.checked) { + selectedModels.add(d.model_pk); + } else { + selectedModels.delete(d.model_pk); + } + renderChart(); + renderTable(); + }); + div.append('label') + .attr('for', id) + .text(modelLabel(d)); + }); + } + + function renderChart() { + const container = d3.select('#reliability-diagram'); + container.html(''); + + const width = SIZE; + const height = SIZE; + + const svg = container.append('svg') + .attr('viewBox', `0 0 ${width + MARGIN.left + MARGIN.right} ${height + MARGIN.top + MARGIN.bottom}`) + .attr('preserveAspectRatio', 'xMidYMid meet') + .style('max-width', (width + MARGIN.left + MARGIN.right) + 'px') + .style('width', '100%'); + + const g = svg.append('g') + .attr('transform', `translate(${MARGIN.left},${MARGIN.top})`); + + const x = d3.scaleLinear().domain([0, 1]).range([0, width]); + const y = d3.scaleLinear().domain([0, 1]).range([height, 0]); + + // Axes + g.append('g') + .attr('transform', `translate(0,${height})`) + .call(d3.axisBottom(x).ticks(10)) + .append('text') + .attr('x', width / 2) + .attr('y', 40) + .attr('fill', 'currentColor') + .attr('text-anchor', 'middle') + .style('font-size', '13px') + .text('Forecast Probability'); + + g.append('g') + .call(d3.axisLeft(y).ticks(10)) + .append('text') + .attr('transform', 'rotate(-90)') + .attr('x', -height / 2) + .attr('y', -42) + .attr('fill', 'currentColor') + .attr('text-anchor', 'middle') + .style('font-size', '13px') + .text('Observed Frequency'); + + // Perfect calibration diagonal + g.append('line') + 
.attr('x1', x(0)).attr('y1', y(0)) + .attr('x2', x(1)).attr('y2', y(1)) + .attr('stroke', '#888') + .attr('stroke-dasharray', '6,4') + .attr('stroke-width', 1.5) + .attr('opacity', 0.7); + + // Filter curves for selected models + const filteredCurves = curveData.filter(d => selectedModels.has(d.model_pk)); + + // Group by model + const byModel = d3.group(filteredCurves, d => d.model_pk); + + // Size scale for circles + const allN = filteredCurves.map(d => d.n_bin); + const maxN = d3.max(allN) || 1; + const rScale = d3.scaleSqrt().domain([0, maxN]).range([3, 14]); + + let colorIdx = 0; + const modelColors = new Map(); + for (const mpk of selectedModels) { + modelColors.set(mpk, colorScale(colorIdx++)); + } + + // Draw lines and circles for each model + for (const [modelPk, points] of byModel) { + const color = modelColors.get(modelPk) || '#999'; + const sorted = [...points].sort((a, b) => a.bin_midpoint - b.bin_midpoint); + + // Line + const line = d3.line() + .x(d => x(d.forecast_mean)) + .y(d => y(d.resolution_rate)); + + g.append('path') + .datum(sorted) + .attr('d', line) + .attr('fill', 'none') + .attr('stroke', color) + .attr('stroke-width', 2) + .attr('opacity', 0.8); + + // Circles + g.selectAll(null) + .data(sorted) + .enter() + .append('circle') + .attr('cx', d => x(d.forecast_mean)) + .attr('cy', d => y(d.resolution_rate)) + .attr('r', d => rScale(d.n_bin)) + .attr('fill', color) + .attr('fill-opacity', 0.7) + .attr('stroke', color) + .attr('stroke-width', 1) + .on('mouseover', function (event, d) { + d3.select(this).attr('fill-opacity', 1).attr('stroke-width', 2); + tip.style('opacity', 1) + .html( + `${modelLabel(d)}
` + + `Bin midpoint: ${d.bin_midpoint}
` + + `Forecast mean: ${d3.format('.3f')(d.forecast_mean)}
` + + `Observed freq: ${d3.format('.3f')(d.resolution_rate)}
` + + `N: ${d.n_bin}` + ) + .style('left', (event.pageX + 12) + 'px') + .style('top', (event.pageY - 20) + 'px'); + }) + .on('mouseout', function () { + d3.select(this).attr('fill-opacity', 0.7).attr('stroke-width', 1); + tip.style('opacity', 0); + }); + } + + // Legend + const legend = g.append('g') + .attr('transform', `translate(${width - 180}, 10)`); + + let ly = 0; + for (const [modelPk, color] of modelColors) { + const meta = metricsData.find(d => d.model_pk === modelPk); + const label = meta ? modelLabel(meta) : modelPk; + const row = legend.append('g').attr('transform', `translate(0,${ly})`); + row.append('rect') + .attr('width', 12).attr('height', 12) + .attr('fill', color).attr('rx', 2); + row.append('text') + .attr('x', 16).attr('y', 10) + .style('font-size', '11px') + .attr('fill', 'currentColor') + .text(label.length > 22 ? label.slice(0, 20) + '...' : label); + ly += 18; + } + } + + function renderTable() { + const container = d3.select('#metrics-table-container'); + container.html(''); + + const filtered = metricsData.filter(d => selectedModels.has(d.model_pk)); + if (filtered.length === 0) { + container.append('p').style('color', '#888').text('Select models above to see metrics.'); + return; + } + + const table = container.append('table').attr('class', 'calibration-table'); + const thead = table.append('thead'); + const tbody = table.append('tbody'); + + const columns = [ + { key: 'label', label: 'Model' }, + { key: 'ece', label: 'ECE' }, + { key: 'reliability', label: 'Reliability' }, + { key: 'resolution', label: 'Resolution' }, + { key: 'uncertainty', label: 'Uncertainty' }, + { key: 'sharpness', label: 'Sharpness' }, + { key: 'n_forecasts', label: 'N' }, + ]; + + thead.append('tr').selectAll('th') + .data(columns) + .enter() + .append('th') + .text(d => d.label); + + filtered.forEach(d => { + const row = tbody.append('tr'); + columns.forEach(col => { + const val = col.key === 'label' ? modelLabel(d) + : col.key === 'n_forecasts' ? 
d3.format(',')(d[col.key]) + : d3.format('.4f')(d[col.key]); + row.append('td').text(val); + }); + }); + } + + // Event listeners + document.querySelectorAll('input[name="lbSelect"]').forEach(el => { + el.addEventListener('change', loadData); + }); + + // Initial load + loadData(); +})(); diff --git a/src/www.forecastbench.org/calibration/index.md b/src/www.forecastbench.org/calibration/index.md new file mode 100644 index 00000000..9f1c2706 --- /dev/null +++ b/src/www.forecastbench.org/calibration/index.md @@ -0,0 +1,55 @@ +--- +layout: splash +title: "Calibration" +permalink: /calibration/ +custom_css: /assets/css/custom.scss +after_footer_scripts: + - https://cdn.jsdelivr.net/npm/d3@7 + - /assets/js/calibration_chart.js +--- + +
+

Model Calibration Analysis

+

A well-calibrated forecaster's predicted probabilities match observed frequencies: when it says 70%, the event should occur ~70% of the time. + This page shows reliability diagrams and calibration metrics for each model on ForecastBench.

+
    +
  • Points on the diagonal indicate perfect calibration.
  • Points above the diagonal indicate underconfidence (events happen more often than predicted).
  • Points below the diagonal indicate overconfidence (events happen less often than predicted).
  • Circle size reflects the number of forecasts in each probability bin.
+ +
+
+
+
Leaderboard
+
+
+ + +
+
+ + +
+
+
+
+
Models
+
+
+
+
+
+ +

Calibration Metrics

+

+ ECE (Expected Calibration Error): weighted mean absolute gap between predicted probability and observed frequency. Lower is better. + Reliability: weighted mean squared gap (Brier decomposition). Lower is better. + Resolution: how much observed frequencies vary across bins. Higher is better. + Uncertainty: base-rate variance (same for all models). Sharpness: spread of forecast probabilities. +

+
+
+ +
diff --git a/src/www.forecastbench.org/entrypoint.sh.template b/src/www.forecastbench.org/entrypoint.sh.template index 3ed112ae..51bc4062 100644 --- a/src/www.forecastbench.org/entrypoint.sh.template +++ b/src/www.forecastbench.org/entrypoint.sh.template @@ -44,6 +44,13 @@ mkdir -p "$ASSETS_MNT/$DATA_DIR" cp -a "$PUBLIC_RELEASE_MNT/leaderboards/csv/sota_graph_baseline.csv" "$ASSETS_MNT/$DATA_DIR/" cp -a "$PUBLIC_RELEASE_MNT/leaderboards/csv/sota_graph_tournament.csv" "$ASSETS_MNT/$DATA_DIR/" +# calibration data +for f in calibration_metrics_baseline.csv calibration_metrics_tournament.csv \ + calibration_curves_baseline.json calibration_curves_tournament.json; do + [ -f "$PUBLIC_RELEASE_MNT/leaderboards/csv/$f" ] && \ + cp -a "$PUBLIC_RELEASE_MNT/leaderboards/csv/$f" "$ASSETS_MNT/$DATA_DIR/" +done + # archived forecast sets FORECAST_SETS_MNT="$MOUNT_POINT/FORECAST_SETS_BUCKET" FORECAST_SETS_DIR="assets/data/forecast-sets" diff --git a/tests/generate_test_calibration_data.py b/tests/generate_test_calibration_data.py new file mode 100644 index 00000000..3fea432b --- /dev/null +++ b/tests/generate_test_calibration_data.py @@ -0,0 +1,157 @@ +"""Generate test calibration data files for local website verification. + +Run: python tests/generate_test_calibration_data.py +Then: cd src/www.forecastbench.org && bundle exec jekyll serve + +This writes to src/www.forecastbench.org/assets/data/ so the calibration page can render. +The generated data uses synthetic forecasts with known calibration properties. 
N_BINS = 10


def generate_forecasts(model_type, rng, n=2000):
    """Generate synthetic forecasts with known calibration properties."""
    # Ground-truth event probabilities; outcomes are drawn from these, so a
    # forecaster that reports them verbatim is perfectly calibrated.
    truth = rng.uniform(0.05, 0.95, n)

    if model_type == "well_calibrated":
        preds = truth
    elif model_type == "overconfident":
        # Exaggerate: push probabilities away from 0.5 toward the extremes.
        preds = np.where(truth > 0.5, truth + (1 - truth) * 0.4, truth * 0.6)
    elif model_type == "underconfident":
        # Hedge: shrink probabilities halfway toward 0.5.
        preds = 0.5 + (truth - 0.5) * 0.5
    elif model_type == "sharp":
        # Mostly near-0 or near-1 probabilities.
        preds = np.where(
            truth > 0.5, 0.85 + rng.uniform(0, 0.14, n), 0.01 + rng.uniform(0, 0.14, n)
        )
    else:
        preds = truth

    preds = np.clip(preds, 0.01, 0.99)
    # Outcomes follow the true base rate (well-calibrated ground truth).
    outcomes = (rng.uniform(0, 1, n) < truth).astype(float)
    return preds, outcomes


def compute_metrics_and_curves(all_data):
    """Compute calibration metrics and curve data from synthetic forecasts."""
    edges = np.linspace(0, 1, N_BINS + 1)
    midpoints = (edges[:-1] + edges[1:]) / 2

    metrics_rows = []
    curve_rows = []

    # Uncertainty term is shared: base rate pooled over all synthetic models.
    base_rate = np.concatenate([entry["outcomes"] for entry in all_data]).mean()
    uncertainty = base_rate * (1 - base_rate)

    for entry in all_data:
        preds = entry["forecasts"]
        outs = entry["outcomes"]
        bin_ids = np.digitize(preds, edges, right=True).clip(1, N_BINS)
        total = len(preds)

        ece = 0.0
        reliability = 0.0
        resolution = 0.0

        for b in range(1, N_BINS + 1):
            in_bin = bin_ids == b
            if not in_bin.any():
                continue
            count = in_bin.sum()
            w = count / total
            f_mean = preds[in_bin].mean()
            o_rate = outs[in_bin].mean()
            ece += w * abs(f_mean - o_rate)
            reliability += w * (f_mean - o_rate) ** 2
            resolution += w * (o_rate - base_rate) ** 2

            curve_rows.append({
                "model_pk": entry["model_pk"],
                "organization": entry["organization"],
                "model": entry["model"],
                "bin_midpoint": round(float(midpoints[b - 1]), 3),
                "forecast_mean": round(float(f_mean), 4),
                "resolution_rate": round(float(o_rate), 4),
                "n_bin": int(count),
            })

        metrics_rows.append({
            "model_pk": entry["model_pk"],
            "organization": entry["organization"],
            "model_organization": entry["organization"],
            "model": entry["model"],
            "ece": round(ece, 6),
            "reliability": round(reliability, 6),
            "resolution": round(resolution, 6),
            "uncertainty": round(uncertainty, 6),
            # numpy .std() (ddof=0), matching the original script.
            "sharpness": round(float(preds.std()), 6),
            "n_forecasts": total,
        })

    return pd.DataFrame(metrics_rows), curve_rows
"""Tests for calibration metrics: ECE, Brier decomposition, sharpness.

These tests duplicate the calibration logic from src/leaderboard/main.py to avoid importing
the full main module (which pulls in pyfixest, joblib, etc.). The functions under test are
compute_calibration_metrics() and compute_calibration_curve_data().
"""

import numpy as np
import pandas as pd

# ── Inline copies of the functions under test ──────────────────────────────────
# Keep in sync with src/leaderboard/main.py


def compute_calibration_metrics(df, n_bins=10):
    """Compute calibration metrics per model."""
    edges = np.linspace(0, 1, n_bins + 1)
    data = df.copy()
    data["bin"] = np.digitize(data["forecast"], edges, right=True).clip(1, n_bins)

    # Base rate pooled across all models; uncertainty is shared by every row.
    base_rate = data["resolved_to"].mean()
    uncertainty = base_rate * (1 - base_rate)

    records = []
    keys = ["model_pk", "organization", "model_organization", "model"]
    for (pk, org, model_org, model_name), model_grp in data.groupby(keys):
        total = len(model_grp)
        ece = 0.0
        reliability = 0.0
        resolution = 0.0

        for _, bucket in model_grp.groupby("bin"):
            w = len(bucket) / total
            f_mean = bucket["forecast"].mean()
            o_rate = bucket["resolved_to"].mean()
            ece += w * abs(f_mean - o_rate)
            reliability += w * (f_mean - o_rate) ** 2
            resolution += w * (o_rate - base_rate) ** 2

        records.append(
            {
                "model_pk": pk,
                "organization": org,
                "model_organization": model_org,
                "model": model_name,
                "ece": round(ece, 6),
                "reliability": round(reliability, 6),
                "resolution": round(resolution, 6),
                "uncertainty": round(uncertainty, 6),
                "sharpness": round(model_grp["forecast"].std(), 6),
                "n_forecasts": total,
            }
        )

    return pd.DataFrame(records)


def compute_calibration_curve_data(df, n_bins=10):
    """Compute per-bin calibration curve data."""
    edges = np.linspace(0, 1, n_bins + 1)
    midpoints = (edges[:-1] + edges[1:]) / 2
    data = df.copy()
    data["bin"] = np.digitize(data["forecast"], edges, right=True).clip(1, n_bins)

    records = []
    for (pk, org, model_name), model_grp in data.groupby(
        ["model_pk", "organization", "model"]
    ):
        for b, bucket in model_grp.groupby("bin"):
            records.append(
                {
                    "model_pk": pk,
                    "organization": org,
                    "model": model_name,
                    "bin_midpoint": round(midpoints[b - 1], 3),
                    "forecast_mean": round(bucket["forecast"].mean(), 4),
                    "resolution_rate": round(bucket["resolved_to"].mean(), 4),
                    "n_bin": len(bucket),
                }
            )

    return pd.DataFrame(records)


# ── Helpers ────────────────────────────────────────────────────────────────────


def _make_forecasts(
    forecasts, outcomes, model_pk="test_model", organization="TestOrg", model="test"
):
    """Build a minimal DataFrame matching the expected schema."""
    briers = [(p - y) ** 2 for p, y in zip(forecasts, outcomes)]
    return pd.DataFrame(
        {
            "forecast": forecasts,
            "resolved_to": outcomes,
            "brier_score": briers,
            "model_pk": model_pk,
            "organization": organization,
            "model_organization": organization,
            "model": model,
        }
    )


# ── Tests ──────────────────────────────────────────────────────────────────────


class TestBrierDecomposition:
    """Verify reliability - resolution + uncertainty ≈ mean(brier_score)."""

    def test_decomposition_identity_synthetic(self):
        """With known data, the Brier decomposition identity should hold."""
        rng = np.random.default_rng(42)
        forecasts = rng.uniform(0, 1, 2000)
        outcomes = (rng.uniform(0, 1, 2000) < forecasts).astype(float)
        df = _make_forecasts(forecasts.tolist(), outcomes.tolist())

        metrics = compute_calibration_metrics(df, n_bins=10)
        assert len(metrics) == 1

        row = metrics.iloc[0]
        mean_brier = df["brier_score"].mean()
        decomp = row["reliability"] - row["resolution"] + row["uncertainty"]
        assert abs(decomp - mean_brier) < 0.01, (
            f"Brier decomposition failed: {row['reliability']:.4f} - {row['resolution']:.4f} "
            f"+ {row['uncertainty']:.4f} = {decomp:.4f} vs mean_brier = {mean_brier:.4f}"
        )

    def test_perfect_calibration_has_zero_ece(self):
        """A perfectly calibrated forecaster should have ECE ≈ 0."""
        rng = np.random.default_rng(123)
        forecasts, outcomes = [], []
        for p in (0.1, 0.3, 0.5, 0.7, 0.9):
            forecasts.extend([p] * 500)
            outcomes.extend(rng.binomial(1, p, 500).tolist())

        metrics = compute_calibration_metrics(_make_forecasts(forecasts, outcomes), n_bins=10)
        row = metrics.iloc[0]
        assert row["ece"] < 0.03, f"ECE too high for well-calibrated forecaster: {row['ece']:.4f}"

    def test_overconfident_forecaster_has_high_ece(self):
        """A forecaster who always says 0.95 when base rate is 0.5 should have high ECE."""
        rng = np.random.default_rng(456)
        outcomes = rng.binomial(1, 0.5, 1000).astype(float).tolist()
        df = _make_forecasts([0.95] * 1000, outcomes)

        row = compute_calibration_metrics(df, n_bins=10).iloc[0]
        assert (
            row["ece"] > 0.3
        ), f"ECE should be high for overconfident forecaster: {row['ece']:.4f}"

    def test_multiple_models(self):
        """Metrics should return one row per model."""
        rng = np.random.default_rng(789)
        frames = []
        for i in range(3):
            preds = rng.uniform(0, 1, 500)
            outs = (rng.uniform(0, 1, 500) < preds).astype(float)
            frames.append(
                _make_forecasts(
                    preds.tolist(),
                    outs.tolist(),
                    model_pk=f"model_{i}",
                    model=f"model_{i}",
                )
            )

        metrics = compute_calibration_metrics(pd.concat(frames, ignore_index=True), n_bins=10)
        assert len(metrics) == 3


class TestCalibrationCurves:
    """Test compute_calibration_curve_data."""

    def test_curve_data_shape(self):
        """Curve data should have at most n_bins rows per model."""
        rng = np.random.default_rng(101)
        preds = rng.uniform(0, 1, 1000)
        outs = (rng.uniform(0, 1, 1000) < preds).astype(float)

        curves = compute_calibration_curve_data(
            _make_forecasts(preds.tolist(), outs.tolist()), n_bins=10
        )
        assert len(curves) <= 10
        assert set(curves.columns) == {
            "model_pk",
            "organization",
            "model",
            "bin_midpoint",
            "forecast_mean",
            "resolution_rate",
            "n_bin",
        }

    def test_bin_counts_sum_to_total(self):
        """Sum of n_bin across bins should equal total forecasts."""
        rng = np.random.default_rng(202)
        preds = rng.uniform(0, 1, 800)
        outs = (rng.uniform(0, 1, 800) < preds).astype(float)

        curves = compute_calibration_curve_data(
            _make_forecasts(preds.tolist(), outs.tolist()), n_bins=10
        )
        assert curves["n_bin"].sum() == 800