ethp2p · cortze · Mar 13, 2026 · Mar 13, 2026 · Mar 16, 2026
diff --git a/notebooks/12-slot-classifier.ipynb b/notebooks/12-slot-classifier.ipynb
@@ -0,0 +1,331 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0",
+   "metadata": {},
+   "source": [
+    "Classification of Ethereum CL slots by tag (blocks, data columns, attestations, aggregations), using the networking events as reference.  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1",
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "from loaders import load_parquet\n",
+    "from IPython.display import display\n",
+    "\n",
+    "import plotly.express as px\n",
+    "import plotly.graph_objects as go\n",
+    "from plotly.subplots import make_subplots\n",
+    "\n",
+    "from queries.slot_tags import TAG_ORDERS, TAG_LABELS, TAG_GROUPS, short_label\n",
+    "\n",
+    "# Global Variables\n",
+    "target_date = None # Use this as a default for the automation and the rendering of the page"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read the parquet file\n",
+    "df = pl.from_pandas(load_parquet(\"slot_tags\", target_date=target_date))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4",
+   "metadata": {},
+   "source": [
+    "# Tag Distribution Overview — blocks, attestations, aggregations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5",
+   "metadata": {},
+   "source": [
+    "## Tag Distribution Overview\n",
+    "\n",
+    "Percentage of slots with each tag value, broken down per dimension and grouped by category (blocks, attestations, aggregations)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Number of slots in the dataset; denominator for every percentage below.\n",
+    "total = len(df)\n",
+    "\n",
+    "\n",
+    "def plot_tag_distribution(tag_cols: list[str], title: str) -> None:\n",
+    "    \"\"\"Show one bar chart per tag column with the share of slots per tag value.\n",
+    "\n",
+    "    Args:\n",
+    "        tag_cols: Tag columns of the global `df` to plot, one subplot row each.\n",
+    "        title: Title of the whole figure.\n",
+    "    \"\"\"\n",
+    "    # Nothing to draw. Without this guard an empty `tag_cols` divides by zero in\n",
+    "    # the spacing computation, and an empty `df` leaves no bars, so max() would\n",
+    "    # fail on an empty list.\n",
+    "    if not tag_cols or total == 0:\n",
+    "        print(f\"{title}: no data to plot\")\n",
+    "        return\n",
+    "\n",
+    "    n = len(tag_cols)\n",
+    "    row_height = 300\n",
+    "    fig = make_subplots(\n",
+    "        rows=n,\n",
+    "        cols=1,\n",
+    "        subplot_titles=[TAG_LABELS[c] for c in tag_cols],\n",
+    "        vertical_spacing=80 / (row_height * n),  # fixed 80px gap between subplots\n",
+    "    )\n",
+    "    for i, col in enumerate(tag_cols, 1):\n",
+    "        # Keep only the tag values that occur in the data, in TAG_ORDERS order,\n",
+    "        # so the Enum cast below makes .sort() follow that order.\n",
+    "        present = set(df[col].unique().to_list())\n",
+    "        order = [v for v in TAG_ORDERS[col] if v in present]\n",
+    "        counts = (\n",
+    "            df\n",
+    "            .group_by(col)\n",
+    "            .agg(count=pl.len())\n",
+    "            .with_columns(pct=(pl.col(\"count\") * 100 / total))\n",
+    "            .with_columns(pl.col(col).cast(pl.Enum(order)))\n",
+    "            .sort(col)\n",
+    "        )\n",
+    "        pct_values = counts[\"pct\"].to_list()\n",
+    "        fig.add_trace(\n",
+    "            go.Bar(\n",
+    "                x=[short_label(v) for v in counts[col].to_list()],\n",
+    "                y=pct_values,\n",
+    "                text=[f\"{v:.1f}%\" for v in pct_values],\n",
+    "                textposition=\"outside\",\n",
+    "                showlegend=False,\n",
+    "                marker_color=\"#6366f1\",\n",
+    "            ),\n",
+    "            row=i,\n",
+    "            col=1,\n",
+    "        )\n",
+    "        # Extend y-axis range so outside labels stay inside the plot area,\n",
+    "        # preventing the hover tooltip from disappearing when the cursor is over them.\n",
+    "        fig.update_yaxes(title_text=\"% slots\", range=[0, max(pct_values) * 1.25], row=i, col=1)\n",
+    "    fig.update_layout(\n",
+    "        title=title,\n",
+    "        height=row_height * n,\n",
+    "        width=1000,\n",
+    "        margin=dict(t=80, b=40),\n",
+    "        hovermode=\"x\",\n",
+    "    )\n",
+    "    fig.show()\n",
+    "\n",
+    "\n",
+    "# One figure per tag group defined in TAG_GROUPS.\n",
+    "for group_name, cols in TAG_GROUPS.items():\n",
+    "    plot_tag_distribution(cols, f\"Slot Tag Distributions — {group_name}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7",
+   "metadata": {},
+   "source": [
+    "## Cross-Dimension Co-occurrence Heatmaps\n",
+    "\n",
+    "For each pair of dimensions, cells show the percentage of slots in row-category X that also have column-category Y. Rows sum to 100%."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cross_tab_heatmap(df: pl.DataFrame, col_x: str, col_y: str) -> go.Figure:\n",
+    "    \"\"\"Build a row-normalized cross-tab heatmap of two tag columns.\n",
+    "\n",
+    "    For each value of col_x (heatmap rows), cells give the percentage of its\n",
+    "    slots that fall into each value of col_y (heatmap columns). Only values\n",
+    "    present in df are shown, in TAG_ORDERS order; combinations that never\n",
+    "    occur are shown as 0.0%.\n",
+    "\n",
+    "    Args:\n",
+    "        df: Slot-tag frame containing both columns.\n",
+    "        col_x: Tag column used for the heatmap rows.\n",
+    "        col_y: Tag column used for the heatmap columns.\n",
+    "\n",
+    "    Returns:\n",
+    "        The heatmap figure; the caller is responsible for showing it.\n",
+    "    \"\"\"\n",
+    "    present_x = set(df[col_x].unique().to_list())\n",
+    "    present_y = set(df[col_y].unique().to_list())\n",
+    "    order_x = [v for v in TAG_ORDERS[col_x] if v in present_x]\n",
+    "    order_y = [v for v in TAG_ORDERS[col_y] if v in present_y]\n",
+    "\n",
+    "    ct = df.group_by([col_x, col_y]).agg(count=pl.len())\n",
+    "    totals = ct.group_by(col_x).agg(total=pl.col(\"count\").sum())\n",
+    "    ct = ct.join(totals, on=col_x).with_columns(pct=(pl.col(\"count\") * 100 / pl.col(\"total\")))\n",
+    "\n",
+    "    # Index the percentages once by (x, y) so filling the matrix is a dict\n",
+    "    # lookup per cell instead of a full DataFrame filter per cell.\n",
+    "    pct_by_pair = {\n",
+    "        (rec[col_x], rec[col_y]): rec[\"pct\"] for rec in ct.iter_rows(named=True)\n",
+    "    }\n",
+    "    matrix = [\n",
+    "        [round(pct_by_pair.get((x, y), 0.0), 1) for y in order_y]\n",
+    "        for x in order_x\n",
+    "    ]\n",
+    "\n",
+    "    fig = go.Figure(go.Heatmap(\n",
+    "        z=matrix,\n",
+    "        x=[short_label(v) for v in order_y],\n",
+    "        y=[short_label(v) for v in order_x],\n",
+    "        colorscale=\"Blues\",\n",
+    "        zmin=0,\n",
+    "        zmax=100,\n",
+    "        text=[[f\"{v:.1f}%\" for v in row] for row in matrix],\n",
+    "        texttemplate=\"%{text}\",\n",
+    "        hoverongaps=False,\n",
+    "        colorbar=dict(title=\"% of row\"),\n",
+    "    ))\n",
+    "    fig.update_layout(\n",
+    "        title=f\"{TAG_LABELS[col_x]} vs {TAG_LABELS[col_y]}\",\n",
+    "        xaxis_title=TAG_LABELS[col_y],\n",
+    "        yaxis_title=TAG_LABELS[col_x],\n",
+    "        width=900,\n",
+    "        height=350,\n",
+    "        margin=dict(l=200, b=140),\n",
+    "    )\n",
+    "    return fig\n",
+    "\n",
+    "\n",
+    "# (row dimension, column dimension) pairs to cross-tabulate, grouped by category.\n",
+    "# Figures are rendered in the order listed here.\n",
+    "heatmap_pairs = {\n",
+    "    \"blocks\": [\n",
+    "        (\"block_proposal_tag\", \"block_p50_spread_tag\"),\n",
+    "        (\"block_size_tag\", \"block_p50_spread_tag\"),\n",
+    "        (\"blob_count_tag\", \"block_p50_spread_tag\"),\n",
+    "        (\"block_proposal_tag\", \"block_p50_arrival_tag\"),\n",
+    "    ],\n",
+    "    \"data columns\": [\n",
+    "        (\"block_proposal_tag\", \"col_first_seen_p50_tag\"),\n",
+    "        (\"blob_count_tag\", \"col_first_seen_p50_tag\"),\n",
+    "        (\"col_first_seen_p50_tag\", \"col_spread_p50_tag\"),\n",
+    "    ],\n",
+    "    \"attestations\": [\n",
+    "        (\"block_proposal_tag\", \"att_first_seen_p50_tag\"),\n",
+    "        (\"att_first_seen_p50_tag\", \"att_spread_p50_tag\"),\n",
+    "        (\"att_first_seen_p50_tag\", \"att_inclusion_p50_tag\"),\n",
+    "        (\"att_spread_p50_tag\", \"att_inclusion_p50_tag\"),\n",
+    "    ],\n",
+    "    \"aggregations\": [\n",
+    "        (\"block_proposal_tag\", \"agg_first_seen_p50_tag\"),\n",
+    "        (\"agg_first_seen_p50_tag\", \"agg_spread_p50_tag\"),\n",
+    "        (\"att_first_seen_p50_tag\", \"agg_first_seen_p50_tag\"),\n",
+    "    ],\n",
+    "}\n",
+    "\n",
+    "for pairs in heatmap_pairs.values():\n",
+    "    for col_x, col_y in pairs:\n",
+    "        cross_tab_heatmap(df, col_x, col_y).show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9",
+   "metadata": {},
+   "source": [
+    "## Slot Tag Flow (Sankey)\n",
+    "\n",
+    "Shows how slots flow across dimensions, e.g. for blocks: **Proposal Timing → Block Size → Blob Count → Arrival → Broadcast Speed**. The width of each band is proportional to the number of slots."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_sankey(df: pl.DataFrame, cols: list[str], title: str = \"\") -> go.Figure:\n",
+    "    \"\"\"Build a Sankey diagram of how slots flow through consecutive tag columns.\n",
+    "\n",
+    "    Every tag value present in df becomes a node (ordered per TAG_ORDERS), and\n",
+    "    each link between two adjacent columns of cols carries the number of slots\n",
+    "    that share that pair of tag values.\n",
+    "\n",
+    "    Args:\n",
+    "        df: Slot-tag frame containing every column in cols.\n",
+    "        cols: Tag columns, left to right in the diagram.\n",
+    "        title: Figure title.\n",
+    "\n",
+    "    Returns:\n",
+    "        The Sankey figure; the caller is responsible for showing it.\n",
+    "    \"\"\"\n",
+    "    labels: list[str] = []\n",
+    "    index_of: dict[tuple[str, str], int] = {}\n",
+    "\n",
+    "    # One node per (column, value) that actually occurs in the data.\n",
+    "    for col in cols:\n",
+    "        seen = set(df[col].unique().to_list())\n",
+    "        for val in TAG_ORDERS[col]:\n",
+    "            if val not in seen:\n",
+    "                continue\n",
+    "            index_of[(col, val)] = len(labels)\n",
+    "            labels.append(f\"{TAG_LABELS[col]}: {short_label(val)}\")\n",
+    "\n",
+    "    link_sources, link_targets, link_values = [], [], []\n",
+    "\n",
+    "    # Walk adjacent column pairs: (cols[0], cols[1]), (cols[1], cols[2]), ...\n",
+    "    for col_a, col_b in zip(cols, cols[1:]):\n",
+    "        pair_counts = df.group_by([col_a, col_b]).agg(count=pl.len())\n",
+    "        for rec in pair_counts.iter_rows(named=True):\n",
+    "            src = index_of.get((col_a, rec[col_a]))\n",
+    "            dst = index_of.get((col_b, rec[col_b]))\n",
+    "            # Skip pairs where a value has no node (e.g. not listed in TAG_ORDERS).\n",
+    "            if src is None or dst is None:\n",
+    "                continue\n",
+    "            link_sources.append(src)\n",
+    "            link_targets.append(dst)\n",
+    "            link_values.append(rec[\"count\"])\n",
+    "\n",
+    "    node_style = dict(label=labels, pad=20, thickness=18, color=\"#6366f1\")\n",
+    "    link_style = dict(\n",
+    "        source=link_sources,\n",
+    "        target=link_targets,\n",
+    "        value=link_values,\n",
+    "        color=\"rgba(99,102,241,0.25)\",\n",
+    "    )\n",
+    "    fig = go.Figure(go.Sankey(arrangement=\"snap\", node=node_style, link=link_style))\n",
+    "    fig.update_layout(title=title, width=1200, height=700)\n",
+    "    return fig\n",
+    "\n",
+    "\n",
+    "# Each title lists the dimensions in the same left-to-right order as the columns passed in.\n",
+    "build_sankey(\n",
+    "    df,\n",
+    "    [\"block_proposal_tag\", \"block_size_tag\", \"blob_count_tag\", \"block_p50_arrival_tag\", \"block_p50_spread_tag\"],\n",
+    "    title=\"Slot Tag Flow: Proposal Timing → Block Size → Blob Count → Arrival → Broadcast Speed\",\n",
+    ").show()\n",
+    "\n",
+    "build_sankey(\n",
+    "    df,\n",
+    "    [\"block_proposal_tag\", \"blob_count_tag\", \"col_first_seen_p50_tag\", \"col_spread_p50_tag\"],\n",
+    "    title=\"Data Column Tag Flow (P50): Block Proposal → Blob Count → First Seen → Spread\",\n",
+    ").show()\n",
+    "\n",
+    "build_sankey(\n",
+    "    df,\n",
+    "    [\"block_proposal_tag\", \"att_first_seen_p50_tag\", \"att_spread_p50_tag\", \"att_inclusion_p50_tag\"],\n",
+    "    title=\"Attestation Tag Flow (P50): Block Proposal → First Seen → Spread → Inclusion Delay\",\n",
+    ").show()\n",
+    "\n",
+    "build_sankey(\n",
+    "    df,\n",
+    "    [\"block_proposal_tag\", \"agg_first_seen_p50_tag\", \"agg_spread_p50_tag\"],\n",
+    "    title=\"Aggregation Tag Flow (P50): Block Proposal → First Seen → Spread\",\n",
+    ").show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pipeline.yaml b/pipeline.yaml
@@ -112,12 +112,11 @@ queries:
     description: Block propagation by geographic region from Sentries
     output_file: block_propagation_by_region.parquet
 
-  block_propagation_by_region_contributoor:
-    module: queries.block_propagation_contributoor
-    function: fetch_block_propagation_by_region_contributoor
-    database: contributoor
-    description: Block propagation by geographic region from Contributoor nodes
-    output_file: block_propagation_by_region_contributoor.parquet
+  slot_tags:
+    module: queries.slot_tagger
+    function: fetch_slot_tags
+    description: Classify each slot according to what happened in it, based on the observed events
+    output_file: slot_tags.parquet
 
 # ============================================
 # Notebook Registry
@@ -241,22 +240,19 @@ notebooks:
         required: true
     order: 8
 
-  - id: block-propagation-size
-    title: Block propagation
-    description: Block propagation timing by size with corrected MEV timing that isolates network latency from block building
+  - id: slot-tags
+    title: Slot types
+    description: Visualize and aggregate metrics based on what happened in each slot
     icon: Gauge
-    source: notebooks/09-block-propagation-size.ipynb
+    source: notebooks/12-slot-classifier.ipynb
     schedule: daily
     queries:
-      - block_propagation_by_size
-      - block_production_timeline
-      - block_propagation_by_region
-      - block_propagation_by_region_contributoor
+      - slot_tags
     parameters:
       - name: target_date
         type: date
         required: true
-    order: 9
+    order: 12
 
 # Schedule options: hourly, daily, weekly, manual
 # - hourly: Runs every hour, accumulating data throughout the day

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,8 +18,15 @@ dependencies = [
     "boto3>=1.35.0",
     "scipy>=1.16.3",
     "statsmodels>=0.14.6",
+    "polars>=1.38.1",
 ]
 
+[tool.uv]
+package = true
+
+[tool.setuptools.packages.find]
+include = ["queries*"]
+
 [dependency-groups]
 dev = [
     "ipykernel>=7.1.0",