From dfd4aa585b629b9bf6b12495c2b693200c040413 Mon Sep 17 00:00:00 2001
From: Caio Pizzol <caiopizzol@icloud.com>
Date: Sun, 8 Mar 2026 21:45:59 -0300
Subject: [PATCH 1/2] feat: add document classification pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a Python ML pipeline (scripts/classification/) that classifies ~800K
.docx documents by document type (10 classes) and topic (9 classes) using
the FineWeb-Edu pattern: LLM labels a sample → train ModernBERT → apply at scale.

Pipeline steps:
- sample.py: stratified sampling across languages and word count
- label.py: async LLM labeling with Claude (resumable)
- train.py: fine-tune two ModernBERT classifiers
- classify.py: batch inference on full corpus
- evaluate.py: quality metrics and distribution analysis

Also adds:
- LLM classification fields and methods to DbClient
- CLAUDE.md / AGENTS.md at root, packages/shared, and scripts/classification
- Updated README with Phase 5 (Classify) and project structure
---
 .gitignore                            |   2 +-
 AGENTS.md                             |   1 +
 CLAUDE.md                             |  64 ++++
 README.md                             |  76 ++++-
 bun.lock                              |  99 +++++-
 packages/shared/AGENTS.md             |   1 +
 packages/shared/CLAUDE.md             |  22 ++
 packages/shared/db.ts                 |  95 +++++-
 packages/shared/index.ts              |   1 +
 scripts/classification/AGENTS.md      |   1 +
 scripts/classification/CLAUDE.md      |  38 +++
 scripts/classification/README.md      |  81 +++++
 scripts/classification/classify.py    | 384 ++++++++++++++++++++++
 scripts/classification/common.py      | 130 ++++++++
 scripts/classification/evaluate.py    | 303 ++++++++++++++++++
 scripts/classification/label.py       | 375 ++++++++++++++++++++++
 scripts/classification/pyproject.toml |  18 ++
 scripts/classification/sample.py      | 250 +++++++++++++++
 scripts/classification/taxonomy.json  | 114 +++++++
 scripts/classification/train.py       | 443 ++++++++++++++++++++++++++
 20 files changed, 2469 insertions(+), 29 deletions(-)
 create mode 120000 AGENTS.md
 create mode 100644 CLAUDE.md
 create mode 120000 packages/shared/AGENTS.md
 create mode 100644 packages/shared/CLAUDE.md
 create mode 120000 scripts/classification/AGENTS.md
 create mode 100644 scripts/classification/CLAUDE.md
 create mode 100644 scripts/classification/README.md
 create mode 100644 scripts/classification/classify.py
 create mode 100644 scripts/classification/common.py
 create mode 100644 scripts/classification/evaluate.py
 create mode 100644 scripts/classification/label.py
 create mode 100644 scripts/classification/pyproject.toml
 create mode 100644 scripts/classification/sample.py
 create mode 100644 scripts/classification/taxonomy.json
 create mode 100644 scripts/classification/train.py
diff --git a/.gitignore b/.gitignore
index 0cff47c..7473c68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Dependencies
 node_modules/
 .venv/
+__pycache__/
 
 # Build output
 dist/
@@ -23,4 +24,3 @@ coverage/
 
 # OS
 .DS_Store
-CLAUDE.md
diff --git a/AGENTS.md b/AGENTS.md
new file mode 120000
index 0000000..681311e
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1 @@
+CLAUDE.md
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..5511efb
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,64 @@
+# docx-corpus
+
+The largest open corpus of .docx files (~800K documents) for document processing research. Built by [SuperDoc](https://superdoc.dev).
+
+## Architecture
+
+This is a **data pipeline monorepo** with two runtimes:
+
+- **TypeScript (Bun)** — infrastructure: scraping, extraction, embedding
+- **Python** — data science: classification, export, publishing
+
+```
+apps/cli/               → corpus <command> (scrape, extract, embed, status)
+apps/cdx-filter/        → AWS Lambda for Common Crawl CDX filtering
+packages/shared/        → DB client (Bun.sql), R2 storage, UI helpers
+packages/scraper/       → Downloads .docx from Common Crawl WARC archives
+packages/extractor/     → Text extraction via Docling
+packages/embedder/      → Embeddings via Google gemini-embedding-001
+scripts/classification/ → ML classification pipeline (Python)
+db/                     → PostgreSQL schema + migrations
+```
+
+## Pipeline
+
+Each stage writes to the same PostgreSQL database (`documents` table):
+
+1. **Scrape** (TS) — Common Crawl → .docx files in R2 (`status = 'uploaded'`)
+2. **Extract** (TS) — Docling → text in R2 (`extracted_at`, `word_count`, `language`)
+3. **Embed** (TS) — Google API → pgvector (`embedding`, `embedded_at`)
+4. **Classify** (Python) — ModernBERT → labels (`document_type`, `document_topic`)
+
+## Database
+
+Single `documents` table in PostgreSQL (NeonDB) with pgvector. All pipeline stages write to this table.
+
+- **Connection**: `DATABASE_URL` env var (Bun.sql for TS, psycopg2 for Python)
+- **Schema**: `db/schema.sql` (canonical), `db/migrations/` (incremental)
+- **Key columns**: `id` (SHA-256 hash), `status`, `extracted_at`, `embedded_at`, `document_type`, `document_topic`
+
+## Storage
+
+Documents and extracted text live in Cloudflare R2:
+- `documents/{hash}.docx` — original files
+- `extracted/{hash}.txt` — extracted text
+
+Text is also available at `https://docxcorp.us/extracted/{id}.txt`, where `{id}` is the document's SHA-256 hash (the same value as `{hash}` above).
+
+## Commands
+
+```bash
+bun install                        # Install TS dependencies
+bun run corpus scrape --crawl 3    # Scrape from Common Crawl
+bun run corpus extract             # Extract text
+bun run corpus embed               # Generate embeddings
+bun run corpus status              # Show pipeline stats
+```
+
+## Key conventions
+
+- Use `bun` for all TS tooling (not node/npm/pnpm)
+- DB client is in `packages/shared/db.ts` — all pipeline stages use `DbClient`
+- Storage abstraction in `packages/shared/storage.ts` — R2 or local
+- Environment: `.env` at project root (gitignored), see `.env.example`
+- Python scripts manage their own deps via `pyproject.toml`
diff --git a/README.md b/README.md
index 532af78..94bd0a2 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,13 @@ Phase 4: Embed (corpus embed)
 │  extracted/    │ ──► │ transformers   │ ──► │   (pgvector)   │
 │  {hash}.txt    │     │   (Python)     │     │  embedding     │
 └────────────────┘     └────────────────┘     └────────────────┘
+
+Phase 5: Classify (Python ML pipeline)
+┌────────────────┐     ┌────────────────┐     ┌────────────────┐
+│  LLM labels    │     │  ModernBERT    │     │   PostgreSQL   │
+│  3,500 sample  │ ──► │  fine-tuning   │ ──► │  document_type │
+│  (Claude)      │     │  (2 models)    │     │  document_topic│
+└────────────────┘     └────────────────┘     └────────────────┘
 ```
 
 ### Why Common Crawl?
@@ -82,27 +89,29 @@ bun install
 ## Project Structure
 
 ```
-packages/
-  shared/         # Shared utilities (DB client, storage, formatting)
-  scraper/        # Core scraper logic (downloads WARC, validates .docx)
-  extractor/      # Text extraction using Docling (Python)
-  embedder/       # Document embeddings
 apps/
-  cli/            # Unified CLI - corpus <command>
-  cdx-filter/     # AWS Lambda - filters CDX indexes for .docx URLs
-  web/            # Landing page - docxcorp.us
+  cli/              # Unified CLI — corpus <command>
+  cdx-filter/       # AWS Lambda — filters CDX indexes for .docx URLs
+  web/              # Landing page — docxcorp.us
+packages/
+  shared/           # DB client, storage, formatting (Bun)
+  scraper/          # Downloads WARC, validates .docx (Bun)
+  extractor/        # Text extraction via Docling (Bun + Python)
+  embedder/         # Document embeddings (Bun)
+scripts/
+  classification/   # ML classification pipeline (Python)
 db/
-  schema.sql      # PostgreSQL schema (with pgvector)
-  migrations/     # Database migrations
+  schema.sql        # PostgreSQL schema (with pgvector)
+  migrations/       # Database migrations
 ```
 
 **Apps** (entry points)
 
-| App            | Purpose                         | Uses                     |
-| -------------- | ------------------------------- | ------------------------ |
-| **cli**        | `corpus` command                | scraper, extractor, embedder |
-| **cdx-filter** | Filter CDX indexes (Lambda)     | -                        |
-| **web**        | Landing page                    | -                        |
+| App            | Purpose                         | Runtime |
+| -------------- | ------------------------------- | ------- |
+| **cli**        | `corpus` command                | Bun     |
+| **cdx-filter** | Filter CDX indexes (Lambda)     | Bun     |
+| **web**        | Landing page                    | -       |
 
 **Packages** (libraries)
 
@@ -113,6 +122,12 @@ db/
 | **extractor**  | Extract text (Docling)            | Bun + Python |
 | **embedder**   | Generate embeddings               | Bun          |
 
+**Scripts** (data science)
+
+| Script                     | Purpose                                   | Runtime |
+| -------------------------- | ----------------------------------------- | ------- |
+| **scripts/classification** | Document type + topic classification (ML) | Python  |
+
 ## Usage
 
 ### 1. Run Lambda to filter CDX indexes
@@ -173,6 +188,34 @@ bun run corpus embed --batch 100 --verbose
 
 Uses Google's `gemini-embedding-001` model (3072 dimensions, ~$0.006/1M tokens). Documents are chunked and embeddings are combined via weighted average.
 
+### 5. Classify documents
+
+Classifies documents by **document type** (10 classes) and **topic** (9 classes) using the [FineWeb-Edu](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1) pattern: LLM labels a sample → train classifier → apply at scale.
+
+```bash
+cd scripts/classification
+
+# Install Python dependencies
+pip install -e .
+
+# Step 1: Sample 3,500 documents (stratified by language, word count, domain)
+python sample.py --total 3500 --output sampled_docs.jsonl
+
+# Step 2: Label with Claude (~$3)
+python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl
+
+# Step 3: Train ModernBERT classifiers (~30 min on a GPU)
+python train.py --input labeled_docs.jsonl --output-dir ./models
+
+# Step 4: Classify full corpus (~800K docs)
+python classify.py --models-dir ./models
+
+# Check results
+python evaluate.py corpus
+```
+
+See [scripts/classification/README.md](scripts/classification/README.md) for full details.
+
 ### Docker
 
 Run the CLI in a container:
@@ -268,6 +311,9 @@ EMBED_INPUT_PREFIX=extracted
 EMBED_BATCH_SIZE=100
 EMBED_CONCURRENCY=20         # Parallel API requests
 GOOGLE_API_KEY=              # Required for embeddings
+
+# Classification (Python scripts only)
+ANTHROPIC_API_KEY=           # Required for LLM labeling step
 ```
 
 ### Rate Limiting
diff --git a/bun.lock b/bun.lock
index a075957..568281e 100644
--- a/bun.lock
+++ b/bun.lock
@@ -11,7 +11,7 @@
     },
     "apps/cdx-filter": {
       "name": "@docx-corpus/cdx-filter",
-      "version": "0.14.3",
+      "version": "0.17.0",
       "dependencies": {
         "@aws-sdk/client-s3": "^3.966.0",
       },
@@ -27,11 +27,12 @@
     },
     "apps/cli": {
       "name": "@docx-corpus/cli",
-      "version": "0.9.3",
+      "version": "0.12.0",
       "bin": {
         "corpus": "./index.ts",
       },
       "dependencies": {
+        "@docx-corpus/classifier": "workspace:*",
         "@docx-corpus/embedder": "workspace:*",
         "@docx-corpus/extractor": "workspace:*",
         "@docx-corpus/scraper": "workspace:*",
@@ -47,9 +48,21 @@
         "typescript": "^5.9.3",
       },
     },
+    "packages/classifier": {
+      "name": "@docx-corpus/classifier",
+      "version": "0.2.0",
+      "dependencies": {
+        "@anthropic-ai/sdk": "^0.39.0",
+        "@docx-corpus/shared": "workspace:*",
+      },
+      "devDependencies": {
+        "@types/bun": "latest",
+        "typescript": "^5.9.3",
+      },
+    },
     "packages/embedder": {
       "name": "@docx-corpus/embedder",
-      "version": "0.2.0",
+      "version": "0.1.0",
       "dependencies": {
         "@docx-corpus/shared": "workspace:*",
         "@google/genai": "^1.38.0",
@@ -104,6 +117,8 @@
 
     "@actions/io": ["@actions/io@2.0.0", "", {}, "sha512-Jv33IN09XLO+0HS79aaODsvIRyduiF7NY/F6LYeK5oeUmrsz7aFdRphQjFoESF4jS7lMauDOttKALcpapVDIAg=="],
 
+    "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.39.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" } }, "sha512-eMyDIPRZbt1CCLErRCi3exlAvNkBtRe+kW5vvJyef93PmNr/clstYgHhtvmkxN82nlKgzyGPCyGxrm0JQ1ZIdg=="],
+
     "@aws-crypto/crc32": ["@aws-crypto/crc32@5.2.0", "", { "dependencies": { "@aws-crypto/util": "^5.2.0", "@aws-sdk/types": "^3.222.0", "tslib": "^2.6.2" } }, "sha512-nLbCWqQNgUiwwtFsen1AdzAtvuLRsQS8rYgMuxCrdKf9kOssamGLuPwyTY9wyYblNr9+1XM8v6zoDTPPSIeANg=="],
 
     "@aws-crypto/crc32c": ["@aws-crypto/crc32c@5.2.0", "", { "dependencies": { "@aws-crypto/util": "^5.2.0", "@aws-sdk/types": "^3.222.0", "tslib": "^2.6.2" } }, "sha512-+iWb8qaHLYKrNvGRbiYRHSdKRWhto5XlZUEBwDjYNf+ly5SVYG6zEoYIdxvf5R3zyeP16w4PLBn3rH1xc74Rag=="],
@@ -212,6 +227,8 @@
 
     "@docx-corpus/cdx-filter": ["@docx-corpus/cdx-filter@workspace:apps/cdx-filter"],
 
+    "@docx-corpus/classifier": ["@docx-corpus/classifier@workspace:packages/classifier"],
+
     "@docx-corpus/cli": ["@docx-corpus/cli@workspace:apps/cli"],
 
     "@docx-corpus/embedder": ["@docx-corpus/embedder@workspace:packages/embedder"],
@@ -400,14 +417,20 @@
 
     "@smithy/uuid": ["@smithy/uuid@1.1.0", "", { "dependencies": { "tslib": "^2.6.2" } }, "sha512-4aUIteuyxtBUhVdiQqcDhKFitwfd9hqoSDYY2KRXiWtgoWJ9Bmise+KfEPDiVHWeJepvF8xJO9/9+WDIciMFFw=="],
 
-    "@types/bun": ["@types/bun@1.3.6", "", { "dependencies": { "bun-types": "1.3.6" } }, "sha512-uWCv6FO/8LcpREhenN1d1b6fcspAB+cefwD7uti8C8VffIv0Um08TKMn98FynpTiU38+y2dUO55T11NgDt8VAA=="],
+    "@types/bun": ["@types/bun@1.3.10", "", { "dependencies": { "bun-types": "1.3.10" } }, "sha512-0+rlrUrOrTSskibryHbvQkDOWRJwJZqZlxrUs1u4oOoTln8+WIXBPmAuCF35SWB2z4Zl3E84Nl/D0P7803nigQ=="],
 
     "@types/node": ["@types/node@25.0.6", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-NNu0sjyNxpoiW3YuVFfNz7mxSQ+S4X2G28uqg2s+CzoqoQjLPsWSbsFFyztIAqt2vb8kfEAsJNepMGPTxFDx3Q=="],
 
+    "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="],
+
     "@types/normalize-package-data": ["@types/normalize-package-data@2.4.4", "", {}, "sha512-37i+OaWTh9qeK4LSHPsyRC7NahnGotNuZvjLSgcPzblpHB3rrCJxAOgI5gCdKm7coonsaX1Of0ILiTcnZjbfxA=="],
 
+    "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="],
+
     "agent-base": ["agent-base@7.1.4", "", {}, "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ=="],
 
+    "agentkeepalive": ["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="],
+
     "aggregate-error": ["aggregate-error@3.1.0", "", { "dependencies": { "clean-stack": "^2.0.0", "indent-string": "^4.0.0" } }, "sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA=="],
 
     "ansi-escapes": ["ansi-escapes@7.2.0", "", { "dependencies": { "environment": "^1.0.0" } }, "sha512-g6LhBsl+GBPRWGWsBtutpzBYuIIdBkLEvad5C/va/74Db018+5TZiyA26cZJAr3Rft5lprVqOIPxf5Vid6tqAw=="],
@@ -424,6 +447,8 @@
 
     "array-ify": ["array-ify@1.0.0", "", {}, "sha512-c5AMf34bKdvPhQ7tBGhqkgKNUzMr4WUs+WDtC2ZUGOUncbxKMTvqxYctiseW3+L4bA8ec+GcZ6/A/FW4m8ukng=="],
 
+    "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="],
+
     "balanced-match": ["balanced-match@1.0.2", "", {}, "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="],
 
     "base64-js": ["base64-js@1.5.1", "", {}, "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA=="],
@@ -442,7 +467,9 @@
 
     "buffer-equal-constant-time": ["buffer-equal-constant-time@1.0.1", "", {}, "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA=="],
 
-    "bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="],
+    "bun-types": ["bun-types@1.3.10", "", { "dependencies": { "@types/node": "*" } }, "sha512-tcpfCCl6XWo6nCVnpcVrxQ+9AYN1iqMIzgrSKYMB/fjLtV2eyAVEg7AxQJuCq/26R6HpKWykQXuSOq/21RYcbg=="],
+
+    "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="],
 
     "callsites": ["callsites@3.1.0", "", {}, "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ=="],
 
@@ -462,6 +489,8 @@
 
     "color-name": ["color-name@1.1.3", "", {}, "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw=="],
 
+    "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="],
+
     "compare-func": ["compare-func@2.0.0", "", { "dependencies": { "array-ify": "^1.0.0", "dot-prop": "^5.1.0" } }, "sha512-zHig5N+tPWARooBnb0Zx1MFcdfpyJrfTJ3Y5L+IFvUm8rM74hHz66z0gw0x4tijh5CorKkKUCnW82R2vmpeCRA=="],
 
     "config-chain": ["config-chain@1.1.13", "", { "dependencies": { "ini": "^1.3.4", "proto-list": "~1.2.1" } }, "sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ=="],
@@ -490,10 +519,14 @@
 
     "deep-extend": ["deep-extend@0.6.0", "", {}, "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA=="],
 
+    "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="],
+
     "dir-glob": ["dir-glob@3.0.1", "", { "dependencies": { "path-type": "^4.0.0" } }, "sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA=="],
 
     "dot-prop": ["dot-prop@5.3.0", "", { "dependencies": { "is-obj": "^2.0.0" } }, "sha512-QM8q3zDe58hqUqjraQOmzZ1LIH9SWQJTlEKCH4kJ2oQvLZk7RbQXvtDM2XEq3fwkV9CCvvH4LA0AV+ogFsBM2Q=="],
 
+    "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="],
+
     "duplexer2": ["duplexer2@0.1.4", "", { "dependencies": { "readable-stream": "^2.0.2" } }, "sha512-asLFVfWWtJ90ZyOUHMqk7/S2w2guQKxUI2itj3d92ADHhxUSbCMGi1f1cBcJ7xM1To+pE/Khbwo1yuNbMEPKeA=="],
 
     "eastasianwidth": ["eastasianwidth@0.2.0", "", {}, "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA=="],
@@ -512,10 +545,20 @@
 
     "error-ex": ["error-ex@1.3.4", "", { "dependencies": { "is-arrayish": "^0.2.1" } }, "sha512-sqQamAnR14VgCr1A618A3sGrygcpK+HEbenA/HiEAkkUwcZIIB/tgWqHFxWgOyDh4nB4JCRimh79dR5Ywc9MDQ=="],
 
+    "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="],
+
+    "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="],
+
+    "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="],
+
+    "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="],
+
     "escalade": ["escalade@3.2.0", "", {}, "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA=="],
 
     "escape-string-regexp": ["escape-string-regexp@5.0.0", "", {}, "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw=="],
 
+    "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="],
+
     "execa": ["execa@9.6.1", "", { "dependencies": { "@sindresorhus/merge-streams": "^4.0.0", "cross-spawn": "^7.0.6", "figures": "^6.1.0", "get-stream": "^9.0.0", "human-signals": "^8.0.1", "is-plain-obj": "^4.1.0", "is-stream": "^4.0.1", "npm-run-path": "^6.0.0", "pretty-ms": "^9.2.0", "signal-exit": "^4.1.0", "strip-final-newline": "^4.0.0", "yoctocolors": "^2.1.1" } }, "sha512-9Be3ZoN4LmYR90tUoVu2te2BsbzHfhJyfEiAVfz7N5/zv+jduIfLrV2xdQXOHbaD6KgpGdO9PRPM1Y4Q9QkPkA=="],
 
     "extend": ["extend@3.0.2", "", {}, "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g=="],
@@ -540,12 +583,20 @@
 
     "foreground-child": ["foreground-child@3.3.1", "", { "dependencies": { "cross-spawn": "^7.0.6", "signal-exit": "^4.0.1" } }, "sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw=="],
 
+    "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="],
+
+    "form-data-encoder": ["form-data-encoder@1.7.2", "", {}, "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="],
+
+    "formdata-node": ["formdata-node@4.4.1", "", { "dependencies": { "node-domexception": "1.0.0", "web-streams-polyfill": "4.0.0-beta.3" } }, "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ=="],
+
     "formdata-polyfill": ["formdata-polyfill@4.0.10", "", { "dependencies": { "fetch-blob": "^3.1.2" } }, "sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g=="],
 
     "from2": ["from2@2.3.0", "", { "dependencies": { "inherits": "^2.0.1", "readable-stream": "^2.0.0" } }, "sha512-OMcX/4IC/uqEPVgGeyfN22LJk6AZrMkRZHxcHBMBvHScDGgwTm2GT2Wkgtocyd3JfZffjj2kYUDXXII0Fk9W0g=="],
 
     "fs-extra": ["fs-extra@11.3.3", "", { "dependencies": { "graceful-fs": "^4.2.0", "jsonfile": "^6.0.1", "universalify": "^2.0.0" } }, "sha512-VWSRii4t0AFm6ixFFmLLx1t7wS1gh+ckoa84aOeapGum0h+EZd1EhEumSB+ZdDLnEPuucsVB9oB7cxJHap6Afg=="],
 
+    "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="],
+
     "function-timeout": ["function-timeout@1.0.2", "", {}, "sha512-939eZS4gJ3htTHAldmyyuzlrD58P03fHG49v2JfFXbV6OhvZKRC9j2yAtdHw/zrp2zXHuv05zMIy40F0ge7spA=="],
 
     "gaxios": ["gaxios@7.1.3", "", { "dependencies": { "extend": "^3.0.2", "https-proxy-agent": "^7.0.1", "node-fetch": "^3.3.2", "rimraf": "^5.0.1" } }, "sha512-YGGyuEdVIjqxkxVH1pUTMY/XtmmsApXrCVv5EU25iX6inEPbV+VakJfLealkBtJN69AQmh1eGOdCl9Sm1UP6XQ=="],
@@ -556,6 +607,10 @@
 
     "get-east-asian-width": ["get-east-asian-width@1.4.0", "", {}, "sha512-QZjmEOC+IT1uk6Rx0sX22V6uHWVwbdbxf1faPqJ1QhLdGgsRGCZoyaQBm/piRdJy/D2um6hM1UP7ZEeQ4EkP+Q=="],
 
+    "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="],
+
+    "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="],
+
     "get-stream": ["get-stream@6.0.1", "", {}, "sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg=="],
 
     "git-log-parser": ["git-log-parser@1.2.1", "", { "dependencies": { "argv-formatter": "~1.0.0", "spawn-error-forwarder": "~1.0.0", "split2": "~1.0.0", "stream-combiner2": "~1.1.1", "through2": "~2.0.0", "traverse": "0.6.8" } }, "sha512-PI+sPDvHXNPl5WNOErAK05s3j0lgwUzMN6o8cyQrDaKfT3qd7TmNJKeXX+SknI5I0QhG5fVPAEwSY4tRGDtYoQ=="],
@@ -566,6 +621,8 @@
 
     "google-logging-utils": ["google-logging-utils@1.1.3", "", {}, "sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA=="],
 
+    "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="],
+
     "graceful-fs": ["graceful-fs@4.2.11", "", {}, "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ=="],
 
     "gtoken": ["gtoken@8.0.0", "", { "dependencies": { "gaxios": "^7.0.0", "jws": "^4.0.0" } }, "sha512-+CqsMbHPiSTdtSO14O51eMNlrp9N79gmeqmXeouJOhfucAedHw9noVe/n5uJk3tbKE6a+6ZCQg3RPhVhHByAIw=="],
@@ -574,6 +631,12 @@
 
     "has-flag": ["has-flag@4.0.0", "", {}, "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ=="],
 
+    "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="],
+
+    "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="],
+
+    "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="],
+
     "highlight.js": ["highlight.js@10.7.3", "", {}, "sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A=="],
 
     "hook-std": ["hook-std@4.0.0", "", {}, "sha512-IHI4bEVOt3vRUDJ+bFA9VUJlo7SzvFARPNLw75pqSmAOP2HmTWfFJtPvLBrDrlgjEYXY9zs7SFdHPQaJShkSCQ=="],
@@ -586,6 +649,8 @@
 
     "human-signals": ["human-signals@8.0.1", "", {}, "sha512-eKCa6bwnJhvxj14kZk5NCPc6Hb6BdsU9DZcOnmQKSnO1VKrfV0zCvtttPZUsBvjmNDn8rpcJfpwSYnHBjc95MQ=="],
 
+    "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="],
+
     "husky": ["husky@9.1.7", "", { "bin": { "husky": "bin.js" } }, "sha512-5gs5ytaNjBrh5Ow3zrvdUUY+0VxIuWVL4i9irt6friV+BqdCfmV11CQTWMiBYWHbXhco+J1kHfTOUkePhCDvMA=="],
 
     "import-fresh": ["import-fresh@3.3.1", "", { "dependencies": { "parent-module": "^1.0.0", "resolve-from": "^4.0.0" } }, "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ=="],
@@ -674,6 +739,8 @@
 
     "marked-terminal": ["marked-terminal@7.3.0", "", { "dependencies": { "ansi-escapes": "^7.0.0", "ansi-regex": "^6.1.0", "chalk": "^5.4.1", "cli-highlight": "^2.1.11", "cli-table3": "^0.6.5", "node-emoji": "^2.2.0", "supports-hyperlinks": "^3.1.0" }, "peerDependencies": { "marked": ">=1 <16" } }, "sha512-t4rBvPsHc57uE/2nJOLmMbZCQ4tgAccAED3ngXQqW6g+TxA488JzJ+FK3lQkzBQOI1mRV/r/Kq+1ZlJ4D0owQw=="],
 
+    "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="],
+
     "meow": ["meow@13.2.0", "", {}, "sha512-pxQJQzB6djGPXh08dacEloMFopsOqGVRKFPYvPOt9XDZ1HasbgDZA74CJGreSU4G3Ak7EFJGoiH2auq+yXISgA=="],
 
     "merge-stream": ["merge-stream@2.0.0", "", {}, "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w=="],
@@ -682,6 +749,10 @@
 
     "mime": ["mime@4.1.0", "", { "bin": { "mime": "bin/cli.js" } }, "sha512-X5ju04+cAzsojXKes0B/S4tcYtFAJ6tTMuSPBEn9CPGlrWr8Fiw7qYeLT0XyH80HSoAoqWCaz+MWKh22P7G1cw=="],
 
+    "mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="],
+
+    "mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="],
+
     "mimic-fn": ["mimic-fn@2.1.0", "", {}, "sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg=="],
 
     "minimatch": ["minimatch@9.0.5", "", { "dependencies": { "brace-expansion": "^2.0.1" } }, "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow=="],
@@ -702,7 +773,7 @@
 
     "node-emoji": ["node-emoji@2.2.0", "", { "dependencies": { "@sindresorhus/is": "^4.6.0", "char-regex": "^1.0.2", "emojilib": "^2.4.0", "skin-tone": "^2.0.0" } }, "sha512-Z3lTE9pLaJF47NyMhd4ww1yFTAP8YhYI8SleJiHzM46Fgpm5cnNzSl9XfzFNqbaz+VlJrIj3fXQ4DeN1Rjm6cw=="],
 
-    "node-fetch": ["node-fetch@3.3.2", "", { "dependencies": { "data-uri-to-buffer": "^4.0.0", "fetch-blob": "^3.1.4", "formdata-polyfill": "^4.0.10" } }, "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA=="],
+    "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="],
 
     "normalize-package-data": ["normalize-package-data@8.0.0", "", { "dependencies": { "hosted-git-info": "^9.0.0", "semver": "^7.3.5", "validate-npm-package-license": "^3.0.4" } }, "sha512-RWk+PI433eESQ7ounYxIp67CYuVsS1uYSonX3kA6ps/3LWfjVQa/ptEg6Y3T6uAMq1mWpX9PQ+qx+QaHpsc7gQ=="],
 
@@ -866,6 +937,8 @@
 
     "to-regex-range": ["to-regex-range@5.0.1", "", { "dependencies": { "is-number": "^7.0.0" } }, "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ=="],
 
+    "tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="],
+
     "traverse": ["traverse@0.6.8", "", {}, "sha512-aXJDbk6SnumuaZSANd21XAo15ucCDE38H4fkqiGsc3MhCK+wOlZvLP9cB/TvpHT0mOyWgC4Z8EwRlzqYSUzdsA=="],
 
     "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="],
@@ -898,10 +971,14 @@
 
     "validate-npm-package-license": ["validate-npm-package-license@3.0.4", "", { "dependencies": { "spdx-correct": "^3.0.0", "spdx-expression-parse": "^3.0.0" } }, "sha512-DpKm2Ui/xN7/HQKCtpZxoRWBhZ9Z0kqtygG8XCgNQ8ZlDnxuQmWhj566j8fN4Cu3/JmbhsDo7fcAJq4s9h27Ew=="],
 
-    "web-streams-polyfill": ["web-streams-polyfill@3.3.3", "", {}, "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw=="],
+    "web-streams-polyfill": ["web-streams-polyfill@4.0.0-beta.3", "", {}, "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug=="],
 
     "web-worker": ["web-worker@1.2.0", "", {}, "sha512-PgF341avzqyx60neE9DD+XS26MMNMoUQRz9NOZwW32nPQrF6p77f1htcnjBSEV8BGMKZ16choqUG4hyI0Hx7mA=="],
 
+    "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="],
+
+    "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="],
+
     "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
 
     "wordwrap": ["wordwrap@1.0.0", "", {}, "sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q=="],
@@ -926,6 +1003,8 @@
 
     "@actions/http-client/undici": ["undici@5.29.0", "", { "dependencies": { "@fastify/busboy": "^2.0.0" } }, "sha512-raqeBD6NQK4SkWhQzeYKd1KmIG6dllBOTt55Rmkt4HtI9mwdWtJljnrXjAFUBLTSN67HWrOIZ3EPF4kjUw80Bg=="],
 
+    "@anthropic-ai/sdk/@types/node": ["@types/node@18.19.130", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg=="],
+
     "@aws-crypto/sha1-browser/@smithy/util-utf8": ["@smithy/util-utf8@2.3.0", "", { "dependencies": { "@smithy/util-buffer-from": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A=="],
 
     "@aws-crypto/sha256-browser/@smithy/util-utf8": ["@smithy/util-utf8@2.3.0", "", { "dependencies": { "@smithy/util-buffer-from": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A=="],
@@ -970,6 +1049,10 @@
 
     "fdir/picomatch": ["picomatch@4.0.3", "", {}, "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q=="],
 
+    "fetch-blob/web-streams-polyfill": ["web-streams-polyfill@3.3.3", "", {}, "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw=="],
+
+    "gaxios/node-fetch": ["node-fetch@3.3.2", "", { "dependencies": { "data-uri-to-buffer": "^4.0.0", "fetch-blob": "^3.1.4", "formdata-polyfill": "^4.0.10" } }, "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA=="],
+
     "import-fresh/resolve-from": ["resolve-from@4.0.0", "", {}, "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g=="],
 
     "load-json-file/parse-json": ["parse-json@4.0.0", "", { "dependencies": { "error-ex": "^1.3.1", "json-parse-better-errors": "^1.0.1" } }, "sha512-aOIos8bujGN93/8Ox/jPLh7RwVnPEysynVFE+fQZyg6jKELEHwzgKdLRFHUgXJL6kylijVSBC4BvN9OmsB48Rw=="],
@@ -1324,6 +1407,8 @@
 
     "wrap-ansi-cjs/strip-ansi": ["strip-ansi@6.0.1", "", { "dependencies": { "ansi-regex": "^5.0.1" } }, "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A=="],
 
+    "@anthropic-ai/sdk/@types/node/undici-types": ["undici-types@5.26.5", "", {}, "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="],
+
     "@aws-crypto/sha1-browser/@smithy/util-utf8/@smithy/util-buffer-from": ["@smithy/util-buffer-from@2.2.0", "", { "dependencies": { "@smithy/is-array-buffer": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA=="],
 
     "@aws-crypto/sha256-browser/@smithy/util-utf8/@smithy/util-buffer-from": ["@smithy/util-buffer-from@2.2.0", "", { "dependencies": { "@smithy/is-array-buffer": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA=="],
diff --git a/packages/shared/AGENTS.md b/packages/shared/AGENTS.md
new file mode 120000
index 0000000..681311e
--- /dev/null
+++ b/packages/shared/AGENTS.md
@@ -0,0 +1 @@
+CLAUDE.md
\ No newline at end of file
diff --git a/packages/shared/CLAUDE.md b/packages/shared/CLAUDE.md
new file mode 100644
index 0000000..1c9697d
--- /dev/null
+++ b/packages/shared/CLAUDE.md
@@ -0,0 +1,22 @@
+# @docx-corpus/shared
+
+Shared utilities used by all TypeScript packages. This is the foundation layer — every TS package depends on it.
+
+## What's here
+
+- **`db.ts`** — `DbClient` interface and `createDb()` factory. Uses `Bun.sql` (not pg/postgres.js). All pipeline stages (scrape, extract, embed, classify) read/write through this client.
+- **`storage.ts`** — `Storage` interface with `createR2Storage()` and `createLocalStorage()`. Abstracts Cloudflare R2 vs local filesystem.
+- **`ui.ts`** — Terminal formatting helpers (progress bars, headers, multi-line progress).
+- **`index.ts`** — Barrel exports.
+
+## Key types
+
+- `DocumentRecord` — the full row from the `documents` table. Every pipeline stage adds columns to this.
+- `DbClient` — interface with methods grouped by pipeline stage (scraping, extraction, embedding, classification).
+- `LLMClassificationData` — `{ id, documentType, documentTopic, confidence, model }` for the classification pipeline.
+
+## When modifying
+
+- Adding a new pipeline stage? Add fields to `DocumentRecord`, add methods to `DbClient` interface AND the `createDb()` implementation.
+- DB uses tagged template literals (`sql\`...\``) for parameterized queries. Use `sql.unsafe()` only when dynamic column names are needed.
+- Don't add external dependencies — this package only depends on Bun built-ins and `@aws-sdk/client-s3`.
diff --git a/packages/shared/db.ts b/packages/shared/db.ts
index 61a66d0..97cb5ac 100644
--- a/packages/shared/db.ts
+++ b/packages/shared/db.ts
@@ -30,10 +30,16 @@ export interface DocumentRecord {
   embedding_model: string | null;
   embedding: number[] | null;
 
-  // Classification data
+  // Classification data (clustering)
   cluster_id: number | null;
   cluster_label: string | null;
   classified_at: string | null;
+
+  // LLM classification data
+  document_type: string | null;
+  document_topic: string | null;
+  classification_confidence: number | null;
+  classification_model: string | null;
 }
 
 export interface ExtractionData {
@@ -62,6 +68,14 @@ export interface ClassificationData {
   classified_at?: string;
 }
 
+export interface LLMClassificationData {
+  id: string;
+  documentType: string;
+  documentTopic: string;
+  confidence: number;
+  model: string;
+}
+
 export interface DbClient {
   // Scraping methods (existing)
   upsertDocument(doc: Partial<DocumentRecord> & { id: string }): Promise<void>;
@@ -82,18 +96,24 @@ export interface DbClient {
 
   // Embedding methods (new)
   updateEmbedding(data: EmbeddingData): Promise<void>;
+  markEmbeddingSkipped(id: string, reason: string): Promise<void>;
   getUnembeddedDocuments(limit: number): Promise<DocumentRecord[]>;
   getEmbeddedDocuments(limit: number): Promise<DocumentRecord[]>;
   getDocumentsWithEmbeddings(limit: number): Promise<{ id: string; embedding: number[] }[]>;
 
-  // Classification methods (new)
+  // Classification methods (clustering)
   updateClassification(data: ClassificationData): Promise<void>;
   updateClassificationBatch(data: ClassificationData[]): Promise<void>;
   getUnclassifiedDocuments(limit: number): Promise<DocumentRecord[]>;
 
+  // LLM classification methods
+  updateLLMClassification(data: LLMClassificationData): Promise<void>;
+  updateLLMClassificationBatch(ids: string[], data: Omit<LLMClassificationData, "id">): Promise<void>;
+  getLLMClassificationStats(): Promise<{ classified: number; pending: number; byType: Record<string, number> }>;
+
   // Stats
   getExtractionStats(): Promise<{ extracted: number; pending: number; errors: number }>;
-  getEmbeddingStats(): Promise<{ embedded: number; pending: number }>;
+  getEmbeddingStats(): Promise<{ embedded: number; pending: number; skipped: number }>;
   getClassificationStats(): Promise<{ classified: number; pending: number; clusters: number }>;
 
   close(): Promise<void>;
@@ -258,6 +278,15 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
       );
     },
 
+    async markEmbeddingSkipped(id: string, reason: string) {
+      await sql`
+        UPDATE documents SET
+          embedded_at = ${new Date().toISOString()},
+          embedding_model = ${`skipped:${reason}`}
+        WHERE id = ${id}
+      `;
+    },
+
     async getUnembeddedDocuments(limit: number) {
       return sql<DocumentRecord[]>`
         SELECT * FROM documents
@@ -327,6 +356,59 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
       `;
     },
 
+    // ==================== LLM Classification Methods ====================
+
+    async updateLLMClassification(data: LLMClassificationData) {
+      await sql`
+        UPDATE documents SET
+          document_type = ${data.documentType},
+          document_topic = ${data.documentTopic},
+          classification_confidence = ${data.confidence},
+          classification_model = ${data.model}
+        WHERE id = ${data.id}
+      `;
+    },
+
+    async updateLLMClassificationBatch(ids: string[], data: Omit<LLMClassificationData, "id">) {
+      for (const id of ids) {
+        await sql`
+          UPDATE documents SET
+            document_type = ${data.documentType},
+            document_topic = ${data.documentTopic},
+            classification_confidence = ${data.confidence},
+            classification_model = ${data.model}
+          WHERE id = ${id}
+        `;
+      }
+    },
+
+    async getLLMClassificationStats() {
+      const result = await sql<{ classified: number; pending: number }[]>`
+        SELECT
+          COUNT(*) FILTER (WHERE document_type IS NOT NULL)::int as classified,
+          COUNT(*) FILTER (WHERE extracted_at IS NOT NULL AND extraction_error IS NULL AND document_type IS NULL)::int as pending
+        FROM documents
+      `;
+
+      const byTypeRows = await sql<{ type: string; count: number }[]>`
+        SELECT document_type as type, COUNT(*)::int as count
+        FROM documents
+        WHERE document_type IS NOT NULL
+        GROUP BY document_type
+      `;
+
+      const byType: Record<string, number> = {};
+      for (const row of byTypeRows) {
+        byType[row.type] = row.count;
+      }
+
+      return {
+        classified: result[0].classified,
+        pending: result[0].pending,
+        byType,
+      };
+    },
+
     // ==================== Stats ====================
 
     async getExtractionStats() {
@@ -341,10 +423,11 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
     },
 
     async getEmbeddingStats() {
-      const result = await sql<{ embedded: number; pending: number }[]>`
+      const result = await sql<{ embedded: number; pending: number; skipped: number }[]>`
         SELECT
-          COUNT(*) FILTER (WHERE embedded_at IS NOT NULL)::int as embedded,
-          COUNT(*) FILTER (WHERE extracted_at IS NOT NULL AND extraction_error IS NULL AND embedded_at IS NULL)::int as pending
+          COUNT(*) FILTER (WHERE embedded_at IS NOT NULL AND embedding_model NOT LIKE 'skipped:%')::int as embedded,
+          COUNT(*) FILTER (WHERE extracted_at IS NOT NULL AND extraction_error IS NULL AND embedded_at IS NULL)::int as pending,
+          COUNT(*) FILTER (WHERE embedding_model LIKE 'skipped:%')::int as skipped
         FROM documents
       `;
       return result[0];
diff --git a/packages/shared/index.ts b/packages/shared/index.ts
index 2891613..cc0cd4c 100644
--- a/packages/shared/index.ts
+++ b/packages/shared/index.ts
@@ -29,4 +29,5 @@ export {
   type ExtractionData,
   type EmbeddingData,
   type ClassificationData,
+  type LLMClassificationData,
 } from "./db";
diff --git a/scripts/classification/AGENTS.md b/scripts/classification/AGENTS.md
new file mode 120000
index 0000000..681311e
--- /dev/null
+++ b/scripts/classification/AGENTS.md
@@ -0,0 +1 @@
+CLAUDE.md
\ No newline at end of file
diff --git a/scripts/classification/CLAUDE.md b/scripts/classification/CLAUDE.md
new file mode 100644
index 0000000..9d9a0bf
--- /dev/null
+++ b/scripts/classification/CLAUDE.md
@@ -0,0 +1,38 @@
+# Classification Pipeline
+
+Python ML pipeline that classifies ~800K .docx documents by **document type** (10 classes) and **topic** (9 classes).
+
+Uses the FineWeb-Edu pattern: LLM labels a small sample → train lightweight classifier → apply at scale.
+
+## Pipeline steps (run in order)
+
+1. **`sample.py`** — Stratified sampling from PostgreSQL. Samples proportionally across languages (en, ru, cs, pl, es), stratified by word count terciles and source domain diversity.
+2. **`label.py`** — Async LLM labeling with Claude. Supports resume (appends to JSONL). Rate-limited with configurable parallelism.
+3. **`train.py`** — Fine-tunes two independent ModernBERT classifiers (document_type and topic). Outputs models to `./models/`.
+4. **`classify.py`** — Batch inference on the full corpus. Fetches text from R2, runs both models, writes results to PostgreSQL.
+5. **`evaluate.py`** — Quality metrics. Two modes: `labels` (analyzes JSONL) and `corpus` (queries DB).
+
+## Key files
+
+- **`taxonomy.json`** — Single source of truth for the 2D taxonomy (10 document types × 9 topics). Both prompt building and model training reference this.
+- **`common.py`** — Shared utilities: DB connection (`psycopg2`), text fetching from `https://docxcorp.us/extracted/`, taxonomy loading.
+- **`pyproject.toml`** — Python dependencies. Install with `pip install -e .` or `uv pip install -e .`.
+
+## Database
+
+Writes to the same `documents` table as the TS pipeline:
+- `document_type` — one of 10 types (legal, forms, reports, etc.)
+- `document_topic` — one of 9 topics (government, education, healthcare, etc.)
+- `classification_confidence` — min(type_confidence, topic_confidence)
+- `classification_model` — e.g. "claude-haiku-4-5" or "modernbert-v2.0.0"
+
+Connection via `DATABASE_URL` env var loaded from `../../.env`.
+
+## Conventions
+
+- Python 3.11+, no type stubs needed
+- Uses `psycopg2` for DB (not Bun.sql — this is Python)
+- Uses `python-dotenv` to load `.env` from project root
+- Text is fetched via HTTP from the public R2 endpoint, not direct R2 access
+- All scripts support `--help` for usage
+- JSONL files are the interchange format between steps
diff --git a/scripts/classification/README.md b/scripts/classification/README.md
new file mode 100644
index 0000000..bca3f5b
--- /dev/null
+++ b/scripts/classification/README.md
@@ -0,0 +1,81 @@
+# Document Classification Pipeline
+
+Classifies ~800K .docx documents using the FineWeb-Edu / TnT-LLM pattern:
+LLM labels a small sample → train ModernBERT classifier → apply at scale.
+
+## Two-Dimensional Taxonomy
+
+Each document gets classified on two independent dimensions:
+
+- **Document Type** (10 classes): legal, forms, reports, policies, educational, correspondence, technical, administrative, creative, reference
+- **Topic** (9 classes): government, education, healthcare, finance, legal_judicial, technology, environment, nonprofit, general
+
+## Pipeline Steps
+
+### 1. Sample (`sample.py`)
+
+Stratified sampling across languages, word count, and source domains.
+
+```bash
+python sample.py --total 3500 --output sampled_docs.jsonl
+```
+
+### 2. Label (`label.py`)
+
+LLM classification with Claude. Supports resume — safe to interrupt and restart.
+
+```bash
+python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl
+python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl --model claude-haiku-4-5 --parallel 5
+```
+
+### 3. Evaluate Labels (`evaluate.py labels`)
+
+Check label quality before training.
+
+```bash
+python evaluate.py labels --input labeled_docs.jsonl
+```
+
+### 4. Train (`train.py`)
+
+Fine-tune ModernBERT on labeled data. Trains two independent classifiers.
+
+```bash
+python train.py --input labeled_docs.jsonl
+python train.py --input labeled_docs.jsonl --epochs 5 --lr 2e-5 --output-dir ./models
+```
+
+### 5. Classify (`classify.py`)
+
+Apply trained models to the full corpus.
+
+```bash
+python classify.py --models-dir ./models
+python classify.py --models-dir ./models --batch-size 256 --dry-run --limit 100
+```
+
+### 6. Evaluate Corpus (`evaluate.py corpus`)
+
+Check full corpus classification distribution.
+
+```bash
+python evaluate.py corpus
+python evaluate.py corpus --languages en,ru
+```
+
+## Setup
+
+```bash
+pip install -e .
+```
+
+Required environment variables (`.env` in project root):
+- `DATABASE_URL` — PostgreSQL connection string
+- `ANTHROPIC_API_KEY` — For LLM labeling step only
+
+## Cost Estimate
+
+- **Labeling**: ~3,500 docs × Claude Haiku ≈ $2-5
+- **Training**: ~30 min on GPU (or ~2h on CPU)
+- **Inference**: ~800K docs, ~200-500 docs/sec on GPU
diff --git a/scripts/classification/classify.py b/scripts/classification/classify.py
new file mode 100644
index 0000000..411e743
--- /dev/null
+++ b/scripts/classification/classify.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python3
+"""
+Phase 4: Apply trained classifiers to the full corpus.
+
+Loads the trained ModernBERT models and classifies all unclassified documents.
+Fetches text from R2, runs inference, updates the database.
+
+Supports resume — already-classified documents are skipped.
+
+Usage:
+    python classify.py --models-dir ./models
+    python classify.py --models-dir ./models --batch-size 256 --languages en,ru,cs,pl,es
+    python classify.py --models-dir ./models --dry-run --limit 100
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+from common import (
+    fetch_documents_text_parallel,
+    get_db_connection,
+    load_taxonomy,
+    save_labels_to_db,
+)
+
+DEFAULT_BATCH_SIZE = 128
+DEFAULT_MAX_LENGTH = 512
+DEFAULT_MAX_CHARS = 2000
+
+
+def load_classifier(model_dir: str, device: torch.device):
+    """Load tokenizer and model from model_dir; model goes to device in eval mode."""
+    text_tokenizer = AutoTokenizer.from_pretrained(model_dir)
+    classifier = AutoModelForSequenceClassification.from_pretrained(model_dir)
+    # Inference only: move to the target device and leave training mode.
+    classifier.to(device).eval()
+    return text_tokenizer, classifier
+
+
+def get_unclassified_documents(
+    languages: list[str] | None = None,
+    limit: int | None = None,
+) -> list[dict]:
+    """Fetch extracted, non-empty documents that have no classification_model.
+
+    Rows come back in random order. `languages` (if non-empty) filters the
+    language column; `limit` caps the row count (None: no cap, 0: no rows).
+    """
+    # Must mirror the SELECT column order below.
+    columns = ("id", "source_url", "original_filename", "word_count", "language")
+
+    query = """
+        SELECT id, source_url, original_filename, word_count, language
+        FROM documents
+        WHERE extracted_at IS NOT NULL
+          AND extraction_error IS NULL
+          AND word_count > 0
+          AND classification_model IS NULL
+    """
+    params: list = []
+
+    if languages:
+        # One %s per language keeps the values parameterized.
+        placeholders = ",".join(["%s"] * len(languages))
+        query += f" AND language IN ({placeholders})"
+        params.extend(languages)
+
+    query += " ORDER BY random()"
+
+    # `is not None` so an explicit limit of 0 is honored, not read as "no limit".
+    if limit is not None:
+        query += " LIMIT %s"
+        params.append(limit)
+
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cur:
+            cur.execute(query, params)
+            return [dict(zip(columns, row)) for row in cur.fetchall()]
+    finally:
+        conn.close()
+
+
+def get_classification_stats() -> dict:
+    """
+    Return progress counts: total, classified, classifiable, remaining.
+    """
+    connection = get_db_connection()
+    try:
+        with connection.cursor() as cursor:
+            cursor.execute("""
+                SELECT
+                    COUNT(*)::int as total,
+                    COUNT(CASE WHEN classification_model IS NOT NULL THEN 1 END)::int as classified,
+                    COUNT(CASE WHEN extracted_at IS NOT NULL AND extraction_error IS NULL AND word_count > 0 THEN 1 END)::int as classifiable
+                FROM documents
+            """)
+            total, classified, classifiable = cursor.fetchone()
+        # "remaining" is classifiable documents minus all classified ones.
+        counts = {"total": total, "classified": classified, "classifiable": classifiable}
+        counts["remaining"] = classifiable - classified
+        return counts
+    finally:
+        connection.close()
+
+
+@torch.no_grad()
+def classify_batch(
+    texts: list[str],
+    tokenizer,
+    model,
+    max_length: int,
+    device: torch.device,
+) -> list[tuple[str, float]]:
+    """Classify texts in one forward pass; return (label, confidence) per text.
+
+    Confidence is the softmax probability of the predicted (argmax) class.
+    """
+    encoded = tokenizer(
+        texts,
+        truncation=True,
+        max_length=max_length,
+        padding=True,
+        return_tensors="pt",
+    ).to(device)
+
+    probabilities = torch.softmax(model(**encoded).logits, dim=-1)
+    top_probs, top_ids = probabilities.max(dim=-1)
+    id2label = model.config.id2label
+    return [
+        (id2label[int(class_id)], float(prob))
+        for class_id, prob in zip(top_ids.cpu().numpy(), top_probs.cpu().numpy())
+    ]
+
+
+def process_batch(
+    docs: list[dict],
+    type_tokenizer,
+    type_model,
+    topic_tokenizer,
+    topic_model,
+    max_length: int,
+    max_chars: int,
+    device: torch.device,
+    model_name: str,
+) -> list[dict]:
+    """Fetch text for docs, classify with both models, build DB label dicts.
+
+    Args:
+        docs: Document rows; only the "id" key is read here.
+        type_tokenizer, type_model: Document-type classifier pair.
+        topic_tokenizer, topic_model: Topic classifier pair.
+        max_length: Token limit handed to classify_batch.
+        max_chars: Character limit handed to the text fetcher.
+        device: Torch device handed to classify_batch.
+        model_name: Stored under "model" in every label dict.
+
+    Returns:
+        One dict per document with non-empty fetched text, with keys id,
+        document_type, document_topic, confidence (the lower of the two
+        model confidences) and model.
+    """
+    fetched = fetch_documents_text_parallel(
+        [doc["id"] for doc in docs], max_chars=max_chars
+    )
+
+    # Pair each doc with its text; docs whose text is empty are skipped.
+    paired = [(doc, fetched.get(doc["id"], "")) for doc in docs]
+    usable = [(doc, text) for doc, text in paired if text]
+    if not usable:
+        return []
+    texts = [text for _, text in usable]
+    type_out = classify_batch(texts, type_tokenizer, type_model, max_length, device)
+    topic_out = classify_batch(texts, topic_tokenizer, topic_model, max_length, device)
+
+    return [
+        {
+            "id": doc["id"],
+            "document_type": doc_type,
+            "document_topic": topic,
+            "confidence": min(type_conf, topic_conf),
+            "model": model_name,
+        }
+        for (doc, _), (doc_type, type_conf), (topic, topic_conf) in zip(
+            usable, type_out, topic_out
+        )
+    ]
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Classify full corpus with trained ModernBERT models"
+    )
+    parser.add_argument(
+        "--models-dir",
+        type=str,
+        required=True,
+        help="Directory containing trained models (from train.py)",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=DEFAULT_BATCH_SIZE,
+        help=f"Inference batch size (default: {DEFAULT_BATCH_SIZE})",
+    )
+    parser.add_argument(
+        "--max-length",
+        type=int,
+        default=DEFAULT_MAX_LENGTH,
+        help=f"Max token length (default: {DEFAULT_MAX_LENGTH})",
+    )
+    parser.add_argument(
+        "--max-chars",
+        type=int,
+        default=DEFAULT_MAX_CHARS,
+        help=f"Max text characters to fetch (default: {DEFAULT_MAX_CHARS})",
+    )
+    parser.add_argument(
+        "--languages",
+        type=str,
+        default=None,
+        help="Comma-separated language codes to classify (default: all)",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Max documents to classify (default: all)",
+    )
+    parser.add_argument(
+        "--db-batch-size",
+        type=int,
+        default=500,
+        help="DB update batch size (default: 500)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Classify but don't write to DB",
+    )
+    args = parser.parse_args()
+
+    # Both classifiers must exist at <models-dir>/{document_type,topic}/best
+    type_model_dir = os.path.join(args.models_dir, "document_type", "best")
+    topic_model_dir = os.path.join(args.models_dir, "topic", "best")
+
+    for d in [type_model_dir, topic_model_dir]:
+        if not os.path.exists(d):
+            print(f"ERROR: Model directory not found: {d}")
+            sys.exit(1)
+
+    # Model name written to the DB: "modernbert-<taxonomy_version>", else "modernbert-v2"
+    config_path = os.path.join(args.models_dir, "training_config.json")
+    if os.path.exists(config_path):
+        with open(config_path) as f:
+            train_config = json.load(f)
+        model_name = f"modernbert-{train_config.get('taxonomy_version', 'v2')}"
+    else:
+        model_name = "modernbert-v2"
+
+    # Pick the device: CUDA first, then Apple MPS, else CPU
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        device = torch.device("mps")
+    else:
+        device = torch.device("cpu")
+    print(f"Device: {device}")
+
+    # Load models
+    print(f"\nLoading document_type model from {type_model_dir}...")
+    type_tokenizer, type_model = load_classifier(type_model_dir, device)
+    print(f"Loading topic model from {topic_model_dir}...")
+    topic_tokenizer, topic_model = load_classifier(topic_model_dir, device)
+
+    # Corpus-wide progress before this run
+    stats = get_classification_stats()
+    print(f"\nCorpus stats:")
+    print(f"  Total documents: {stats['total']:,}")
+    print(f"  Classifiable: {stats['classifiable']:,}")
+    print(f"  Already classified: {stats['classified']:,}")
+    print(f"  Remaining: {stats['remaining']:,}")
+
+    # Get unclassified docs
+    languages = (
+        [l.strip() for l in args.languages.split(",")]
+        if args.languages
+        else None
+    )
+    print(f"\nFetching unclassified documents...")
+    docs = get_unclassified_documents(languages=languages, limit=args.limit)
+    print(f"  Found {len(docs):,} documents to classify")
+
+    if not docs:
+        print("Nothing to classify!")
+        return
+
+    if args.dry_run:
+        print("  (DRY RUN — will not write to database)")
+
+    # Process in batches
+    total_classified = 0
+    total_errors = 0
+    start_time = time.time()
+
+    # Docs are fetched AND classified in chunks of at most 100, so --batch-size above 100 is capped
+    fetch_batch_size = min(args.batch_size, 100)
+
+    pbar = tqdm(total=len(docs), desc="Classifying", unit="doc")
+
+    for i in range(0, len(docs), fetch_batch_size):
+        batch_docs = docs[i : i + fetch_batch_size]
+
+        labels = process_batch(
+            docs=batch_docs,
+            type_tokenizer=type_tokenizer,
+            type_model=type_model,
+            topic_tokenizer=topic_tokenizer,
+            topic_model=topic_model,
+            max_length=args.max_length,
+            max_chars=args.max_chars,
+            device=device,
+            model_name=model_name,
+        )
+
+        if labels and not args.dry_run:
+            save_labels_to_db(labels, batch_size=args.db_batch_size)
+
+        total_classified += len(labels)
+        total_errors += len(batch_docs) - len(labels)
+        pbar.update(len(batch_docs))
+
+        # Show throughput
+        elapsed = time.time() - start_time
+        rate = total_classified / elapsed if elapsed > 0 else 0
+        pbar.set_postfix_str(f"{rate:.0f} docs/s, {total_errors} errors")
+
+    pbar.close()
+
+    elapsed = time.time() - start_time
+    rate = total_classified / elapsed if elapsed > 0 else 0
+
+    print(f"\n{'=' * 60}")
+    print("Classification Complete")
+    print(f"{'=' * 60}")
+    print(f"  Classified: {total_classified:,}")
+    print(f"  Errors (no text): {total_errors:,}")
+    print(f"  Time: {elapsed:.1f}s ({rate:.0f} docs/s)")
+    print(f"  Model: {model_name}")
+
+    if not args.dry_run:
+        final_stats = get_classification_stats()
+        print(f"\n  Total classified in DB: {final_stats['classified']:,}")
+        print(f"  Remaining: {final_stats['remaining']:,}")
+
+    if args.dry_run:
+        print("\n  (DRY RUN — no changes written to database)")
+
+    # Label distribution of the final batch only (`labels` still holds its results)
+    if total_classified > 0 and labels:
+        print(f"\nSample distribution (last batch):")
+        type_counts: dict[str, int] = {}
+        topic_counts: dict[str, int] = {}
+        for label in labels:
+            dt = label["document_type"]
+            tp = label["document_topic"]
+            type_counts[dt] = type_counts.get(dt, 0) + 1
+            topic_counts[tp] = topic_counts.get(tp, 0) + 1
+
+        print("  Types:", dict(sorted(type_counts.items(), key=lambda x: -x[1])))
+        print("  Topics:", dict(sorted(topic_counts.items(), key=lambda x: -x[1])))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/classification/common.py b/scripts/classification/common.py
new file mode 100644
index 0000000..a83abdc
--- /dev/null
+++ b/scripts/classification/common.py
@@ -0,0 +1,130 @@
+"""
+Shared utilities for the classification pipeline.
+DB connection, text fetching, and common helpers.
+"""
+
+import json
+import os
+import urllib.error
+import urllib.request
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Optional
+
+import psycopg2
+from dotenv import load_dotenv
+
+# Load .env from project root
+env_path = Path(__file__).parent.parent.parent / ".env"
+load_dotenv(env_path)
+
+TEXT_BASE_URL = "https://docxcorp.us/extracted"
+
+
+def get_db_connection():
+    """Open a psycopg2 connection using the DATABASE_URL environment variable."""
+    dsn = os.environ.get("DATABASE_URL")
+    if dsn:
+        return psycopg2.connect(dsn)
+    raise ValueError("DATABASE_URL environment variable not set")
+
+
+def load_taxonomy(path: Optional[str] = None) -> dict:
+    """Load the taxonomy JSON (default: taxonomy.json beside this module)."""
+    if path is None:
+        path = Path(__file__).parent / "taxonomy.json"
+    with open(path, encoding="utf-8") as f:  # JSON is UTF-8; ignore locale default
+        return json.load(f)
+
+
+def fetch_document_text(doc_id: str, max_chars: int = 2000) -> str:
+    """Return the first max_chars characters of a document's extracted text.
+
+    Best-effort: any failure (HTTP status, timeout, decode error) returns "".
+    """
+    url = f"{TEXT_BASE_URL}/{doc_id}.txt"
+    try:
+        request = urllib.request.Request(
+            url, headers={"User-Agent": "docx-classifier/2.0"}
+        )
+        with urllib.request.urlopen(request, timeout=15) as response:
+            return response.read().decode("utf-8")[:max_chars]
+    except Exception:  # covers urllib.error.HTTPError (404 included) as well
+        return ""
+
+
+def fetch_documents_text_parallel(
+    doc_ids: list[str], max_chars: int = 2000, max_workers: int = 20
+) -> dict[str, str]:
+    """Fetch text for many documents concurrently; return {doc_id: text}.
+
+    Work is spread over a thread pool of max_workers threads.
+    """
+
+    def fetch_pair(doc_id: str) -> tuple[str, str]:
+        # Carry the id with its text so results can be keyed directly.
+        return doc_id, fetch_document_text(doc_id, max_chars)
+
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        return dict(pool.map(fetch_pair, doc_ids))
+
+
+def get_extraction_stats_by_language() -> list[dict]:
+    """Return per-language document count and average word count, largest first."""
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cur:
+            # language is selected next to aggregates, so it must be grouped.
+            cur.execute("""
+                SELECT
+                    language,
+                    COUNT(*)::int as count,
+                    ROUND(AVG(word_count))::int as avg_words
+                FROM documents
+                WHERE extracted_at IS NOT NULL
+                  AND extraction_error IS NULL
+                  AND language IS NOT NULL
+                GROUP BY language
+                ORDER BY count DESC
+            """)
+            keys = ("language", "count", "avg_words")
+            return [dict(zip(keys, row)) for row in cur.fetchall()]
+    finally:
+        conn.close()
+
+
+def save_labels_to_db(labels: list[dict], batch_size: int = 500) -> int:
+    """
+    Write classification labels to the documents table.
+
+    Each label dict: {id, document_type, document_topic, confidence, model}.
+    Rows are updated one statement at a time, with a commit after every
+    batch_size labels. Returns the number of UPDATE statements executed.
+    If an exception is raised, the connection is closed without committing
+    the batch that was in progress.
+    """
+    # Same statement for every label; only the bound parameters change.
+    update_sql = """
+                        UPDATE documents SET
+                            document_type = %s,
+                            document_topic = %s,
+                            classification_confidence = %s,
+                            classification_model = %s
+                        WHERE id = %s
+                        """
+    # Parameter order must match the %s placeholders above.
+    fields = ("document_type", "document_topic", "confidence", "model", "id")
+
+    written = 0
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cur:
+            for start in range(0, len(labels), batch_size):
+                for label in labels[start : start + batch_size]:
+                    cur.execute(update_sql, tuple(label[key] for key in fields))
+                    written += 1
+                # Commit once the whole batch has been sent.
+                conn.commit()
+    finally:
+        conn.close()
+    return written
diff --git a/scripts/classification/evaluate.py b/scripts/classification/evaluate.py
new file mode 100644
index 0000000..e229dcf
--- /dev/null
+++ b/scripts/classification/evaluate.py
@@ -0,0 +1,303 @@
+#!/usr/bin/env python3
+"""
+Evaluate classification quality.
+
+Can evaluate:
+  1. LLM labels (from label.py) — confidence distribution, class balance
+  2. Trained models (from train.py) — val set metrics
+  3. Full corpus (from classify.py) — distribution analysis from DB
+
+Usage:
+    python evaluate.py labels --input labeled_docs.jsonl
+    python evaluate.py corpus
+    python evaluate.py corpus --languages en,ru
+"""
+
+import argparse
+import json
+import os
+import sys
+
+from common import get_db_connection, load_taxonomy
+
+
+def evaluate_labels(input_path: str, taxonomy: dict):
+    """Print distribution, confidence and coverage stats for an LLM-labeled JSONL file."""
+    docs = []
+    with open(input_path) as f:
+        for line in f:
+            if line.strip():
+                docs.append(json.loads(line))
+
+    print(f"\n{'=' * 60}")
+    print(f"LLM Label Evaluation ({len(docs)} documents)")
+    print(f"{'=' * 60}")
+
+    # Tally labels per type/topic/language and collect per-class confidences
+    type_counts: dict[str, int] = {}
+    topic_counts: dict[str, int] = {}
+    lang_counts: dict[str, int] = {}
+
+    type_confs: dict[str, list[float]] = {}
+    topic_confs: dict[str, list[float]] = {}
+
+    for doc in docs:
+        dt = doc.get("document_type", "unknown")
+        tp = doc.get("document_topic", "unknown")
+        lang = doc.get("language") or "unknown"  # JSON null would break the :<5s format below
+
+        type_counts[dt] = type_counts.get(dt, 0) + 1
+        topic_counts[tp] = topic_counts.get(tp, 0) + 1
+        lang_counts[lang] = lang_counts.get(lang, 0) + 1
+
+        type_confs.setdefault(dt, []).append(doc.get("type_confidence", 0))
+        topic_confs.setdefault(tp, []).append(doc.get("topic_confidence", 0))
+
+    # Document type distribution
+    print("\nDocument Type Distribution:")
+    print(f"  {'Type':<20s} {'Count':>6s} {'%':>7s} {'Avg Conf':>10s}")
+    print(f"  {'-' * 45}")
+    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
+        pct = 100 * c / len(docs)
+        avg_conf = sum(type_confs[t]) / len(type_confs[t]) if type_confs.get(t) else 0
+        print(f"  {t:<20s} {c:>6d} {pct:>6.1f}% {avg_conf:>9.3f}")
+
+    # Topic distribution
+    print("\nTopic Distribution:")
+    print(f"  {'Topic':<20s} {'Count':>6s} {'%':>7s} {'Avg Conf':>10s}")
+    print(f"  {'-' * 45}")
+    for t, c in sorted(topic_counts.items(), key=lambda x: -x[1]):
+        pct = 100 * c / len(docs)
+        avg_conf = sum(topic_confs[t]) / len(topic_confs[t]) if topic_confs.get(t) else 0
+        print(f"  {t:<20s} {c:>6d} {pct:>6.1f}% {avg_conf:>9.3f}")
+
+    # Language distribution
+    print("\nLanguage Distribution:")
+    for lang, c in sorted(lang_counts.items(), key=lambda x: -x[1]):
+        pct = 100 * c / len(docs)
+        print(f"  {lang:<5s}: {c:>6d} ({pct:.1f}%)")
+
+    # Overall confidence stats
+    all_confs = [doc.get("confidence", 0) for doc in docs]
+    if all_confs:
+        avg = sum(all_confs) / len(all_confs)
+        low = sum(1 for c in all_confs if c < 0.5)
+        med = sum(1 for c in all_confs if 0.5 <= c < 0.8)
+        high = sum(1 for c in all_confs if c >= 0.8)
+        print(f"\nConfidence Distribution:")
+        print(f"  Mean: {avg:.3f}")
+        print(f"  High (>=0.8): {high:>5d} ({100 * high / len(all_confs):.1f}%)")
+        print(f"  Medium (0.5-0.8): {med:>5d} ({100 * med / len(all_confs):.1f}%)")
+        print(f"  Low (<0.5): {low:>5d} ({100 * low / len(all_confs):.1f}%)")
+
+    # Parse failures (identified by the fixed reasoning marker on the record)
+    failed = sum(1 for doc in docs if doc.get("reasoning") == "Failed to parse LLM response")
+    if failed:
+        print(f"\n  Parse failures: {failed} ({100 * failed / len(docs):.1f}%)")
+
+    # Cross-tabulation (type x topic); the heading must match the [:10] slice below
+    print("\nType x Topic Cross-tabulation (top 10 combos):")
+    combos: dict[str, int] = {}
+    for doc in docs:
+        key = f"{doc.get('document_type', '?')} + {doc.get('document_topic', '?')}"
+        combos[key] = combos.get(key, 0) + 1
+    for combo, c in sorted(combos.items(), key=lambda x: -x[1])[:10]:
+        pct = 100 * c / len(docs)
+        print(f"  {combo:<40s} {c:>5d} ({pct:.1f}%)")
+
+    # Check for taxonomy coverage: ids defined in the taxonomy but never assigned
+    valid_types = {t["id"] for t in taxonomy["document_types"]}
+    valid_topics = {t["id"] for t in taxonomy["topics"]}
+    missing_types = valid_types - set(type_counts.keys())
+    missing_topics = valid_topics - set(topic_counts.keys())
+    if missing_types:
+        print(f"\n  Unused document types: {missing_types}")
+    if missing_topics:
+        print(f"  Unused topics: {missing_topics}")
+
+def evaluate_corpus(languages: list[str] | None = None):
+    """Print classification coverage and label/confidence distributions read from the DB."""
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cur:
+            # Overall stats (never language-filtered)
+            cur.execute("""
+                SELECT
+                    COUNT(*)::int as total,
+                    COUNT(CASE WHEN classification_model IS NOT NULL THEN 1 END)::int as classified,
+                    COUNT(CASE WHEN extracted_at IS NOT NULL AND extraction_error IS NULL THEN 1 END)::int as extracted
+                FROM documents
+            """)
+            row = cur.fetchone()
+            total, classified, extracted = row
+
+            print(f"\n{'=' * 60}")
+            print(f"Corpus Classification Status")
+            print(f"{'=' * 60}")
+            print(f"  Total documents: {total:,}")
+            print(f"  Extracted: {extracted:,}")
+            print(f"  Classified: {classified:,} ({100 * classified / max(extracted, 1):.1f}% of extracted)")
+
+            if classified == 0:
+                print("\n  No classified documents yet.")
+                return
+
+            # Build language filter: only placeholders are interpolated, values are bound
+            lang_filter = ""
+            params: list = []
+            if languages:
+                placeholders = ",".join(["%s"] * len(languages))
+                lang_filter = f"AND language IN ({placeholders})"
+                params = list(languages)
+
+            # Document type distribution
+            cur.execute(
+                f"""
+                SELECT document_type, COUNT(*)::int as count,
+                       ROUND(AVG(classification_confidence)::numeric, 3) as avg_conf
+                FROM documents
+                WHERE classification_model IS NOT NULL {lang_filter}
+                GROUP BY document_type
+                ORDER BY count DESC
+                """,
+                params,
+            )
+            rows = cur.fetchall()
+
+            scope = f" (languages: {','.join(languages)})" if languages else ""
+            print(f"\nDocument Type Distribution{scope}:")
+            print(f"  {'Type':<20s} {'Count':>8s} {'%':>7s} {'Avg Conf':>10s}")
+            print(f"  {'-' * 47}")
+            row_total = max(sum(r[1] for r in rows), 1)  # divisor only; the filter may match no rows
+            for dt, count, avg_conf in rows:
+                pct = 100 * count / row_total
+                print(f"  {dt or 'null':<20s} {count:>8,d} {pct:>6.1f}% {avg_conf or 0:>9.3f}")
+
+            # Topic distribution
+            cur.execute(
+                f"""
+                SELECT document_topic, COUNT(*)::int as count,
+                       ROUND(AVG(classification_confidence)::numeric, 3) as avg_conf
+                FROM documents
+                WHERE classification_model IS NOT NULL {lang_filter}
+                GROUP BY document_topic
+                ORDER BY count DESC
+                """,
+                params,
+            )
+            rows = cur.fetchall()
+
+            print(f"\nTopic Distribution{scope}:")
+            print(f"  {'Topic':<20s} {'Count':>8s} {'%':>7s} {'Avg Conf':>10s}")
+            print(f"  {'-' * 47}")
+            for tp, count, avg_conf in rows:
+                pct = 100 * count / row_total
+                print(f"  {tp or 'null':<20s} {count:>8,d} {pct:>6.1f}% {avg_conf or 0:>9.3f}")
+
+            # By language
+            cur.execute(
+                f"""
+                SELECT language, COUNT(*)::int as count
+                FROM documents
+                WHERE classification_model IS NOT NULL {lang_filter}
+                GROUP BY language
+                ORDER BY count DESC
+                LIMIT 20
+                """,
+                params,
+            )
+            rows = cur.fetchall()
+
+            print(f"\nBy Language (top 20):")
+            for lang, count in rows:
+                pct = 100 * count / row_total
+                print(f"  {lang or '?':<5s}: {count:>8,d} ({pct:.1f}%)")
+
+            # Confidence distribution (an aggregate without GROUP BY: always exactly one row)
+            cur.execute(
+                f"""
+                SELECT
+                    COUNT(CASE WHEN classification_confidence >= 0.8 THEN 1 END)::int as high,
+                    COUNT(CASE WHEN classification_confidence >= 0.5 AND classification_confidence < 0.8 THEN 1 END)::int as med,
+                    COUNT(CASE WHEN classification_confidence < 0.5 THEN 1 END)::int as low,
+                    ROUND(AVG(classification_confidence)::numeric, 3) as avg
+                FROM documents
+                WHERE classification_model IS NOT NULL {lang_filter}
+                """,
+                params,
+            )
+            row = cur.fetchone()
+            print(f"\nConfidence Distribution:")
+            print(f"  Mean: {row[3]}")
+            print(f"  High (>=0.8): {row[0]:>8,d} ({100 * row[0] / row_total:.1f}%)")
+            print(f"  Medium (0.5-0.8): {row[1]:>8,d} ({100 * row[1] / row_total:.1f}%)")
+            print(f"  Low (<0.5): {row[2]:>8,d} ({100 * row[2] / row_total:.1f}%)")
+
+            # Classification model used
+            cur.execute(
+                f"""
+                SELECT classification_model, COUNT(*)::int
+                FROM documents
+                WHERE classification_model IS NOT NULL {lang_filter}
+                GROUP BY classification_model
+                """,
+                params,
+            )
+            rows = cur.fetchall()
+            print(f"\nModels Used:")
+            for model, count in rows:
+                print(f"  {model}: {count:,}")
+
+    finally:
+        conn.close()
+
+
+def main():
+    """CLI entry point: dispatch to the `labels` or `corpus` evaluation mode."""
+    cli = argparse.ArgumentParser(description="Evaluate classification quality")
+    modes = cli.add_subparsers(dest="command", help="Evaluation mode")
+
+    # `labels`: inspect a labeled JSONL file
+    labels_cmd = modes.add_parser("labels", help="Evaluate LLM-labeled data")
+    labels_cmd.add_argument(
+        "--input",
+        type=str,
+        required=True,
+        help="Labeled JSONL file",
+    )
+
+    # `corpus`: inspect the classification columns stored in the database
+    corpus_cmd = modes.add_parser(
+        "corpus",
+        help="Evaluate corpus classification from DB",
+    )
+    corpus_cmd.add_argument(
+        "--languages",
+        type=str,
+        default=None,
+        help="Comma-separated language codes to filter",
+    )
+
+    args = cli.parse_args()
+    if not args.command:
+        cli.print_help()
+        sys.exit(1)
+
+    # Loaded for every mode, although only `labels` passes it on
+    taxonomy = load_taxonomy()
+
+    if args.command == "labels":
+        if not os.path.exists(args.input):
+            print(f"ERROR: File not found: {args.input}")
+            sys.exit(1)
+        evaluate_labels(args.input, taxonomy)
+    elif args.command == "corpus":
+        selected = None
+        if args.languages:
+            selected = [code.strip() for code in args.languages.split(",")]
+        evaluate_corpus(languages=selected)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/classification/label.py b/scripts/classification/label.py
new file mode 100644
index 0000000..8eb0575
--- /dev/null
+++ b/scripts/classification/label.py
@@ -0,0 +1,375 @@
+#!/usr/bin/env python3
+"""
+Phase 2, Step 2: LLM-label sampled documents using Claude.
+
+Reads a JSONL of sampled documents, fetches their text, sends to Claude
+for classification, and saves labeled results.
+
+Supports resume — already-labeled documents are skipped.
+
+Usage:
+    python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl
+    python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl --model claude-haiku-4-5
+    python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl --parallel 5
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import sys
+import time
+from pathlib import Path
+
+import anthropic
+from tqdm import tqdm
+
+from common import fetch_documents_text_parallel, load_taxonomy
+
+# Rate limiting defaults
+DEFAULT_PARALLEL = 5
+DEFAULT_RPM = 50
+DEFAULT_DELAY = 1.5
+
+
+def build_classification_prompt(taxonomy: dict, text: str, filename: str | None) -> str:
+    """Render the two-dimension classification prompt sent to Claude for one document."""
+
+    def bullets(entries: list[dict]) -> str:
+        # One markdown bullet per taxonomy entry: bold id, label, description
+        lines = [f"  - **{e['id']}**: {e['label']} — {e['description']}" for e in entries]
+        return "\n".join(lines)
+
+    doc_types = bullets(taxonomy["document_types"])
+    topics = bullets(taxonomy["topics"])
+    # The filename line is only emitted when a filename is available
+    filename_line = "" if not filename else f"\n- Filename: {filename}"
+
+    return f"""Classify this document along two independent dimensions.
+
+## Dimension 1: Document Type (what kind of document is this?)
+{doc_types}
+
+## Dimension 2: Topic (what subject domain does it belong to?)
+{topics}
+
+## Document{filename_line}
+\"\"\"
+{text}
+\"\"\"
+
+## Instructions
+1. Read the document text carefully
+2. Choose the BEST matching document_type from Dimension 1
+3. Choose the BEST matching topic from Dimension 2
+4. These are INDEPENDENT — a "legal" type document can have any topic, and vice versa
+5. Provide confidence scores (0.0-1.0) for each choice
+
+Respond with ONLY this JSON:
+{{"document_type": {{"id": "<type_id>", "confidence": <0.0-1.0>}}, "topic": {{"id": "<topic_id>", "confidence": <0.0-1.0>}}, "reasoning": "<1 sentence>"}}"""
+
+
+def parse_llm_response(text: str, taxonomy: dict) -> dict:
+    """
+    Parse and validate LLM classification response.
+
+    Unknown ids fall back to "general", else the taxonomy's first entry (list
+    order, so stable across runs); unparseable text gives a zero-confidence record.
+    """
+    import re
+
+    type_ids = [t["id"] for t in taxonomy["document_types"]]
+    topic_ids = [t["id"] for t in taxonomy["topics"]]
+    valid_types, valid_topics = set(type_ids), set(topic_ids)
+    fallback_type = "general" if "general" in valid_types else next(iter(type_ids), "general")
+    fallback_topic = "general" if "general" in valid_topics else next(iter(topic_ids), "general")
+
+    try:
+        json_match = re.search(r"\{[\s\S]*\}", text)
+        if json_match:
+            result = json.loads(json_match.group())
+            doc_type = result["document_type"]["id"]
+            topic = result["topic"]["id"]
+            type_conf = float(result["document_type"]["confidence"])
+            topic_conf = float(result["topic"]["confidence"])
+            return {
+                "document_type": doc_type if doc_type in valid_types else fallback_type,
+                "document_topic": topic if topic in valid_topics else fallback_topic,
+                "type_confidence": type_conf,
+                "topic_confidence": topic_conf,
+                "confidence": min(type_conf, topic_conf),
+                "reasoning": result.get("reasoning", ""),
+            }
+    except (json.JSONDecodeError, KeyError, TypeError, ValueError):
+        pass  # malformed payload: fall through to the parse-failure record
+
+    return {
+        "document_type": fallback_type,
+        "document_topic": fallback_topic,
+        "type_confidence": 0.0,
+        "topic_confidence": 0.0,
+        "confidence": 0.0,
+        "reasoning": "Failed to parse LLM response",
+    }
+
+
+def load_existing_results(output_path: str) -> dict[str, dict]:
+    """Return previously written results keyed by document id (empty if no file yet)."""
+    labeled: dict[str, dict] = {}
+    if not os.path.exists(output_path):
+        return labeled
+    with open(output_path) as handle:
+        rows = (json.loads(raw) for raw in handle if raw.strip())
+        for row in rows:
+            labeled[row["id"]] = row
+    return labeled
+
+
+def append_result(output_path: str, result: dict):
+    """Serialize one labeled result as a JSON line at the end of the output file."""
+    with open(output_path, "a") as out:
+        out.write(f"{json.dumps(result)}\n")
+
+
+async def label_documents(
+    docs: list[dict],
+    taxonomy: dict,
+    output_path: str,
+    model: str,
+    max_parallel: int,
+    rpm: int,
+    delay: float,
+):
+    """Label docs with Claude (rate-limited, resumable); return all results keyed by id."""
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        print("ERROR: ANTHROPIC_API_KEY environment variable not set")
+        sys.exit(1)
+
+    client = anthropic.AsyncAnthropic(api_key=api_key)
+    semaphore = asyncio.Semaphore(max_parallel)
+
+    # Load existing results for resume
+    existing = load_existing_results(output_path)
+    remaining = [d for d in docs if d["id"] not in existing]
+
+    if existing:
+        print(f"  Resuming: {len(existing)} already labeled, {len(remaining)} remaining")
+
+    if not remaining:
+        print("  All documents already labeled!")
+        return existing
+
+    # Prefetch text for all remaining documents
+    print(f"  Fetching text for {len(remaining)} documents...")
+    doc_ids = [d["id"] for d in remaining]
+
+    # Fetch in batches to avoid overwhelming the server
+    all_texts = {}
+    batch_size = 100
+    for i in range(0, len(doc_ids), batch_size):
+        batch = doc_ids[i : i + batch_size]
+        texts = fetch_documents_text_parallel(batch, max_chars=2000)
+        all_texts.update(texts)
+        if i + batch_size < len(doc_ids):
+            print(f"    Fetched text: {min(i + batch_size, len(doc_ids))}/{len(doc_ids)}")
+
+    print(f"  Text fetched. Starting LLM classification with {model}...")
+
+    # Rate limiting: requests are spaced by the larger of 60/rpm and `delay`
+    interval = 60.0 / rpm
+    last_request = [0.0]
+    lock = asyncio.Lock()
+
+    async def rate_limit():
+        async with lock:
+            now = time.time()
+            wait = last_request[0] + max(interval, delay) - now
+            if wait > 0:
+                await asyncio.sleep(wait)
+            last_request[0] = time.time()
+
+    completed, errors = [0], [0]  # one-element lists so the closure can mutate them
+    pbar = tqdm(total=len(remaining), desc="Labeling", unit="doc")
+
+    async def classify_one(doc: dict) -> dict | None:
+        text = all_texts.get(doc["id"], "")
+        if not text:
+            errors[0] += 1
+            pbar.update(1)
+            return None
+
+        prompt = build_classification_prompt(taxonomy, text, doc.get("original_filename"))
+
+        max_retries = 5
+        for attempt in range(max_retries):
+            await rate_limit()
+            async with semaphore:
+                try:
+                    response = await client.messages.create(
+                        model=model,
+                        max_tokens=256,
+                        messages=[{"role": "user", "content": prompt}],
+                    )
+                    classification = parse_llm_response(response.content[0].text, taxonomy)
+
+                    result = {
+                        "id": doc["id"],
+                        "language": doc.get("language"),
+                        "word_count": doc.get("word_count"),
+                        "original_filename": doc.get("original_filename"),
+                        "source_url": doc.get("source_url"),
+                        **classification,
+                        "model": model,
+                    }
+
+                    append_result(output_path, result)
+                    completed[0] += 1
+                    pbar.update(1)
+                    return result
+
+                except anthropic.RateLimitError:
+                    wait = min(10 * (2**attempt), 60)
+                    pbar.set_postfix_str(f"rate limited, waiting {wait}s")
+                    await asyncio.sleep(wait)
+                except Exception as e:
+                    if attempt == max_retries - 1:
+                        errors[0] += 1
+                        pbar.update(1)
+                        pbar.set_postfix_str(f"error: {str(e)[:50]}")
+                        return None
+                    await asyncio.sleep(2)
+
+        # Reached only when the final attempt was rate limited: count the doc as failed
+        errors[0] += 1
+        pbar.update(1)
+        return None
+
+    # Process all documents
+    tasks = [classify_one(doc) for doc in remaining]
+    results = await asyncio.gather(*tasks)
+    pbar.close()
+
+    # Merge with existing
+    all_results = dict(existing)
+    for r in results:
+        if r:
+            all_results[r["id"]] = r
+
+    print(f"\n  Completed: {completed[0]}, Errors: {errors[0]}")
+    return all_results
+
+
+def print_summary(output_path: str, taxonomy: dict):
+    """Print type/topic/language distributions and confidence stats for the output file."""
+    results = load_existing_results(output_path)
+    if not results:
+        return
+
+    print(f"\n{'=' * 60}")
+    print(f"Classification Summary ({len(results)} documents)")
+    print(f"{'=' * 60}")
+
+    # Tally by document type, topic and language
+    type_counts: dict[str, int] = {}
+    topic_counts: dict[str, int] = {}
+    lang_counts: dict[str, int] = {}
+
+    for r in results.values():
+        dt = r.get("document_type", "unknown")
+        tp = r.get("document_topic", "unknown")
+        lang = r.get("language") or "unknown"  # stored null would break the :5s format below
+        type_counts[dt] = type_counts.get(dt, 0) + 1
+        topic_counts[tp] = topic_counts.get(tp, 0) + 1
+        lang_counts[lang] = lang_counts.get(lang, 0) + 1
+
+    print("\nBy Document Type:")
+    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
+        pct = 100 * c / len(results)
+        print(f"  {t:20s}: {c:5d} ({pct:5.1f}%)")
+
+    print("\nBy Topic:")
+    for t, c in sorted(topic_counts.items(), key=lambda x: -x[1]):
+        pct = 100 * c / len(results)
+        print(f"  {t:20s}: {c:5d} ({pct:5.1f}%)")
+
+    print("\nBy Language:")
+    for t, c in sorted(lang_counts.items(), key=lambda x: -x[1]):
+        pct = 100 * c / len(results)
+        print(f"  {t:5s}: {c:5d} ({pct:5.1f}%)")
+
+    # Confidence stats (records without a "confidence" key are left out)
+    confidences = [r["confidence"] for r in results.values() if "confidence" in r]
+    if confidences:
+        avg = sum(confidences) / len(confidences)
+        low = sum(1 for c in confidences if c < 0.6)
+        print(f"\nConfidence: avg={avg:.2f}, <60%={low} ({100*low/len(confidences):.1f}%)")
+
+
+def main():
+    """CLI entry point: validate arguments, label the sampled docs, print a summary."""
+    parser = argparse.ArgumentParser(
+        description="LLM-label sampled documents using Claude"
+    )
+    parser.add_argument(
+        "--input", type=str, required=True, help="Input JSONL from sample.py"
+    )
+    parser.add_argument(
+        "--output", type=str, default="labeled_docs.jsonl", help="Output JSONL"
+    )
+    parser.add_argument(
+        "--model", type=str, default="claude-haiku-4-5", help="Claude model"
+    )
+    parser.add_argument(
+        "--parallel", type=int, default=DEFAULT_PARALLEL, help="Max parallel requests"
+    )
+    parser.add_argument(
+        "--rpm", type=int, default=DEFAULT_RPM, help="Requests per minute limit"
+    )
+    parser.add_argument(
+        "--delay", type=float, default=DEFAULT_DELAY, help="Min delay between requests"
+    )
+    args = parser.parse_args()
+
+    # rpm is used as a divisor and parallel sizes the semaphore in label_documents
+    if args.parallel < 1 or args.rpm < 1 or args.delay < 0:
+        parser.error("--parallel and --rpm must be >= 1; --delay must be >= 0")
+    if not os.path.exists(args.input):
+        print(f"ERROR: Input file not found: {args.input}")
+        sys.exit(1)
+
+    # Load taxonomy
+    taxonomy = load_taxonomy()
+    print(f"Taxonomy: {taxonomy['name']} v{taxonomy['version']}")
+    print(f"  Document types: {len(taxonomy['document_types'])}")
+    print(f"  Topics: {len(taxonomy['topics'])}")
+
+    # Load sampled documents
+    with open(args.input) as f:
+        docs = [json.loads(line) for line in f if line.strip()]
+
+    print(f"\nDocuments to label: {len(docs)}")
+    print(f"Model: {args.model}")
+    print(f"Parallel: {args.parallel}, RPM: {args.rpm}, Delay: {args.delay}s\n")
+
+    # Run labeling
+    asyncio.run(
+        label_documents(
+            docs=docs,
+            taxonomy=taxonomy,
+            output_path=args.output,
+            model=args.model,
+            max_parallel=args.parallel,
+            rpm=args.rpm,
+            delay=args.delay,
+        )
+    )
+
+    # Print summary
+    print_summary(args.output, taxonomy)
+
+    print(f"\nNext step: python train.py --input {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/classification/pyproject.toml b/scripts/classification/pyproject.toml
new file mode 100644
index 0000000..9e2aa75
--- /dev/null
+++ b/scripts/classification/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "docx-corpus-classifier"
+version = "0.1.0"
+description = "ML classification pipeline for docx-corpus"
+requires-python = ">=3.11"
+dependencies = [
+    "anthropic>=0.39.0",
+    "psycopg2-binary>=2.9.0",
+    "python-dotenv>=1.0.0",
+    "numpy>=1.26.0",
+    "tqdm>=4.66.0",
+    # ML / Training
+    "transformers>=4.47.0",
+    "torch>=2.5.0",
+    "datasets>=3.0.0",
+    "scikit-learn>=1.5.0",
+    "accelerate>=1.0.0",
+]
diff --git a/scripts/classification/sample.py b/scripts/classification/sample.py
new file mode 100644
index 0000000..2b00ea6
--- /dev/null
+++ b/scripts/classification/sample.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""
+Phase 2, Step 1: Create a stratified sample of documents for LLM labeling.
+
+Samples documents across the top 5 languages, stratified by:
+- Language (proportional to corpus representation)
+- Word count (small/medium/large terciles)
+- Source domain diversity
+
+Usage:
+    python sample.py --total 3500 --output sampled_docs.jsonl
+    python sample.py --total 3500 --output sampled_docs.jsonl --languages en,ru,cs,pl,es
+"""
+
+import argparse
+import json
+import random
+import sys
+from urllib.parse import urlparse
+
+from common import get_db_connection
+
+# Default top 5 languages; per-language sample sizes are computed proportionally in main()
+DEFAULT_LANGUAGES = ["en", "ru", "cs", "pl", "es"]
+
+
+def get_documents_for_language(
+    language: str, limit: int = 100000
+) -> list[dict]:
+    """Return up to `limit` randomly ordered extracted documents in `language`."""
+    # Column order must match the SELECT list below
+    columns = ("id", "source_url", "original_filename", "word_count", "file_size_bytes")
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                SELECT id, source_url, original_filename, word_count, file_size_bytes
+                FROM documents
+                WHERE extracted_at IS NOT NULL
+                  AND extraction_error IS NULL
+                  AND language = %s
+                  AND word_count > 0
+                ORDER BY random()
+                LIMIT %s
+                """,
+                (language, limit),
+            )
+            fetched = cur.fetchall()
+    finally:
+        conn.close()
+
+    # Key each row by column name; "language" is the filter value, not a selected column
+    documents = []
+    for row in fetched:
+        record = dict(zip(columns, row))
+        record["language"] = language
+        documents.append(record)
+    return documents
+
+
+def get_language_counts(languages: list[str]) -> dict[str, int]:
+    """Count extracted documents with word_count > 0 for each requested language."""
+    conn = get_db_connection()
+    try:
+        with conn.cursor() as cur:
+            placeholders = ",".join("%s" for _ in languages)
+            cur.execute(
+                f"""
+                SELECT language, COUNT(*)::int as count
+                FROM documents
+                WHERE extracted_at IS NOT NULL
+                  AND extraction_error IS NULL
+                  AND language IN ({placeholders})
+                  AND word_count > 0
+                GROUP BY language
+                ORDER BY count DESC
+                """,
+                languages,
+            )
+            return {lang: total for lang, total in cur.fetchall()}
+    finally:
+        conn.close()
+
+
+def stratified_sample(
+    docs: list[dict], n: int, seed: int = 42
+) -> list[dict]:
+    """
+    Stratified sample by word count terciles and source domain diversity.
+
+    Splits documents into 3 word-count bins (short/medium/long) and draws
+    n // 3 from each via _diverse_sample; leftover slots go to the last bins.
+    """
+    rng = random.Random(seed)
+
+    if len(docs) <= n:
+        return docs
+
+    # Sort by word count and split into terciles
+    sorted_docs = sorted(docs, key=lambda d: d["word_count"])
+    third = len(sorted_docs) // 3
+    bins = [
+        sorted_docs[:third],           # short
+        sorted_docs[third : 2 * third], # medium
+        sorted_docs[2 * third :],       # long (also absorbs the len % 3 extra docs)
+    ]
+
+    # Equal share per bin; extras go to the last bins so no bin is asked for more than it holds
+    samples_per_bin = n // 3
+    remainder = n - (samples_per_bin * 3)
+
+    result = []
+    for i, bin_docs in enumerate(bins):
+        target = samples_per_bin + (1 if i >= len(bins) - remainder else 0)
+        result.extend(_diverse_sample(bin_docs, target, rng))
+
+    return result
+
+
+def _diverse_sample(
+    docs: list[dict], n: int, rng: random.Random
+) -> list[dict]:
+    """Pick up to n docs by cycling through source domains, one doc per domain per pass."""
+    if len(docs) <= n:
+        return docs
+
+    # Bucket documents by the network location of their source URL
+    buckets: dict[str, list[dict]] = {}
+    for item in docs:
+        try:
+            host = urlparse(item["source_url"]).netloc
+        except Exception:
+            host = "unknown"
+        if host not in buckets:
+            buckets[host] = []
+        buckets[host].append(item)
+
+    # Randomize the domain visiting order first, then each bucket in that order
+    # (the sequence of rng calls determines the outcome for a given seed)
+    order = list(buckets)
+    rng.shuffle(order)
+    for host in order:
+        rng.shuffle(buckets[host])
+
+    # Pass number `depth` takes the depth-th document of every domain that still
+    # has one, in visiting order, until n documents are collected
+    picked: list[dict] = []
+    depth = 0
+    while len(picked) < n:
+        layer = [
+            buckets[host][depth] for host in order if depth < len(buckets[host])
+        ]
+        if not layer:
+            break
+        picked.extend(layer)
+        depth += 1
+
+    return picked[:n]
+
+
+def main():
+    """CLI entry point: allocate the sample across languages, draw it, write JSONL."""
+    parser = argparse.ArgumentParser(description="Create stratified sample for LLM labeling")
+    parser.add_argument(
+        "--total", type=int, default=3500, help="Total documents to sample (default: 3500)"
+    )
+    parser.add_argument(
+        "--output", type=str, default="sampled_docs.jsonl", help="Output JSONL file"
+    )
+    parser.add_argument(
+        "--languages",
+        type=str,
+        default=",".join(DEFAULT_LANGUAGES),
+        help="Comma-separated language codes (default: en,ru,cs,pl,es)",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    args = parser.parse_args()
+    languages = [code.strip() for code in args.languages.split(",")]
+
+    print("=" * 60)
+    print("Stratified Document Sampling")
+    print("=" * 60)
+
+    # Get language counts
+    print(f"\nFetching counts for languages: {languages}")
+    counts = get_language_counts(languages)
+    total_docs = sum(counts.values())
+    if not counts:  # otherwise the allocation below divides by len(counts) == 0
+        print(f"ERROR: No extracted documents found for languages: {languages}")
+        sys.exit(1)
+
+    print(f"\nLanguage distribution:")
+    for lang, count in sorted(counts.items(), key=lambda x: -x[1]):
+        pct = 100 * count / total_docs
+        print(f"  {lang}: {count:,} ({pct:.1f}%)")
+    print(f"  Total: {total_docs:,}")
+
+    # Calculate per-language sample sizes (proportional, with a per-language floor)
+    min_per_lang = min(50, args.total // len(counts))
+    allocations = {}
+    for lang, count in counts.items():
+        proportion = count / total_docs
+        allocations[lang] = max(min_per_lang, round(args.total * proportion))
+
+    # Adjust to hit exact total
+    diff = args.total - sum(allocations.values())
+    if diff:
+        # Add/remove from largest language
+        largest = max(allocations, key=allocations.get)
+        allocations[largest] = max(1, allocations[largest] + diff)
+
+    print(f"\nSample allocation (total={args.total}):")
+    for lang in sorted(allocations, key=lambda code: -allocations[code]):
+        print(f"  {lang}: {allocations[lang]}")
+
+    # Sample from each language
+    all_samples = []
+    for lang, n_samples in allocations.items():
+        print(f"\nSampling {n_samples} from {lang}...")
+        # Fetch more than needed to allow stratification
+        docs = get_documents_for_language(lang, limit=min(n_samples * 10, 100000))
+        print(f"  Fetched {len(docs):,} candidates")
+
+        sampled = stratified_sample(docs, n_samples, seed=args.seed)
+        all_samples.extend(sampled)
+        print(f"  Selected {len(sampled)} documents")
+
+        # Show word count distribution of sample
+        word_counts = [d["word_count"] for d in sampled if d["word_count"]]
+        if word_counts:
+            print(
+                f"  Word count: min={min(word_counts):,}, "
+                f"median={sorted(word_counts)[len(word_counts)//2]:,}, "
+                f"max={max(word_counts):,}"
+            )
+
+    # Save
+    with open(args.output, "w") as f:
+        for doc in all_samples:
+            f.write(json.dumps(doc) + "\n")
+
+    print(f"\n{'=' * 60}")
+    print(f"Saved {len(all_samples)} documents to {args.output}")
+    print(f"{'=' * 60}")
+    print(f"\nNext step: python label.py --input {args.output} --output labeled_docs.jsonl")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/classification/taxonomy.json b/scripts/classification/taxonomy.json
new file mode 100644
index 0000000..27fc3fb
--- /dev/null
+++ b/scripts/classification/taxonomy.json
@@ -0,0 +1,114 @@
+{
+  "name": "docx-corpus-v2",
+  "version": "2.0.0",
+  "description": "Two-dimensional taxonomy for .docx document classification. Dimension 1 (document_type) describes the document's form/structure. Dimension 2 (topic) describes its subject domain. Each document gets one label per dimension.",
+  "document_types": [
+    {
+      "id": "legal",
+      "label": "Legal Documents",
+      "description": "Contracts, agreements, legal notices, terms of service, regulations, statutes, and other legally binding or law-related documents",
+      "examples": ["employment agreement", "NDA", "terms and conditions", "legal notice", "legislation", "statute", "regulation"]
+    },
+    {
+      "id": "forms",
+      "label": "Forms & Applications",
+      "description": "Fillable forms, applications, registration documents, surveys, questionnaires, and other documents designed for data collection",
+      "examples": ["application form", "registration form", "survey", "questionnaire", "ballot", "submission form"]
+    },
+    {
+      "id": "reports",
+      "label": "Reports & Analysis",
+      "description": "Research reports, analysis documents, studies, assessments, evaluations, annual reports, and other documents presenting findings or data",
+      "examples": ["annual report", "research paper", "case study", "analysis", "assessment", "evaluation", "white paper"]
+    },
+    {
+      "id": "policies",
+      "label": "Policies & Procedures",
+      "description": "Policy documents, procedures, guidelines, manuals, handbooks, and other documents establishing rules or processes",
+      "examples": ["privacy policy", "employee handbook", "procedure manual", "guidelines", "standard operating procedure"]
+    },
+    {
+      "id": "educational",
+      "label": "Educational Materials",
+      "description": "Curricula, syllabi, lesson plans, educational content, worksheets, study guides, academic papers, theses, and other teaching or learning materials",
+      "examples": ["syllabus", "lesson plan", "course outline", "study guide", "worksheet", "thesis", "dissertation"]
+    },
+    {
+      "id": "correspondence",
+      "label": "Correspondence",
+      "description": "Letters, memos, announcements, communications, press releases, newsletters, and other direct communications",
+      "examples": ["letter", "memo", "press release", "announcement", "notice", "circular", "newsletter"]
+    },
+    {
+      "id": "technical",
+      "label": "Technical Documentation",
+      "description": "Technical specifications, manuals, documentation, standards, API docs, user guides, and other technical reference materials",
+      "examples": ["technical manual", "API documentation", "specifications", "user guide", "technical standard", "datasheet"]
+    },
+    {
+      "id": "administrative",
+      "label": "Administrative Documents",
+      "description": "Meeting minutes, agendas, schedules, organizational documents, resolutions, records, and other administrative materials",
+      "examples": ["meeting minutes", "agenda", "organizational chart", "resolution", "schedule", "roster", "directory"]
+    },
+    {
+      "id": "creative",
+      "label": "Creative & Marketing",
+      "description": "Marketing materials, brochures, proposals, presentations, promotional content, and other persuasive or creative documents",
+      "examples": ["brochure", "proposal", "marketing plan", "presentation script", "pitch deck content", "catalog"]
+    },
+    {
+      "id": "reference",
+      "label": "Reference & Catalogs",
+      "description": "Directories, catalogs, glossaries, FAQs, indices, lists, and other reference or lookup documents",
+      "examples": ["product catalog", "directory", "glossary", "FAQ", "index", "bibliography", "inventory list"]
+    }
+  ],
+  "topics": [
+    {
+      "id": "government",
+      "label": "Government & Public Sector",
+      "description": "Government agencies, public administration, civic affairs, municipal matters, public policy, elections"
+    },
+    {
+      "id": "education",
+      "label": "Education & Academia",
+      "description": "Schools, universities, educational institutions, academic programs, student affairs, research"
+    },
+    {
+      "id": "healthcare",
+      "label": "Healthcare & Medicine",
+      "description": "Medical, health services, hospitals, clinics, public health, pharmaceuticals, biotech"
+    },
+    {
+      "id": "finance",
+      "label": "Finance & Business",
+      "description": "Banking, finance, corporate, business operations, commerce, investment, insurance, real estate"
+    },
+    {
+      "id": "legal_judicial",
+      "label": "Legal & Judicial",
+      "description": "Law firms, courts, legal services, judicial matters, compliance, law enforcement"
+    },
+    {
+      "id": "technology",
+      "label": "Technology & Engineering",
+      "description": "IT, software, tech companies, digital services, engineering, telecommunications, R&D"
+    },
+    {
+      "id": "environment",
+      "label": "Environment & Energy",
+      "description": "Environmental protection, sustainability, conservation, climate, energy, natural resources, agriculture"
+    },
+    {
+      "id": "nonprofit",
+      "label": "Nonprofit & NGO",
+      "description": "Charitable organizations, NGOs, community organizations, social services, international aid"
+    },
+    {
+      "id": "general",
+      "label": "General / Other",
+      "description": "Documents that don't fit clearly into other topic categories, or span multiple domains equally"
+    }
+  ]
+}
diff --git a/scripts/classification/train.py b/scripts/classification/train.py
new file mode 100644
index 0000000..821b423
--- /dev/null
+++ b/scripts/classification/train.py
@@ -0,0 +1,443 @@
+#!/usr/bin/env python3
+"""
+Phase 3: Train ModernBERT classifiers on LLM-labeled documents.
+
+Trains two independent classifiers:
+  1. Document type (10 classes)
+  2. Topic (9 classes)
+
+Uses HuggingFace Transformers with the answerdotai/ModernBERT-base model.
+Supports configurable train/val split, epochs, learning rate, etc.
+
+Usage:
+    python train.py --input labeled_docs.jsonl
+    python train.py --input labeled_docs.jsonl --epochs 5 --lr 2e-5
+    python train.py --input labeled_docs.jsonl --output-dir ./models
+"""
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+from datasets import Dataset
+from sklearn.metrics import (
+    accuracy_score,
+    classification_report,
+    f1_score,
+)
+from sklearn.model_selection import train_test_split
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    EarlyStoppingCallback,
+    Trainer,
+    TrainingArguments,
+)
+
+from common import fetch_documents_text_parallel, load_taxonomy
+
+DEFAULT_MODEL = "answerdotai/ModernBERT-base"
+DEFAULT_EPOCHS = 5
+DEFAULT_LR = 2e-5
+DEFAULT_BATCH_SIZE = 16
+DEFAULT_MAX_LENGTH = 512
+DEFAULT_VAL_SPLIT = 0.15
+DEFAULT_OUTPUT_DIR = "./models"
+
+
+def load_labeled_data(input_path: str, min_confidence: float = 0.0) -> list[dict]:
+    """Load labeled documents, optionally filtering by confidence.
+
+    Reads one JSON object per non-blank line of input_path and keeps those whose
+    "confidence" is >= min_confidence; a missing or null confidence counts as 0.
+    """
+    with open(input_path) as f:
+        entries = [json.loads(line) for line in f if line.strip()]
+    # `or 0`: a JSON null confidence would otherwise raise on `None >= float`.
+    return [e for e in entries if (e.get("confidence") or 0) >= min_confidence]
+
+
+def fetch_texts(docs: list[dict], max_chars: int = 2000) -> dict[str, str]:
+    """Collect text for every doc id, requesting ids 100 at a time."""
+    ids = [doc["id"] for doc in docs]
+    total = len(ids)
+    chunk = 100
+    collected = {}
+    for start in range(0, total, chunk):
+        end = start + chunk
+        collected.update(fetch_documents_text_parallel(ids[start:end], max_chars=max_chars))
+        # Progress is reported after every batch except the final one.
+        if end < total:
+            print(f"  Fetched text: {end}/{total}")
+    return collected
+
+
+def build_label_maps(taxonomy: dict) -> tuple[dict, dict, dict, dict]:
+    """Return (type2id, id2type, topic2id, id2topic) in taxonomy list order."""
+    maps = []
+    for section in ("document_types", "topics"):
+        # Index each entry's "id" by its position in the taxonomy list.
+        forward = {}
+        for index, entry in enumerate(taxonomy[section]):
+            forward[entry["id"]] = index
+        # The reverse map is derived from the forward map, not the raw list.
+        maps += [forward, {index: name for name, index in forward.items()}]
+    return tuple(maps)
+
+
+def prepare_dataset(
+    docs: list[dict],
+    texts: dict[str, str],
+    label_map: dict[str, int],
+    label_field: str,
+) -> tuple[list[str], list[int]]:
+    """Pair each usable doc's text with its integer class id.
+
+    A doc is usable when texts holds non-empty text for its "id" and its
+    label_field value is a key of label_map; the number of unusable docs is
+    printed when it is non-zero. Returns (texts, class ids) in docs order.
+    """
+    pairs = [
+        (texts.get(doc["id"], ""), doc.get(label_field, "")) for doc in docs
+    ]
+    # Keep only docs that have both text and a label known to label_map.
+    usable = [(body, name) for body, name in pairs if body and name in label_map]
+    dropped = len(pairs) - len(usable)
+    if dropped:
+        print(f"  Skipped {dropped} docs (missing text or invalid label)")
+    return [body for body, _ in usable], [label_map[name] for _, name in usable]
+
+
+def compute_metrics(eval_pred):
+    """Return accuracy and macro-F1 of the argmax predictions vs. the labels."""
+    scores, truth = eval_pred
+    predicted = np.argmax(scores, axis=-1)  # highest-scoring class per row
+    macro_f1 = f1_score(truth, predicted, average="macro")
+    accuracy = accuracy_score(truth, predicted)
+    return {"accuracy": accuracy, "f1_macro": macro_f1}
+
+
+def train_classifier(
+    train_texts: list[str],
+    train_labels: list[int],
+    val_texts: list[str],
+    val_labels: list[int],
+    num_labels: int,
+    id2label: dict[int, str],
+    label2id: dict[str, int],
+    output_dir: str,
+    model_name: str,
+    epochs: int,
+    lr: float,
+    batch_size: int,
+    max_length: int,
+    classifier_name: str,
+):
+    """Train a single classifier (type or topic).
+
+    Fine-tunes model_name on the train split, evaluates on the val split, saves
+    the model and tokenizer to <output_dir>/<classifier_name>/best, and writes
+    eval_report.txt beside it. Returns the dict from trainer.evaluate().
+    """
+    print(f"\n{'=' * 60}")
+    print(f"Training: {classifier_name}")
+    print(f"  Train: {len(train_texts)}, Val: {len(val_texts)}")
+    print(f"  Classes: {num_labels}")
+    print(f"  Model: {model_name}")
+    print(f"  Epochs: {epochs}, LR: {lr}, Batch: {batch_size}")
+    print(f"{'=' * 60}")
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_name,
+        num_labels=num_labels,
+        id2label=id2label,
+        label2id=label2id,
+    )
+
+    def tokenize(examples):
+        return tokenizer(
+            examples["text"], truncation=True, max_length=max_length, padding="max_length"
+        )
+
+    train_ds = Dataset.from_dict({"text": train_texts, "label": train_labels})
+    val_ds = Dataset.from_dict({"text": val_texts, "label": val_labels})
+    train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"])
+    val_ds = val_ds.map(tokenize, batched=True, remove_columns=["text"])
+    train_ds.set_format("torch")
+    val_ds.set_format("torch")
+
+    save_dir = os.path.join(output_dir, classifier_name)
+    training_args = TrainingArguments(
+        output_dir=save_dir,
+        num_train_epochs=epochs,
+        per_device_train_batch_size=batch_size,
+        per_device_eval_batch_size=batch_size * 2,
+        learning_rate=lr,
+        weight_decay=0.01,
+        warmup_ratio=0.1,
+        eval_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        metric_for_best_model="f1_macro",
+        greater_is_better=True,
+        save_total_limit=2,
+        logging_steps=50,
+        fp16=torch.cuda.is_available(),
+        report_to="none",
+        seed=42,
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_ds,
+        eval_dataset=val_ds,
+        compute_metrics=compute_metrics,
+        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
+    )
+
+    trainer.train()
+
+    # Evaluate on validation set
+    eval_results = trainer.evaluate()
+    print(f"\n  Val accuracy: {eval_results['eval_accuracy']:.4f}")
+    print(f"  Val F1 macro: {eval_results['eval_f1_macro']:.4f}")
+
+    # Save best model
+    best_dir = os.path.join(save_dir, "best")
+    trainer.save_model(best_dir)
+    tokenizer.save_pretrained(best_dir)
+
+    # Full classification report on val set. labels= pins it to every taxonomy class;
+    # otherwise sklearn raises if a class is in neither the val labels nor the predictions.
+    preds = trainer.predict(val_ds)
+    pred_labels = np.argmax(preds.predictions, axis=-1)
+    class_ids = list(range(num_labels))
+    report = classification_report(
+        val_labels, pred_labels, labels=class_ids, zero_division=0,
+        target_names=[id2label[i] for i in class_ids],
+    )
+    print(f"\nClassification Report ({classifier_name}):\n{report}")
+
+    # Save report
+    report_path = os.path.join(save_dir, "eval_report.txt")
+    with open(report_path, "w") as f:
+        f.write(f"Model: {model_name}\n")
+        f.write(f"Classifier: {classifier_name}\n")
+        f.write(f"Train size: {len(train_texts)}\n")
+        f.write(f"Val size: {len(val_texts)}\n")
+        f.write(f"Epochs: {epochs}\n")
+        f.write(f"Val accuracy: {eval_results['eval_accuracy']:.4f}\n")
+        f.write(f"Val F1 macro: {eval_results['eval_f1_macro']:.4f}\n\n")
+        f.write(report)
+
+    return eval_results
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Train ModernBERT classifiers on labeled documents"
+    )
+    parser.add_argument(
+        "--input", type=str, required=True, help="Labeled JSONL from label.py"
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=DEFAULT_OUTPUT_DIR,
+        help=f"Output directory for models (default: {DEFAULT_OUTPUT_DIR})",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default=DEFAULT_MODEL,
+        help=f"Base model (default: {DEFAULT_MODEL})",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=DEFAULT_EPOCHS,
+        help=f"Training epochs (default: {DEFAULT_EPOCHS})",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=DEFAULT_LR,
+        help=f"Learning rate (default: {DEFAULT_LR})",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=DEFAULT_BATCH_SIZE,
+        help=f"Batch size (default: {DEFAULT_BATCH_SIZE})",
+    )
+    parser.add_argument(
+        "--max-length",
+        type=int,
+        default=DEFAULT_MAX_LENGTH,
+        help=f"Max token length (default: {DEFAULT_MAX_LENGTH})",
+    )
+    parser.add_argument(
+        "--val-split",
+        type=float,
+        default=DEFAULT_VAL_SPLIT,
+        help=f"Validation split ratio (default: {DEFAULT_VAL_SPLIT})",
+    )
+    parser.add_argument(
+        "--min-confidence",
+        type=float,
+        default=0.0,
+        help="Minimum LLM confidence to include (default: 0.0)",
+    )
+    parser.add_argument(
+        "--max-chars",
+        type=int,
+        default=2000,
+        help="Max characters of document text (default: 2000)",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    args = parser.parse_args()
+
+    if not os.path.exists(args.input):
+        print(f"ERROR: Input file not found: {args.input}")
+        sys.exit(1)
+
+    # Load taxonomy and label maps
+    taxonomy = load_taxonomy()
+    type2id, id2type, topic2id, id2topic = build_label_maps(taxonomy)
+    print(f"Taxonomy: {taxonomy['name']} v{taxonomy['version']}")
+    print(f"  Document types: {len(type2id)} classes")
+    print(f"  Topics: {len(topic2id)} classes")
+
+    # Load labeled data
+    docs = load_labeled_data(args.input, min_confidence=args.min_confidence)
+    print(f"\nLoaded {len(docs)} labeled documents")
+    if args.min_confidence > 0:
+        print(f"  (filtered by confidence >= {args.min_confidence})")
+
+    # Fetch texts
+    print(f"\nFetching document texts (max {args.max_chars} chars)...")
+    texts = fetch_texts(docs, max_chars=args.max_chars)
+    print(f"  Got text for {sum(1 for t in texts.values() if t)}/{len(docs)} docs")
+
+    # Prepare datasets for both classifiers
+    print("\nPreparing document_type dataset...")
+    type_texts, type_labels = prepare_dataset(docs, texts, type2id, "document_type")
+    print(f"  {len(type_texts)} examples across {len(set(type_labels))} classes")
+
+    print("Preparing topic dataset...")
+    topic_texts, topic_labels = prepare_dataset(docs, texts, topic2id, "document_topic")
+    print(f"  {len(topic_texts)} examples across {len(set(topic_labels))} classes")
+
+    # Split each dataset into train/val separately (same seed and ratio, but each stratified on its own labels, so the val sets may differ)
+    print(f"\nSplitting: {1 - args.val_split:.0%} train / {args.val_split:.0%} val")
+
+    type_train_texts, type_val_texts, type_train_labels, type_val_labels = (
+        train_test_split(
+            type_texts,
+            type_labels,
+            test_size=args.val_split,
+            random_state=args.seed,
+            stratify=type_labels,
+        )
+    )
+
+    topic_train_texts, topic_val_texts, topic_train_labels, topic_val_labels = (
+        train_test_split(
+            topic_texts,
+            topic_labels,
+            test_size=args.val_split,
+            random_state=args.seed,
+            stratify=topic_labels,
+        )
+    )
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Train document_type classifier
+    type_results = train_classifier(
+        train_texts=type_train_texts,
+        train_labels=type_train_labels,
+        val_texts=type_val_texts,
+        val_labels=type_val_labels,
+        num_labels=len(type2id),
+        id2label=id2type,
+        label2id=type2id,
+        output_dir=args.output_dir,
+        model_name=args.model,
+        epochs=args.epochs,
+        lr=args.lr,
+        batch_size=args.batch_size,
+        max_length=args.max_length,
+        classifier_name="document_type",
+    )
+
+    # Train topic classifier
+    topic_results = train_classifier(
+        train_texts=topic_train_texts,
+        train_labels=topic_train_labels,
+        val_texts=topic_val_texts,
+        val_labels=topic_val_labels,
+        num_labels=len(topic2id),
+        id2label=id2topic,
+        label2id=topic2id,
+        output_dir=args.output_dir,
+        model_name=args.model,
+        epochs=args.epochs,
+        lr=args.lr,
+        batch_size=args.batch_size,
+        max_length=args.max_length,
+        classifier_name="topic",
+    )
+
+    # Summary
+    print(f"\n{'=' * 60}")
+    print("Training Complete")
+    print(f"{'=' * 60}")
+    print(f"  Document Type — Acc: {type_results['eval_accuracy']:.4f}, F1: {type_results['eval_f1_macro']:.4f}")
+    print(f"  Topic         — Acc: {topic_results['eval_accuracy']:.4f}, F1: {topic_results['eval_f1_macro']:.4f}")
+    print(f"\n  Models saved to: {args.output_dir}/")
+    print(f"    {args.output_dir}/document_type/best/")
+    print(f"    {args.output_dir}/topic/best/")
+
+    # Save training config
+    config = {
+        "base_model": args.model,
+        "taxonomy": taxonomy["name"],
+        "taxonomy_version": taxonomy["version"],
+        "input_file": args.input,
+        "total_docs": len(docs),
+        "min_confidence": args.min_confidence,
+        "max_chars": args.max_chars,
+        "max_length": args.max_length,
+        "epochs": args.epochs,
+        "learning_rate": args.lr,
+        "batch_size": args.batch_size,
+        "val_split": args.val_split,
+        "seed": args.seed,
+        "results": {
+            "document_type": {
+                "accuracy": type_results["eval_accuracy"],
+                "f1_macro": type_results["eval_f1_macro"],
+            },
+            "topic": {
+                "accuracy": topic_results["eval_accuracy"],
+                "f1_macro": topic_results["eval_f1_macro"],
+            },
+        },
+    }
+    config_path = os.path.join(args.output_dir, "training_config.json")
+    with open(config_path, "w") as f:
+        json.dump(config, f, indent=2)
+    print(f"  Config saved to: {config_path}")
+
+    print(f"\nNext step: python classify.py --models-dir {args.output_dir}")
+
+if __name__ == "__main__":
+    main()

From eb0aaf9ecf5d10e9ade85b25e830647c1cc49797 Mon Sep 17 00:00:00 2001
From: Caio Pizzol <caiopizzol@icloud.com>
Date: Mon, 9 Mar 2026 14:09:41 -0300
Subject: [PATCH 2/2] feat: add Modal cloud GPU support and clean up
 classification pipeline

- Merge train_modal.py into train.py (--modal flag)
- Merge classify_modal.py into classify.py (--modal --workers N)
- Switch base model to xlm-roberta-base (multilingual)
- Add class-weighted loss for imbalanced classes
- Add --exclude flag to sample.py for iterative sampling
- Gitignore models/ and *.jsonl artifacts
- Update docs for Modal setup and cost estimates
---
 .gitignore                         |   4 +
 scripts/classification/CLAUDE.md   |  15 +-
 scripts/classification/README.md   |  64 ++-
 scripts/classification/classify.py | 468 +++++++++++++---------
 scripts/classification/sample.py   |  17 +
 scripts/classification/train.py    | 603 +++++++++++++++++++----------
 6 files changed, 743 insertions(+), 428 deletions(-)

diff --git a/.gitignore b/.gitignore
index 7473c68..450ae9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,7 @@ coverage/
 
 # OS
 .DS_Store
+
+# Classification pipeline artifacts
+scripts/classification/models/
+scripts/classification/*.jsonl
diff --git a/scripts/classification/CLAUDE.md b/scripts/classification/CLAUDE.md
index 9d9a0bf..7b20bc0 100644
--- a/scripts/classification/CLAUDE.md
+++ b/scripts/classification/CLAUDE.md
@@ -8,8 +8,8 @@ Uses the FineWeb-Edu pattern: LLM labels a small sample → train lightweight cl
 
 1. **`sample.py`** — Stratified sampling from PostgreSQL. Samples proportionally across languages (en, ru, cs, pl, es), stratified by word count terciles and source domain diversity.
 2. **`label.py`** — Async LLM labeling with Claude. Supports resume (appends to JSONL). Rate-limited with configurable parallelism.
-3. **`train.py`** — Fine-tunes two independent ModernBERT classifiers (document_type and topic). Outputs models to `./models/`.
-4. **`classify.py`** — Batch inference on the full corpus. Fetches text from R2, runs both models, writes results to PostgreSQL.
+3. **`train.py`** — Fine-tunes two independent xlm-roberta-base classifiers (document_type and topic). Supports `--modal` for cloud GPU training. Outputs models to `./models/`.
+4. **`classify.py`** — Batch inference on the full corpus. Supports `--modal` for parallel cloud workers (20x speedup). Fetches text from R2, runs both models, writes results to PostgreSQL.
 5. **`evaluate.py`** — Quality metrics. Two modes: `labels` (analyzes JSONL) and `corpus` (queries DB).
 
 ## Key files
@@ -24,10 +24,18 @@ Writes to the same `documents` table as the TS pipeline:
 - `document_type` — one of 10 types (legal, forms, reports, etc.)
 - `document_topic` — one of 9 topics (government, education, healthcare, etc.)
 - `classification_confidence` — min(type_confidence, topic_confidence)
-- `classification_model` — e.g. "claude-haiku-4-5" or "modernbert-v2.0.0"
+- `classification_model` — e.g. "claude-haiku-4-5" or "modernbert-2.0.0"
 
 Connection via `DATABASE_URL` env var loaded from `../../.env`.
 
+## Modal (cloud GPU)
+
+Both `train.py` and `classify.py` support a `--modal` flag for cloud execution:
+- Training uses a single GPU (T4 default, configurable with `--gpu`)
+- Classification fans out across `--workers` parallel containers for ~160 docs/s aggregate
+- Models are persisted in a Modal Volume (`classifier-models`)
+- DB credentials are stored in a Modal Secret (`docx-db`)
+
 ## Conventions
 
 - Python 3.11+, no type stubs needed
@@ -36,3 +44,4 @@ Connection via `DATABASE_URL` env var loaded from `../../.env`.
 - Text is fetched via HTTP from the public R2 endpoint, not direct R2 access
 - All scripts support `--help` for usage
 - JSONL files are the interchange format between steps
+- Data files (*.jsonl, models/) are gitignored — store locally in `~/data/docx-corpus/classification/`
diff --git a/scripts/classification/README.md b/scripts/classification/README.md
index bca3f5b..5ef6db5 100644
--- a/scripts/classification/README.md
+++ b/scripts/classification/README.md
@@ -1,7 +1,7 @@
 # Document Classification Pipeline
 
-Classifies ~800K .docx documents using the FineWeb-Edu / TnT-LLM pattern:
-LLM labels a small sample → train ModernBERT classifier → apply at scale.
+Classifies ~800K .docx documents using the [FineWeb-Edu / TnT-LLM](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1) pattern:
+LLM labels a small sample → train classifier → apply at scale.
 
 ## Two-Dimensional Taxonomy
 
@@ -10,6 +10,8 @@ Each document gets classified on two independent dimensions:
 - **Document Type** (10 classes): legal, forms, reports, policies, educational, correspondence, technical, administrative, creative, reference
 - **Topic** (9 classes): government, education, healthcare, finance, legal_judicial, technology, environment, nonprofit, general
 
+See [`taxonomy.json`](taxonomy.json) for full definitions and examples.
+
 ## Pipeline Steps
 
 ### 1. Sample (`sample.py`)
@@ -29,37 +31,37 @@ python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl
 python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl --model claude-haiku-4-5 --parallel 5
 ```
 
-### 3. Evaluate Labels (`evaluate.py labels`)
-
-Check label quality before training.
-
-```bash
-python evaluate.py labels --input labeled_docs.jsonl
-```
-
-### 4. Train (`train.py`)
+### 3. Train (`train.py`)
 
-Fine-tune ModernBERT on labeled data. Trains two independent classifiers.
+Fine-tune xlm-roberta-base on labeled data. Trains two independent classifiers with class-weighted loss.
 
 ```bash
+# Local training (CPU/MPS/CUDA)
 python train.py --input labeled_docs.jsonl
-python train.py --input labeled_docs.jsonl --epochs 5 --lr 2e-5 --output-dir ./models
+
+# Cloud training on Modal GPU
+python train.py --input labeled_docs.jsonl --modal
+python train.py --input labeled_docs.jsonl --modal --gpu a10g
 ```
 
-### 5. Classify (`classify.py`)
+### 4. Classify (`classify.py`)
 
-Apply trained models to the full corpus.
+Apply trained models to the full corpus. Supports parallel cloud workers via Modal.
 
 ```bash
+# Local inference
 python classify.py --models-dir ./models
-python classify.py --models-dir ./models --batch-size 256 --dry-run --limit 100
+
+# Cloud inference with 20 parallel GPU workers
+python classify.py --models-dir ./models --modal --workers 20
 ```
 
-### 6. Evaluate Corpus (`evaluate.py corpus`)
+### 5. Evaluate (`evaluate.py`)
 
-Check full corpus classification distribution.
+Check label quality or corpus classification distribution.
 
 ```bash
+python evaluate.py labels --input labeled_docs.jsonl
 python evaluate.py corpus
 python evaluate.py corpus --languages en,ru
 ```
@@ -67,15 +69,35 @@ python evaluate.py corpus --languages en,ru
 ## Setup
 
 ```bash
-pip install -r requirements.txt
+pip install -e .
 ```
 
 Required environment variables (`.env` in project root):
 - `DATABASE_URL` — PostgreSQL connection string
 - `ANTHROPIC_API_KEY` — For LLM labeling step only
 
+### Modal Setup (optional, for cloud training/inference)
+
+```bash
+pip install modal
+python -m modal setup
+modal secret create docx-db DATABASE_URL="postgres://..."
+```
+
+## Key Files
+
+| File | Purpose |
+|------|---------|
+| `taxonomy.json` | Two-dimensional taxonomy definition (source of truth) |
+| `common.py` | Shared utilities: DB, text fetching, taxonomy loading |
+| `sample.py` | Stratified document sampling from PostgreSQL |
+| `label.py` | Async LLM labeling with Claude |
+| `train.py` | Fine-tune classifiers (local or Modal) |
+| `classify.py` | Batch inference on full corpus (local or Modal) |
+| `evaluate.py` | Quality metrics and distribution analysis |
+
 ## Cost Estimate
 
 - **Labeling**: ~3,500 docs × Claude Haiku ≈ $2-5
-- **Training**: ~30 min on GPU (or ~2h on CPU)
-- **Inference**: ~800K docs, ~200-500 docs/sec on GPU
+- **Training**: ~30 min on T4 GPU (~$0.30 on Modal, free tier covers it)
+- **Inference**: ~800K docs with 20 Modal workers ≈ 75 min (~$12 or within free tier)
diff --git a/scripts/classification/classify.py b/scripts/classification/classify.py
index 411e743..067826a 100644
--- a/scripts/classification/classify.py
+++ b/scripts/classification/classify.py
@@ -1,20 +1,24 @@
 #!/usr/bin/env python3
 """
-Phase 4: Apply trained classifiers to the full corpus.
+Apply trained classifiers to the full corpus.
 
-Loads the trained ModernBERT models and classifies all unclassified documents.
+Loads the trained models and classifies all unclassified documents.
 Fetches text from R2, runs inference, updates the database.
-
 Supports resume — already-classified documents are skipped.
 
 Usage:
+    # Local classification
     python classify.py --models-dir ./models
-    python classify.py --models-dir ./models --batch-size 256 --languages en,ru,cs,pl,es
-    python classify.py --models-dir ./models --dry-run --limit 100
+    python classify.py --models-dir ./models --batch-size 256 --languages en,ru
+
+    # Cloud classification on Modal (parallel GPU workers)
+    python classify.py --models-dir ./models --modal
+    python classify.py --models-dir ./models --modal --workers 20 --gpu a10g
 """
 
 import argparse
 import json
+import math
 import os
 import sys
 import time
@@ -37,6 +41,11 @@
 DEFAULT_MAX_CHARS = 2000
 
 
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+
 def load_classifier(model_dir: str, device: torch.device):
     """Load a trained classifier and tokenizer."""
     tokenizer = AutoTokenizer.from_pretrained(model_dir)
@@ -63,28 +72,19 @@ def get_unclassified_documents(
                   AND classification_model IS NULL
             """
             params: list = []
-
             if languages:
                 placeholders = ",".join(["%s"] * len(languages))
                 query += f" AND language IN ({placeholders})"
                 params.extend(languages)
-
             query += " ORDER BY random()"
-
             if limit:
                 query += " LIMIT %s"
                 params.append(limit)
-
             cur.execute(query, params)
             return [
-                {
-                    "id": row[0],
-                    "source_url": row[1],
-                    "original_filename": row[2],
-                    "word_count": row[3],
-                    "language": row[4],
-                }
-                for row in cur.fetchall()
+                {"id": r[0], "source_url": r[1], "original_filename": r[2],
+                 "word_count": r[3], "language": r[4]}
+                for r in cur.fetchall()
             ]
     finally:
         conn.close()
@@ -104,10 +104,8 @@ def get_classification_stats() -> dict:
             """)
             row = cur.fetchone()
             return {
-                "total": row[0],
-                "classified": row[1],
-                "classifiable": row[2],
-                "remaining": row[2] - row[1],
+                "total": row[0], "classified": row[1],
+                "classifiable": row[2], "remaining": row[2] - row[1],
             }
     finally:
         conn.close()
@@ -115,50 +113,29 @@ def get_classification_stats() -> dict:
 
 @torch.no_grad()
 def classify_batch(
-    texts: list[str],
-    tokenizer,
-    model,
-    max_length: int,
-    device: torch.device,
+    texts: list[str], tokenizer, model, max_length: int, device: torch.device,
 ) -> list[tuple[str, float]]:
     """Classify a batch of texts. Returns list of (label, confidence)."""
     inputs = tokenizer(
-        texts,
-        truncation=True,
-        max_length=max_length,
-        padding=True,
-        return_tensors="pt",
+        texts, truncation=True, max_length=max_length,
+        padding=True, return_tensors="pt",
     ).to(device)
-
     outputs = model(**inputs)
     probs = torch.softmax(outputs.logits, dim=-1)
     confidences, pred_ids = torch.max(probs, dim=-1)
-
-    results = []
-    for pred_id, conf in zip(pred_ids.cpu().numpy(), confidences.cpu().numpy()):
-        label = model.config.id2label[int(pred_id)]
-        results.append((label, float(conf)))
-
-    return results
+    return [
+        (model.config.id2label[int(pid)], float(conf))
+        for pid, conf in zip(pred_ids.cpu().numpy(), confidences.cpu().numpy())
+    ]
 
 
 def process_batch(
-    docs: list[dict],
-    type_tokenizer,
-    type_model,
-    topic_tokenizer,
-    topic_model,
-    max_length: int,
-    max_chars: int,
-    device: torch.device,
-    model_name: str,
+    docs, type_tokenizer, type_model, topic_tokenizer, topic_model,
+    max_length, max_chars, device, model_name,
 ) -> list[dict]:
     """Process a batch: fetch texts, classify, return label dicts."""
-    # Fetch texts
     doc_ids = [d["id"] for d in docs]
     texts = fetch_documents_text_parallel(doc_ids, max_chars=max_chars)
-
-    # Filter docs with text
     valid_docs = []
     valid_texts = []
     for doc in docs:
@@ -166,99 +143,33 @@ def process_batch(
         if text:
             valid_docs.append(doc)
             valid_texts.append(text)
-
     if not valid_texts:
         return []
+    type_results = classify_batch(valid_texts, type_tokenizer, type_model, max_length, device)
+    topic_results = classify_batch(valid_texts, topic_tokenizer, topic_model, max_length, device)
+    return [
+        {
+            "id": doc["id"], "document_type": dt, "document_topic": tp,
+            "confidence": min(tc, tpc), "model": model_name,
+        }
+        for doc, (dt, tc), (tp, tpc) in zip(valid_docs, type_results, topic_results)
+    ]
 
-    # Classify with both models
-    type_results = classify_batch(
-        valid_texts, type_tokenizer, type_model, max_length, device
-    )
-    topic_results = classify_batch(
-        valid_texts, topic_tokenizer, topic_model, max_length, device
-    )
 
-    # Build label dicts for DB update
-    labels = []
-    for doc, (doc_type, type_conf), (topic, topic_conf) in zip(
-        valid_docs, type_results, topic_results
-    ):
-        labels.append(
-            {
-                "id": doc["id"],
-                "document_type": doc_type,
-                "document_topic": topic,
-                "confidence": min(type_conf, topic_conf),
-                "model": model_name,
-            }
-        )
-
-    return labels
+# ---------------------------------------------------------------------------
+# Local classification
+# ---------------------------------------------------------------------------
 
 
-def main():
-    parser = argparse.ArgumentParser(
-        description="Classify full corpus with trained ModernBERT models"
-    )
-    parser.add_argument(
-        "--models-dir",
-        type=str,
-        required=True,
-        help="Directory containing trained models (from train.py)",
-    )
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=DEFAULT_BATCH_SIZE,
-        help=f"Inference batch size (default: {DEFAULT_BATCH_SIZE})",
-    )
-    parser.add_argument(
-        "--max-length",
-        type=int,
-        default=DEFAULT_MAX_LENGTH,
-        help=f"Max token length (default: {DEFAULT_MAX_LENGTH})",
-    )
-    parser.add_argument(
-        "--max-chars",
-        type=int,
-        default=DEFAULT_MAX_CHARS,
-        help=f"Max text characters to fetch (default: {DEFAULT_MAX_CHARS})",
-    )
-    parser.add_argument(
-        "--languages",
-        type=str,
-        default=None,
-        help="Comma-separated language codes to classify (default: all)",
-    )
-    parser.add_argument(
-        "--limit",
-        type=int,
-        default=None,
-        help="Max documents to classify (default: all)",
-    )
-    parser.add_argument(
-        "--db-batch-size",
-        type=int,
-        default=500,
-        help="DB update batch size (default: 500)",
-    )
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Classify but don't write to DB",
-    )
-    args = parser.parse_args()
-
-    # Validate model directories
+def run_local(args):
+    """Classify documents locally using available device."""
     type_model_dir = os.path.join(args.models_dir, "document_type", "best")
     topic_model_dir = os.path.join(args.models_dir, "topic", "best")
-
     for d in [type_model_dir, topic_model_dir]:
         if not os.path.exists(d):
             print(f"ERROR: Model directory not found: {d}")
             sys.exit(1)
 
-    # Load training config for model name
     config_path = os.path.join(args.models_dir, "training_config.json")
     if os.path.exists(config_path):
         with open(config_path) as f:
@@ -267,7 +178,6 @@ def main():
     else:
         model_name = "modernbert-v2"
 
-    # Device
     if torch.cuda.is_available():
         device = torch.device("cuda")
     elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
@@ -276,108 +186,282 @@ def main():
         device = torch.device("cpu")
     print(f"Device: {device}")
 
-    # Load models
     print(f"\nLoading document_type model from {type_model_dir}...")
     type_tokenizer, type_model = load_classifier(type_model_dir, device)
     print(f"Loading topic model from {topic_model_dir}...")
     topic_tokenizer, topic_model = load_classifier(topic_model_dir, device)
 
-    # Stats
     stats = get_classification_stats()
-    print(f"\nCorpus stats:")
-    print(f"  Total documents: {stats['total']:,}")
-    print(f"  Classifiable: {stats['classifiable']:,}")
-    print(f"  Already classified: {stats['classified']:,}")
-    print(f"  Remaining: {stats['remaining']:,}")
-
-    # Get unclassified docs
-    languages = (
-        [l.strip() for l in args.languages.split(",")]
-        if args.languages
-        else None
-    )
-    print(f"\nFetching unclassified documents...")
-    docs = get_unclassified_documents(languages=languages, limit=args.limit)
-    print(f"  Found {len(docs):,} documents to classify")
+    print(f"\nCorpus: {stats['total']:,} total, {stats['classifiable']:,} classifiable, "
+          f"{stats['classified']:,} done, {stats['remaining']:,} remaining")
 
+    languages = [l.strip() for l in args.languages.split(",")] if args.languages else None
+    docs = get_unclassified_documents(languages=languages, limit=args.limit)
+    print(f"Found {len(docs):,} documents to classify")
     if not docs:
         print("Nothing to classify!")
         return
-
     if args.dry_run:
         print("  (DRY RUN — will not write to database)")
 
-    # Process in batches
     total_classified = 0
     total_errors = 0
     start_time = time.time()
-
-    # Use smaller fetch batches for text retrieval
     fetch_batch_size = min(args.batch_size, 100)
-
     pbar = tqdm(total=len(docs), desc="Classifying", unit="doc")
 
     for i in range(0, len(docs), fetch_batch_size):
         batch_docs = docs[i : i + fetch_batch_size]
-
         labels = process_batch(
-            docs=batch_docs,
-            type_tokenizer=type_tokenizer,
-            type_model=type_model,
-            topic_tokenizer=topic_tokenizer,
-            topic_model=topic_model,
-            max_length=args.max_length,
-            max_chars=args.max_chars,
-            device=device,
-            model_name=model_name,
+            batch_docs, type_tokenizer, type_model, topic_tokenizer, topic_model,
+            args.max_length, args.max_chars, device, model_name,
         )
-
         if labels and not args.dry_run:
             save_labels_to_db(labels, batch_size=args.db_batch_size)
-
         total_classified += len(labels)
         total_errors += len(batch_docs) - len(labels)
         pbar.update(len(batch_docs))
-
-        # Show throughput
         elapsed = time.time() - start_time
         rate = total_classified / elapsed if elapsed > 0 else 0
         pbar.set_postfix_str(f"{rate:.0f} docs/s, {total_errors} errors")
-
     pbar.close()
 
     elapsed = time.time() - start_time
     rate = total_classified / elapsed if elapsed > 0 else 0
-
     print(f"\n{'=' * 60}")
     print("Classification Complete")
     print(f"{'=' * 60}")
-    print(f"  Classified: {total_classified:,}")
-    print(f"  Errors (no text): {total_errors:,}")
+    print(f"  Classified: {total_classified:,}, Errors: {total_errors:,}")
     print(f"  Time: {elapsed:.1f}s ({rate:.0f} docs/s)")
-    print(f"  Model: {model_name}")
-
     if not args.dry_run:
-        final_stats = get_classification_stats()
-        print(f"\n  Total classified in DB: {final_stats['classified']:,}")
-        print(f"  Remaining: {final_stats['remaining']:,}")
+        final = get_classification_stats()
+        print(f"  DB classified: {final['classified']:,}, remaining: {final['remaining']:,}")
+    if args.dry_run:
+        print("  (DRY RUN — no changes written)")
+
+
+# ---------------------------------------------------------------------------
+# Modal cloud classification (parallel workers)
+# ---------------------------------------------------------------------------
 
+
+def run_modal(args):
+    """Classify on Modal with parallel GPU workers."""
+    import modal
+
+    app = modal.App("docx-classifier-inference")
+    inference_image = (
+        modal.Image.debian_slim(python_version="3.11")
+        .pip_install("torch", "transformers", "numpy", "psycopg2-binary")
+    )
+    model_volume = modal.Volume.from_name("classifier-models")
+    db_secret = modal.Secret.from_name("docx-db")
+
+    gpu_map = {"t4": "T4", "a10g": "a10g", "l4": "l4", "a100": "a100"}
+    gpu = gpu_map.get(args.gpu.lower(), args.gpu)
+
+    @app.function(image=inference_image, timeout=300, secrets=[db_secret])
+    def fetch_unclassified_ids(languages: list[str] | None, limit: int | None) -> list[str]:
+        import os
+        import psycopg2
+        conn = psycopg2.connect(os.environ["DATABASE_URL"])
+        try:
+            with conn.cursor() as cur:
+                query = """
+                    SELECT id FROM documents
+                    WHERE extracted_at IS NOT NULL AND extraction_error IS NULL
+                      AND word_count > 0 AND classification_model IS NULL
+                """
+                params = []
+                if languages:
+                    query += f" AND language IN ({','.join(['%s'] * len(languages))})"
+                    params.extend(languages)
+                query += " ORDER BY random()"
+                if limit:
+                    query += " LIMIT %s"
+                    params.append(limit)
+                cur.execute(query, params)
+                return [r[0] for r in cur.fetchall()]
+        finally:
+            conn.close()
+
+    @app.function(
+        image=inference_image, gpu=gpu, timeout=7200,
+        volumes={"/models": model_volume}, secrets=[db_secret],
+    )
+    def classify_chunk(
+        doc_ids: list[str], worker_id: int, total_workers: int,
+        max_length: int, max_chars: int, dry_run: bool,
+    ) -> dict:
+        import os
+        import time
+        import urllib.request
+        from concurrent.futures import ThreadPoolExecutor
+        import psycopg2
+        import torch
+        from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+        TEXT_BASE_URL = "https://docxcorp.us/extracted"
+
+        def fetch_text(did, mc=2000):
+            try:
+                req = urllib.request.Request(f"{TEXT_BASE_URL}/{did}.txt", headers={"User-Agent": "docx-classifier/2.0"})
+                with urllib.request.urlopen(req, timeout=15) as r:
+                    return r.read().decode("utf-8")[:mc]
+            except Exception:
+                return ""
+
+        def fetch_parallel(ids, mc):
+            res = {}
+            with ThreadPoolExecutor(max_workers=50) as ex:
+                for did, txt in ex.map(lambda d: (d, fetch_text(d, mc)), ids):
+                    res[did] = txt
+            return res
+
+        def save_batch(labels):
+            conn = psycopg2.connect(os.environ["DATABASE_URL"])
+            try:
+                with conn.cursor() as cur:
+                    for l in labels:
+                        cur.execute("""
+                            UPDATE documents SET document_type=%s, document_topic=%s,
+                                classification_confidence=%s, classification_model=%s
+                            WHERE id=%s
+                        """, (l["document_type"], l["document_topic"], l["confidence"], l["model"], l["id"]))
+                conn.commit()
+            finally:
+                conn.close()
+
+        model_volume.reload()
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"
+        print(f"[Worker {worker_id}/{total_workers}] {len(doc_ids):,} docs, device={gpu_name}")
+
+        cfg_path = "/models/training_config.json"
+        model_name = "modernbert-v2"
+        if os.path.exists(cfg_path):
+            with open(cfg_path) as f:
+                model_name = f"modernbert-{json.load(f).get('taxonomy_version', 'v2')}"
+
+        type_tok = AutoTokenizer.from_pretrained("/models/document_type/best")
+        type_mdl = AutoModelForSequenceClassification.from_pretrained("/models/document_type/best").to(device).eval()
+        topic_tok = AutoTokenizer.from_pretrained("/models/topic/best")
+        topic_mdl = AutoModelForSequenceClassification.from_pretrained("/models/topic/best").to(device).eval()
+
+        @torch.no_grad()
+        def infer(texts, tok, mdl):
+            inp = tok(texts, truncation=True, max_length=max_length, padding=True, return_tensors="pt").to(device)
+            probs = torch.softmax(mdl(**inp).logits, dim=-1)
+            confs, preds = torch.max(probs, dim=-1)
+            return [(mdl.config.id2label[int(p)], float(c)) for p, c in zip(preds.cpu().numpy(), confs.cpu().numpy())]
+
+        total_classified = 0
+        total_errors = 0
+        start = time.time()
+
+        for i in range(0, len(doc_ids), 100):
+            batch = doc_ids[i:i+100]
+            texts = fetch_parallel(batch, max_chars)
+            valid = [(did, texts[did]) for did in batch if texts.get(did)]
+            if not valid:
+                total_errors += len(batch)
+                continue
+            vids, vtxts = zip(*valid)
+            vtxts = list(vtxts)
+            tr = infer(vtxts, type_tok, type_mdl)
+            tpr = infer(vtxts, topic_tok, topic_mdl)
+            labels = [
+                {"id": did, "document_type": dt, "document_topic": tp,
+                 "confidence": min(tc, tpc), "model": model_name}
+                for did, (dt, tc), (tp, tpc) in zip(vids, tr, tpr)
+            ]
+            if labels and not dry_run:
+                save_batch(labels)
+            total_classified += len(labels)
+            total_errors += len(batch) - len(labels)
+            if (i // 100) % 10 == 0:
+                elapsed = time.time() - start
+                rate = total_classified / elapsed if elapsed > 0 else 0
+                pct = (i + len(batch)) / len(doc_ids) * 100
+                print(f"  [Worker {worker_id}] [{pct:5.1f}%] {total_classified:,} done, {rate:.0f} docs/s")
+
+        elapsed = time.time() - start
+        rate = total_classified / elapsed if elapsed > 0 else 0
+        print(f"  [Worker {worker_id}] DONE — {total_classified:,} in {elapsed:.0f}s ({rate:.0f} docs/s)")
+        return {"worker_id": worker_id, "classified": total_classified, "errors": total_errors,
+                "elapsed_seconds": round(elapsed, 1), "docs_per_second": round(rate, 1)}
+
+    languages = [l.strip() for l in args.languages.split(",")] if args.languages else None
+    n_workers = args.workers
+
+    print(f"Modal parallel classification ({gpu} GPU, {n_workers} workers)")
     if args.dry_run:
-        print("\n  (DRY RUN — no changes written to database)")
-
-    # Print distribution of this batch
-    if total_classified > 0 and labels:
-        print(f"\nSample distribution (last batch):")
-        type_counts: dict[str, int] = {}
-        topic_counts: dict[str, int] = {}
-        for label in labels:
-            dt = label["document_type"]
-            tp = label["document_topic"]
-            type_counts[dt] = type_counts.get(dt, 0) + 1
-            topic_counts[tp] = topic_counts.get(tp, 0) + 1
-
-        print("  Types:", dict(sorted(type_counts.items(), key=lambda x: -x[1])))
-        print("  Topics:", dict(sorted(topic_counts.items(), key=lambda x: -x[1])))
+        print("  DRY RUN mode")
+    print()
+
+    with app.run():
+        print("Fetching unclassified document IDs...")
+        all_ids = fetch_unclassified_ids.remote(languages=languages, limit=args.limit)
+        print(f"  Found {len(all_ids):,} documents to classify")
+        if not all_ids:
+            print("Nothing to classify!")
+            return
+
+        n_workers = min(n_workers, len(all_ids))
+        chunk_size = math.ceil(len(all_ids) / n_workers)
+        chunks = [all_ids[i:i+chunk_size] for i in range(0, len(all_ids), chunk_size)]
+        print(f"  Split into {len(chunks)} chunks of ~{chunk_size:,} docs")
+        print(f"  Estimated: ~{len(all_ids) / (n_workers * 8) / 60:.0f} minutes\n")
+
+        results = list(classify_chunk.map(
+            chunks,
+            [i for i in range(len(chunks))],
+            [len(chunks)] * len(chunks),
+            [args.max_length] * len(chunks),
+            [args.max_chars] * len(chunks),
+            [args.dry_run] * len(chunks),
+        ))
+
+        total_classified = sum(r["classified"] for r in results)
+        total_errors = sum(r["errors"] for r in results)
+        max_elapsed = max(r["elapsed_seconds"] for r in results)
+        agg_rate = total_classified / max_elapsed if max_elapsed > 0 else 0
+
+        print(f"\n{'=' * 60}")
+        print("Classification Complete")
+        print(f"{'=' * 60}")
+        print(f"  Workers: {len(results)}")
+        print(f"  Classified: {total_classified:,}, Errors: {total_errors:,}")
+        print(f"  Wall time: {max_elapsed:.0f}s ({max_elapsed/60:.1f} min)")
+        print(f"  Aggregate: {agg_rate:.0f} docs/s")
+        if args.dry_run:
+            print("  (DRY RUN — no changes written)")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Classify corpus with trained models")
+    parser.add_argument("--models-dir", type=str, required=True)
+    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
+    parser.add_argument("--max-length", type=int, default=DEFAULT_MAX_LENGTH)
+    parser.add_argument("--max-chars", type=int, default=DEFAULT_MAX_CHARS)
+    parser.add_argument("--languages", type=str, default=None)
+    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument("--db-batch-size", type=int, default=500)
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--modal", action="store_true", help="Run on Modal cloud GPUs")
+    parser.add_argument("--workers", type=int, default=20, help="Modal parallel workers (default: 20)")
+    parser.add_argument("--gpu", type=str, default="T4", help="Modal GPU type: T4, a10g, l4, a100")
+    args = parser.parse_args()
+
+    if args.modal:
+        run_modal(args)
+    else:
+        run_local(args)
 
 
 if __name__ == "__main__":
diff --git a/scripts/classification/sample.py b/scripts/classification/sample.py
index 2b00ea6..b0cb5e5 100644
--- a/scripts/classification/sample.py
+++ b/scripts/classification/sample.py
@@ -175,11 +175,26 @@ def main():
         default=",".join(DEFAULT_LANGUAGES),
         help="Comma-separated language codes (default: en,ru,cs,pl,es)",
     )
+    parser.add_argument(
+        "--exclude",
+        type=str,
+        default=None,
+        help="JSONL file of docs to exclude (already sampled)",
+    )
     parser.add_argument("--seed", type=int, default=42, help="Random seed")
     args = parser.parse_args()
 
     languages = [l.strip() for l in args.languages.split(",")]
 
+    # Load exclusion set
+    exclude_ids: set[str] = set()
+    if args.exclude:
+        with open(args.exclude) as f:
+            for line in f:
+                if line.strip():
+                    exclude_ids.add(json.loads(line)["id"])
+        print(f"Excluding {len(exclude_ids)} already-sampled documents")
+
     print("=" * 60)
     print("Stratified Document Sampling")
     print("=" * 60)
@@ -220,6 +235,8 @@ def main():
         print(f"\nSampling {n_samples} from {lang}...")
         # Fetch more than needed to allow stratification
         docs = get_documents_for_language(lang, limit=min(n_samples * 10, 100000))
+        if exclude_ids:
+            docs = [d for d in docs if d["id"] not in exclude_ids]
         print(f"  Fetched {len(docs):,} candidates")
 
         sampled = stratified_sample(docs, n_samples, seed=args.seed)
diff --git a/scripts/classification/train.py b/scripts/classification/train.py
index 821b423..75a1533 100644
--- a/scripts/classification/train.py
+++ b/scripts/classification/train.py
@@ -1,18 +1,22 @@
 #!/usr/bin/env python3
 """
-Phase 3: Train ModernBERT classifiers on LLM-labeled documents.
+Train classifiers on LLM-labeled documents.
 
 Trains two independent classifiers:
   1. Document type (10 classes)
   2. Topic (9 classes)
 
-Uses HuggingFace Transformers with the answerdotai/ModernBERT-base model.
-Supports configurable train/val split, epochs, learning rate, etc.
+Uses HuggingFace Transformers with xlm-roberta-base (multilingual).
+Supports class-weighted loss, configurable train/val split, epochs, etc.
 
 Usage:
+    # Local training
     python train.py --input labeled_docs.jsonl
     python train.py --input labeled_docs.jsonl --epochs 5 --lr 2e-5
-    python train.py --input labeled_docs.jsonl --output-dir ./models
+
+    # Cloud training on Modal (GPU)
+    python train.py --input labeled_docs.jsonl --modal
+    python train.py --input labeled_docs.jsonl --modal --gpu a10g
 """
 
 import argparse
@@ -24,11 +28,7 @@
 import numpy as np
 import torch
 from datasets import Dataset
-from sklearn.metrics import (
-    accuracy_score,
-    classification_report,
-    f1_score,
-)
+from sklearn.metrics import accuracy_score, classification_report, f1_score
 from sklearn.model_selection import train_test_split
 from transformers import (
     AutoModelForSequenceClassification,
@@ -40,7 +40,7 @@
 
 from common import fetch_documents_text_parallel, load_taxonomy
 
-DEFAULT_MODEL = "answerdotai/ModernBERT-base"
+DEFAULT_MODEL = "xlm-roberta-base"
 DEFAULT_EPOCHS = 5
 DEFAULT_LR = 2e-5
 DEFAULT_BATCH_SIZE = 16
@@ -49,6 +49,11 @@
 DEFAULT_OUTPUT_DIR = "./models"
 
 
+# ---------------------------------------------------------------------------
+# Shared helpers (used by both local and Modal paths)
+# ---------------------------------------------------------------------------
+
+
 def load_labeled_data(input_path: str, min_confidence: float = 0.0) -> list[dict]:
     """Load labeled documents, optionally filtering by confidence."""
     docs = []
@@ -61,31 +66,14 @@ def load_labeled_data(input_path: str, min_confidence: float = 0.0) -> list[dict
     return docs
 
 
-def fetch_texts(docs: list[dict], max_chars: int = 2000) -> dict[str, str]:
-    """Fetch document texts in parallel batches."""
-    doc_ids = [d["id"] for d in docs]
-    all_texts = {}
-    batch_size = 100
-    for i in range(0, len(doc_ids), batch_size):
-        batch = doc_ids[i : i + batch_size]
-        texts = fetch_documents_text_parallel(batch, max_chars=max_chars)
-        all_texts.update(texts)
-        fetched = min(i + batch_size, len(doc_ids))
-        if fetched < len(doc_ids):
-            print(f"  Fetched text: {fetched}/{len(doc_ids)}")
-    return all_texts
-
-
 def build_label_maps(taxonomy: dict) -> tuple[dict, dict, dict, dict]:
     """Build label-to-id and id-to-label mappings for both dimensions."""
     type_labels = [t["id"] for t in taxonomy["document_types"]]
     topic_labels = [t["id"] for t in taxonomy["topics"]]
-
     type2id = {label: i for i, label in enumerate(type_labels)}
     id2type = {i: label for label, i in type2id.items()}
     topic2id = {label: i for i, label in enumerate(topic_labels)}
     id2topic = {i: label for label, i in topic2id.items()}
-
     return type2id, id2type, topic2id, id2topic
 
 
@@ -116,9 +104,39 @@ def compute_metrics(eval_pred):
     """Compute accuracy and macro F1 for evaluation."""
     predictions, labels = eval_pred
     preds = np.argmax(predictions, axis=-1)
-    acc = accuracy_score(labels, preds)
-    f1 = f1_score(labels, preds, average="macro")
-    return {"accuracy": acc, "f1_macro": f1}
+    return {
+        "accuracy": accuracy_score(labels, preds),
+        "f1_macro": f1_score(labels, preds, average="macro"),
+    }
+
+
+def compute_class_weights(labels: list[int], num_classes: int) -> torch.Tensor:
+    """Compute inverse-frequency class weights."""
+    from collections import Counter
+
+    counts = Counter(labels)
+    total = len(labels)
+    weights = [total / (num_classes * counts.get(i, 1)) for i in range(num_classes)]
+    return torch.tensor(weights, dtype=torch.float32)
+
+
+class WeightedTrainer(Trainer):
+    """Trainer with class-weighted cross-entropy loss."""
+
+    def __init__(self, class_weights: torch.Tensor | None = None, **kwargs):
+        super().__init__(**kwargs)
+        self.class_weights = class_weights
+
+    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+        labels = inputs.pop("labels")
+        outputs = model(**inputs)
+        logits = outputs.logits
+        if self.class_weights is not None:
+            weight = self.class_weights.to(logits.device)
+            loss = torch.nn.functional.cross_entropy(logits, labels, weight=weight)
+        else:
+            loss = torch.nn.functional.cross_entropy(logits, labels)
+        return (loss, outputs) if return_outputs else loss
 
 
 def train_classifier(
@@ -141,33 +159,25 @@ def train_classifier(
     print(f"\n{'=' * 60}")
     print(f"Training: {classifier_name}")
     print(f"  Train: {len(train_texts)}, Val: {len(val_texts)}")
-    print(f"  Classes: {num_labels}")
-    print(f"  Model: {model_name}")
+    print(f"  Classes: {num_labels}, Model: {model_name}")
     print(f"  Epochs: {epochs}, LR: {lr}, Batch: {batch_size}")
     print(f"{'=' * 60}")
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForSequenceClassification.from_pretrained(
-        model_name,
-        num_labels=num_labels,
-        id2label=id2label,
-        label2id=label2id,
+        model_name, num_labels=num_labels, id2label=id2label, label2id=label2id,
     )
 
     def tokenize(examples):
         return tokenizer(
-            examples["text"],
-            truncation=True,
-            max_length=max_length,
-            padding="max_length",
+            examples["text"], truncation=True,
+            max_length=max_length, padding="max_length",
         )
 
     train_ds = Dataset.from_dict({"text": train_texts, "label": train_labels})
     val_ds = Dataset.from_dict({"text": val_texts, "label": val_labels})
-
     train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"])
     val_ds = val_ds.map(tokenize, batched=True, remove_columns=["text"])
-
     train_ds.set_format("torch")
     val_ds.set_format("torch")
 
@@ -193,44 +203,38 @@ def tokenize(examples):
         seed=42,
     )
 
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_ds,
-        eval_dataset=val_ds,
+    class_weights = compute_class_weights(train_labels, num_labels)
+    print(f"  Class weights: {[f'{w:.2f}' for w in class_weights.tolist()]}")
+
+    trainer = WeightedTrainer(
+        class_weights=class_weights, model=model, args=training_args,
+        train_dataset=train_ds, eval_dataset=val_ds,
         compute_metrics=compute_metrics,
         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
     )
 
     trainer.train()
 
-    # Evaluate on validation set
     eval_results = trainer.evaluate()
     print(f"\n  Val accuracy: {eval_results['eval_accuracy']:.4f}")
     print(f"  Val F1 macro: {eval_results['eval_f1_macro']:.4f}")
 
-    # Save best model
     best_dir = os.path.join(save_dir, "best")
     trainer.save_model(best_dir)
     tokenizer.save_pretrained(best_dir)
 
-    # Full classification report on val set
     preds = trainer.predict(val_ds)
     pred_labels = np.argmax(preds.predictions, axis=-1)
     report = classification_report(
-        val_ds["label"],
-        pred_labels,
+        val_ds["label"], pred_labels,
         target_names=[id2label[i] for i in range(num_labels)],
     )
     print(f"\nClassification Report ({classifier_name}):\n{report}")
 
-    # Save report
     report_path = os.path.join(save_dir, "eval_report.txt")
     with open(report_path, "w") as f:
-        f.write(f"Model: {model_name}\n")
-        f.write(f"Classifier: {classifier_name}\n")
-        f.write(f"Train size: {len(train_texts)}\n")
-        f.write(f"Val size: {len(val_texts)}\n")
+        f.write(f"Model: {model_name}\nClassifier: {classifier_name}\n")
+        f.write(f"Train size: {len(train_texts)}\nVal size: {len(val_texts)}\n")
         f.write(f"Epochs: {epochs}\n")
         f.write(f"Val accuracy: {eval_results['eval_accuracy']:.4f}\n")
         f.write(f"Val F1 macro: {eval_results['eval_f1_macro']:.4f}\n\n")
@@ -239,93 +243,12 @@ def tokenize(examples):
     return eval_results
 
 
-def main():
-    parser = argparse.ArgumentParser(
-        description="Train ModernBERT classifiers on labeled documents"
-    )
-    parser.add_argument(
-        "--input", type=str, required=True, help="Labeled JSONL from label.py"
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        default=DEFAULT_OUTPUT_DIR,
-        help=f"Output directory for models (default: {DEFAULT_OUTPUT_DIR})",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default=DEFAULT_MODEL,
-        help=f"Base model (default: {DEFAULT_MODEL})",
-    )
-    parser.add_argument(
-        "--epochs",
-        type=int,
-        default=DEFAULT_EPOCHS,
-        help=f"Training epochs (default: {DEFAULT_EPOCHS})",
-    )
-    parser.add_argument(
-        "--lr",
-        type=float,
-        default=DEFAULT_LR,
-        help=f"Learning rate (default: {DEFAULT_LR})",
-    )
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=DEFAULT_BATCH_SIZE,
-        help=f"Batch size (default: {DEFAULT_BATCH_SIZE})",
-    )
-    parser.add_argument(
-        "--max-length",
-        type=int,
-        default=DEFAULT_MAX_LENGTH,
-        help=f"Max token length (default: {DEFAULT_MAX_LENGTH})",
-    )
-    parser.add_argument(
-        "--val-split",
-        type=float,
-        default=DEFAULT_VAL_SPLIT,
-        help=f"Validation split ratio (default: {DEFAULT_VAL_SPLIT})",
-    )
-    parser.add_argument(
-        "--min-confidence",
-        type=float,
-        default=0.0,
-        help="Minimum LLM confidence to include (default: 0.0)",
-    )
-    parser.add_argument(
-        "--max-chars",
-        type=int,
-        default=2000,
-        help="Max characters of document text (default: 2000)",
-    )
-    parser.add_argument("--seed", type=int, default=42, help="Random seed")
-    args = parser.parse_args()
-
-    if not os.path.exists(args.input):
-        print(f"ERROR: Input file not found: {args.input}")
-        sys.exit(1)
-
-    # Load taxonomy and label maps
-    taxonomy = load_taxonomy()
-    type2id, id2type, topic2id, id2topic = build_label_maps(taxonomy)
-    print(f"Taxonomy: {taxonomy['name']} v{taxonomy['version']}")
-    print(f"  Document types: {len(type2id)} classes")
-    print(f"  Topics: {len(topic2id)} classes")
-
-    # Load labeled data
-    docs = load_labeled_data(args.input, min_confidence=args.min_confidence)
-    print(f"\nLoaded {len(docs)} labeled documents")
-    if args.min_confidence > 0:
-        print(f"  (filtered by confidence >= {args.min_confidence})")
-
-    # Fetch texts
-    print(f"\nFetching document texts (max {args.max_chars} chars)...")
-    texts = fetch_texts(docs, max_chars=args.max_chars)
-    print(f"  Got text for {sum(1 for t in texts.values() if t)}/{len(docs)} docs")
-
-    # Prepare datasets for both classifiers
+def run_training_pipeline(
+    docs, texts, taxonomy, type2id, id2type, topic2id, id2topic,
+    output_dir, model_name, epochs, lr, batch_size, max_length, val_split, seed,
+):
+    """Shared training pipeline used by both local and Modal paths."""
+    # Prepare datasets
     print("\nPreparing document_type dataset...")
     type_texts, type_labels = prepare_dataset(docs, texts, type2id, "document_type")
     print(f"  {len(type_texts)} examples across {len(set(type_labels))} classes")
@@ -334,92 +257,45 @@ def main():
     topic_texts, topic_labels = prepare_dataset(docs, texts, topic2id, "document_topic")
     print(f"  {len(topic_texts)} examples across {len(set(topic_labels))} classes")
 
-    # Split into train/val (same split for both to keep comparable)
-    print(f"\nSplitting: {1 - args.val_split:.0%} train / {args.val_split:.0%} val")
+    print(f"\nSplitting: {1 - val_split:.0%} train / {val_split:.0%} val")
 
-    type_train_texts, type_val_texts, type_train_labels, type_val_labels = (
-        train_test_split(
-            type_texts,
-            type_labels,
-            test_size=args.val_split,
-            random_state=args.seed,
-            stratify=type_labels,
-        )
+    type_train_t, type_val_t, type_train_l, type_val_l = train_test_split(
+        type_texts, type_labels, test_size=val_split,
+        random_state=seed, stratify=type_labels,
     )
-
-    topic_train_texts, topic_val_texts, topic_train_labels, topic_val_labels = (
-        train_test_split(
-            topic_texts,
-            topic_labels,
-            test_size=args.val_split,
-            random_state=args.seed,
-            stratify=topic_labels,
-        )
+    topic_train_t, topic_val_t, topic_train_l, topic_val_l = train_test_split(
+        topic_texts, topic_labels, test_size=val_split,
+        random_state=seed, stratify=topic_labels,
     )
 
-    os.makedirs(args.output_dir, exist_ok=True)
+    os.makedirs(output_dir, exist_ok=True)
 
-    # Train document_type classifier
     type_results = train_classifier(
-        train_texts=type_train_texts,
-        train_labels=type_train_labels,
-        val_texts=type_val_texts,
-        val_labels=type_val_labels,
-        num_labels=len(type2id),
-        id2label=id2type,
-        label2id=type2id,
-        output_dir=args.output_dir,
-        model_name=args.model,
-        epochs=args.epochs,
-        lr=args.lr,
-        batch_size=args.batch_size,
-        max_length=args.max_length,
-        classifier_name="document_type",
+        type_train_t, type_train_l, type_val_t, type_val_l,
+        len(type2id), id2type, type2id, output_dir,
+        model_name, epochs, lr, batch_size, max_length, "document_type",
     )
 
-    # Train topic classifier
     topic_results = train_classifier(
-        train_texts=topic_train_texts,
-        train_labels=topic_train_labels,
-        val_texts=topic_val_texts,
-        val_labels=topic_val_labels,
-        num_labels=len(topic2id),
-        id2label=id2topic,
-        label2id=topic2id,
-        output_dir=args.output_dir,
-        model_name=args.model,
-        epochs=args.epochs,
-        lr=args.lr,
-        batch_size=args.batch_size,
-        max_length=args.max_length,
-        classifier_name="topic",
+        topic_train_t, topic_train_l, topic_val_t, topic_val_l,
+        len(topic2id), id2topic, topic2id, output_dir,
+        model_name, epochs, lr, batch_size, max_length, "topic",
     )
 
-    # Summary
     print(f"\n{'=' * 60}")
     print("Training Complete")
     print(f"{'=' * 60}")
     print(f"  Document Type — Acc: {type_results['eval_accuracy']:.4f}, F1: {type_results['eval_f1_macro']:.4f}")
     print(f"  Topic         — Acc: {topic_results['eval_accuracy']:.4f}, F1: {topic_results['eval_f1_macro']:.4f}")
-    print(f"\n  Models saved to: {args.output_dir}/")
-    print(f"    {args.output_dir}/document_type/best/")
-    print(f"    {args.output_dir}/topic/best/")
 
-    # Save training config
     config = {
-        "base_model": args.model,
+        "base_model": model_name,
         "taxonomy": taxonomy["name"],
         "taxonomy_version": taxonomy["version"],
-        "input_file": args.input,
         "total_docs": len(docs),
-        "min_confidence": args.min_confidence,
-        "max_chars": args.max_chars,
-        "max_length": args.max_length,
-        "epochs": args.epochs,
-        "learning_rate": args.lr,
-        "batch_size": args.batch_size,
-        "val_split": args.val_split,
-        "seed": args.seed,
+        "epochs": epochs,
+        "learning_rate": lr,
+        "batch_size": batch_size,
         "results": {
             "document_type": {
                 "accuracy": type_results["eval_accuracy"],
@@ -431,12 +307,315 @@ def main():
             },
         },
     }
-    config_path = os.path.join(args.output_dir, "training_config.json")
+    config_path = os.path.join(output_dir, "training_config.json")
     with open(config_path, "w") as f:
         json.dump(config, f, indent=2)
-    print(f"  Config saved to: {config_path}")
+    print(f"  Config: {config_path}")
+
+    return config
+
+
+# ---------------------------------------------------------------------------
+# Local training
+# ---------------------------------------------------------------------------
+
+
+def run_local(args):  # args: the argparse namespace built in main()
+    """Train locally using available device (CPU/MPS/CUDA)."""
+    taxonomy = load_taxonomy()
+    type2id, id2type, topic2id, id2topic = build_label_maps(taxonomy)  # label<->id maps for both classifiers
+    print(f"Taxonomy: {taxonomy['name']} v{taxonomy['version']}")
+    print(f"  Document types: {len(type2id)}, Topics: {len(topic2id)}")
+
+    docs = load_labeled_data(args.input, min_confidence=args.min_confidence)  # presumably filters by confidence; helper not shown here
+    print(f"\nLoaded {len(docs)} labeled documents")
+
+    print(f"\nFetching document texts (max {args.max_chars} chars)...")
+    doc_ids = [d["id"] for d in docs]
+    all_texts = {}  # doc id -> text, merged across batches
+    for i in range(0, len(doc_ids), 100):  # hand the fetch helper 100 ids per call
+        batch = doc_ids[i : i + 100]
+        all_texts.update(fetch_documents_text_parallel(batch, max_chars=args.max_chars))
+    print(f"  Got text for {sum(1 for t in all_texts.values() if t)}/{len(docs)} docs")  # counts non-empty texts only
+
+    config = run_training_pipeline(  # shared pipeline; returns the config dict it also writes to training_config.json
+        docs, all_texts, taxonomy, type2id, id2type, topic2id, id2topic,
+        args.output_dir, args.model, args.epochs, args.lr,
+        args.batch_size, args.max_length, args.val_split, args.seed,
+    )
+
+    print(f"\n  Models saved to: {args.output_dir}/")
+    print(f"  Next step: python classify.py --models-dir {args.output_dir}")
+    return config
+
+
+# ---------------------------------------------------------------------------
+# Modal cloud training
+# ---------------------------------------------------------------------------
+
+
+def run_modal(args):
+    """Train both classifiers on a Modal cloud GPU, then download the saved models to --output-dir."""
+    import modal
+
+    app = modal.App("docx-classifier-training")
+
+    training_image = (
+        modal.Image.debian_slim(python_version="3.11")
+        .pip_install("torch", "transformers", "datasets", "scikit-learn", "accelerate", "numpy")
+    )
+    model_volume = modal.Volume.from_name("classifier-models", create_if_missing=True)
+
+    # Read the local inputs up front; they are passed to the remote function as plain arguments.
+    labeled_jsonl = Path(args.input).read_text(encoding="utf-8")  # explicit encoding: do not depend on the platform default
+    taxonomy_path = Path(__file__).parent / "taxonomy.json"
+    with open(taxonomy_path, encoding="utf-8") as f:
+        taxonomy = json.load(f)
+
+    gpu_map = {"t4": "T4", "a10g": "a10g", "l4": "l4", "a100": "a100"}
+    gpu = gpu_map.get(args.gpu.lower(), args.gpu)  # names not in the map are passed through unchanged
+
+    @app.function(image=training_image, gpu=gpu, timeout=3600, volumes={"/models": model_volume}, serialized=True)
+    def train_remote(labeled_jsonl: str, taxonomy: dict, **kwargs):
+        """Self-contained training function running on Modal GPU (nested, so it is registered with serialized=True)."""
+        import json
+        import os
+        import urllib.request
+        from collections import Counter
+        from concurrent.futures import ThreadPoolExecutor
+
+        import numpy as np
+        import torch
+        from datasets import Dataset
+        from sklearn.metrics import accuracy_score, classification_report, f1_score
+        from sklearn.model_selection import train_test_split
+        from transformers import (
+            AutoModelForSequenceClassification, AutoTokenizer,
+            EarlyStoppingCallback, Trainer, TrainingArguments,
+        )
+
+        TEXT_BASE_URL = "https://docxcorp.us/extracted"
+
+        def fetch_text(doc_id, max_chars=2000):  # best effort: any failure yields ""
+            try:
+                req = urllib.request.Request(
+                    f"{TEXT_BASE_URL}/{doc_id}.txt",
+                    headers={"User-Agent": "docx-classifier/2.0"},
+                )
+                with urllib.request.urlopen(req, timeout=15) as resp:
+                    return resp.read().decode("utf-8")[:max_chars]
+            except Exception:  # deliberate: docs with no text are dropped later by prep()
+                return ""
+
+        def fetch_texts_parallel(docs, max_chars):  # returns {doc id: text}
+            results = {}
+            def fetch_one(did):
+                return did, fetch_text(did, max_chars)
+            with ThreadPoolExecutor(max_workers=40) as ex:
+                for did, text in ex.map(fetch_one, [d["id"] for d in docs]):
+                    results[did] = text
+            print(f"  Fetched text for {sum(1 for t in results.values() if t)}/{len(docs)} docs")
+            return results
+
+        class _WeightedTrainer(Trainer):  # Trainer whose loss is class-weighted cross-entropy
+            def __init__(self, class_weights=None, **kw):
+                super().__init__(**kw)
+                self.class_weights = class_weights
+
+            def compute_loss(self, model, inputs, return_outputs=False, **kw):
+                labels = inputs.pop("labels")  # removed so the model call below receives features only
+                outputs = model(**inputs)
+                logits = outputs.logits
+                if self.class_weights is not None:
+                    w = self.class_weights.to(logits.device)  # weights must live on the same device as the logits
+                    loss = torch.nn.functional.cross_entropy(logits, labels, weight=w)
+                else:
+                    loss = torch.nn.functional.cross_entropy(logits, labels)
+                return (loss, outputs) if return_outputs else loss
+
+        def _compute_metrics(eval_pred):  # accuracy and macro-F1 over argmax predictions
+            preds = np.argmax(eval_pred.predictions, axis=-1)
+            return {
+                "accuracy": accuracy_score(eval_pred.label_ids, preds),
+                "f1_macro": f1_score(eval_pred.label_ids, preds, average="macro"),
+            }
+
+        def _class_weights(labels, n):  # weight_i = total / (n * count_i); a class with no examples uses count 1
+            counts = Counter(labels)
+            total = len(labels)
+            return torch.tensor([total / (n * counts.get(i, 1)) for i in range(n)], dtype=torch.float32)
+
+        def _train_one(train_t, train_l, val_t, val_l, n_labels, id2l, l2id, out, mname, ep, lr, bs, ml, name):
+            print(f"\n{'='*60}\nTraining: {name}\n  Train: {len(train_t)}, Val: {len(val_t)}, Classes: {n_labels}")
+            print(f"  Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
+
+            tok = AutoTokenizer.from_pretrained(mname)
+            mdl = AutoModelForSequenceClassification.from_pretrained(mname, num_labels=n_labels, id2label=id2l, label2id=l2id)
+
+            def tokenize(ex):
+                return tok(ex["text"], truncation=True, max_length=ml, padding="max_length")
+
+            tds = Dataset.from_dict({"text": train_t, "label": train_l}).map(tokenize, batched=True, remove_columns=["text"])
+            vds = Dataset.from_dict({"text": val_t, "label": val_l}).map(tokenize, batched=True, remove_columns=["text"])
+            tds.set_format("torch"); vds.set_format("torch")
+
+            sd = os.path.join(out, name)  # per-classifier output directory
+            args = TrainingArguments(
+                output_dir=sd, num_train_epochs=ep, per_device_train_batch_size=bs,
+                per_device_eval_batch_size=bs*2, learning_rate=lr, weight_decay=0.01,
+                warmup_ratio=0.1, eval_strategy="epoch", save_strategy="epoch",
+                load_best_model_at_end=True, metric_for_best_model="f1_macro",
+                greater_is_better=True, save_total_limit=2, logging_steps=50,
+                fp16=torch.cuda.is_available(), report_to="none", seed=42,  # NOTE(review): fixed seed; --seed only drives the split
+            )
+            cw = _class_weights(train_l, n_labels)
+            trainer = _WeightedTrainer(
+                class_weights=cw, model=mdl, args=args,
+                train_dataset=tds, eval_dataset=vds, compute_metrics=_compute_metrics,
+                callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
+            )
+            trainer.train()
+            res = trainer.evaluate()
+            best = os.path.join(sd, "best")
+            trainer.save_model(best); tok.save_pretrained(best)
+
+            preds = trainer.predict(vds)
+            pl = np.argmax(preds.predictions, axis=-1)
+            report = classification_report(vds["label"], pl, labels=list(range(n_labels)), target_names=[id2l[i] for i in range(n_labels)])  # explicit labels: a class absent from val must not raise
+            print(f"\nClassification Report ({name}):\n{report}")
+            return res
+
+        # --- Main pipeline: load docs, fetch texts, split, train both classifiers, write config ---
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        print(f"Device: {device} ({torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'})")
+
+        docs = [json.loads(l) for l in labeled_jsonl.strip().split("\n") if l.strip()]
+        docs = [d for d in docs if d.get("confidence", 0) >= kwargs.get("min_confidence", 0)]
+        print(f"Loaded {len(docs)} documents")
+
+        texts = fetch_texts_parallel(docs, kwargs.get("max_chars", 2000))
+
+        type_labels = [t["id"] for t in taxonomy["document_types"]]
+        topic_labels = [t["id"] for t in taxonomy["topics"]]
+        type2id = {l: i for i, l in enumerate(type_labels)}
+        id2type = {i: l for l, i in type2id.items()}
+        topic2id = {l: i for i, l in enumerate(topic_labels)}
+        id2topic = {i: l for l, i in topic2id.items()}
+
+        def prep(field, lmap):  # keep docs that have non-empty text and a label present in lmap
+            it, il = [], []
+            for d in docs:
+                t, lab = texts.get(d["id"], ""), d.get(field, "")
+                if t and lab in lmap:
+                    it.append(t); il.append(lmap[lab])
+            return it, il
+
+        tt, tl = prep("document_type", type2id)
+        tpt, tpl = prep("document_topic", topic2id)
+
+        vs = kwargs.get("val_split", 0.15)
+        sd = kwargs.get("seed", 42)
+        ttt, tvt, ttl, tvl = train_test_split(tt, tl, test_size=vs, random_state=sd, stratify=tl)
+        tptt, tpvt, tptl, tpvl = train_test_split(tpt, tpl, test_size=vs, random_state=sd, stratify=tpl)
+
+        out = "/models"  # mount point of model_volume inside the container
+        os.makedirs(out, exist_ok=True)
+        mn = kwargs.get("model_name", "xlm-roberta-base")
+        ep = kwargs.get("epochs", 5)
+        lr = kwargs.get("lr", 2e-5)
+        bs = kwargs.get("batch_size", 16)
+        ml = kwargs.get("max_length", 512)
+
+        tr = _train_one(ttt, ttl, tvt, tvl, len(type2id), id2type, type2id, out, mn, ep, lr, bs, ml, "document_type")
+        tpr = _train_one(tptt, tptl, tpvt, tpvl, len(topic2id), id2topic, topic2id, out, mn, ep, lr, bs, ml, "topic")
+
+        config = {
+            "base_model": mn, "taxonomy": taxonomy["name"],
+            "taxonomy_version": taxonomy["version"], "total_docs": len(docs),
+            "epochs": ep, "learning_rate": lr, "batch_size": bs,
+            "results": {
+                "document_type": {"accuracy": tr["eval_accuracy"], "f1_macro": tr["eval_f1_macro"]},
+                "topic": {"accuracy": tpr["eval_accuracy"], "f1_macro": tpr["eval_f1_macro"]},
+            },
+        }
+        with open(os.path.join(out, "training_config.json"), "w") as f:
+            json.dump(config, f, indent=2)
+        return config
+
+    @app.function(image=training_image, volumes={"/models": model_volume}, serialized=True)
+    def collect_models() -> dict[str, bytes]:  # nested as well, hence serialized=True; returns {relative path: file bytes}
+        model_volume.reload()
+        files = {}
+        for root, _dirs, filenames in os.walk("/models"):
+            if "/best" in root or root == "/models":  # only the saved best models and top-level files
+                for fname in filenames:
+                    full = os.path.join(root, fname)
+                    with open(full, "rb") as fh:  # close each handle instead of leaking one per file
+                        files[full.replace("/models/", "")] = fh.read()
+        return files
+
+    print(f"Submitting training job to Modal ({gpu} GPU)...")
+    print(f"  Input: {args.input} ({labeled_jsonl.count(chr(10))} lines)")
+    print(f"  Model: {args.model}, Epochs: {args.epochs}\n")
+
+    with app.run():
+        config = train_remote.remote(
+            labeled_jsonl=labeled_jsonl, taxonomy=taxonomy,
+            model_name=args.model, epochs=args.epochs, lr=args.lr,
+            batch_size=args.batch_size, max_length=args.max_length,
+            val_split=args.val_split, min_confidence=args.min_confidence,
+            max_chars=args.max_chars, seed=args.seed,
+        )
+
+        print("\n--- Results ---")
+        print(json.dumps(config, indent=2))
+
+        # Copy the collected files into the local output directory, recreating sub-directories.
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        print(f"\nDownloading models to {output_dir}...")
+
+        files = collect_models.remote()
+        for rel_path, data in files.items():
+            local_path = output_dir / rel_path
+            local_path.parent.mkdir(parents=True, exist_ok=True)
+            local_path.write_bytes(data)
+            print(f"  {rel_path}")
+        print(f"\nModels saved to {output_dir}/")
+
+    return config  # the config/metrics dict returned by train_remote
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main():  # CLI entry point: parse options, check the input file, then train locally or on Modal
+    parser = argparse.ArgumentParser(description="Train classifiers on labeled documents")
+    parser.add_argument("--input", type=str, required=True, help="Labeled JSONL from label.py")
+    parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR)  # output directory for models
+    parser.add_argument("--model", type=str, default=DEFAULT_MODEL)  # base model
+    parser.add_argument("--epochs", type=int, default=DEFAULT_EPOCHS)  # training epochs
+    parser.add_argument("--lr", type=float, default=DEFAULT_LR)  # learning rate
+    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)  # batch size
+    parser.add_argument("--max-length", type=int, default=DEFAULT_MAX_LENGTH)  # max token length
+    parser.add_argument("--val-split", type=float, default=DEFAULT_VAL_SPLIT)  # validation split ratio
+    parser.add_argument("--min-confidence", type=float, default=0.0)  # minimum LLM confidence to include
+    parser.add_argument("--max-chars", type=int, default=2000)  # max characters of document text
+    parser.add_argument("--seed", type=int, default=42)  # random seed
+    parser.add_argument("--modal", action="store_true", help="Train on Modal cloud GPU")
+    parser.add_argument("--gpu", type=str, default="T4", help="Modal GPU type: T4, a10g, l4, a100")  # read only by run_modal
+    args = parser.parse_args()
+
+    if not os.path.exists(args.input):  # fail before any training work starts
+        print(f"ERROR: Input file not found: {args.input}")
+        sys.exit(1)
 
-    print(f"\nNext step: python classify.py --models-dir {args.output_dir}")
+    if args.modal:  # both paths take the same parsed namespace
+        run_modal(args)
+    else:
+        run_local(args)
 
 
 if __name__ == "__main__":