From dfd4aa585b629b9bf6b12495c2b693200c040413 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Sun, 8 Mar 2026 21:45:59 -0300 Subject: [PATCH 1/2] feat: add document classification pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Python ML pipeline (scripts/classification/) that classifies ~800K .docx documents by document type (10 classes) and topic (9 classes) using the FineWeb-Edu pattern: LLM labels a sample → train ModernBERT → apply at scale. Pipeline steps: - sample.py: stratified sampling across languages and word count - label.py: async LLM labeling with Claude (resumable) - train.py: fine-tune two ModernBERT classifiers - classify.py: batch inference on full corpus - evaluate.py: quality metrics and distribution analysis Also adds: - LLM classification fields and methods to DbClient - CLAUDE.md / AGENTS.md at root, packages/shared, and scripts/classification - Updated README with Phase 5 (Classify) and project structure --- .gitignore | 2 +- AGENTS.md | 1 + CLAUDE.md | 64 ++++ README.md | 76 ++++- bun.lock | 99 +++++- packages/shared/AGENTS.md | 1 + packages/shared/CLAUDE.md | 22 ++ packages/shared/db.ts | 95 +++++- packages/shared/index.ts | 1 + scripts/classification/AGENTS.md | 1 + scripts/classification/CLAUDE.md | 38 +++ scripts/classification/README.md | 81 +++++ scripts/classification/classify.py | 384 ++++++++++++++++++++++ scripts/classification/common.py | 130 ++++++++ scripts/classification/evaluate.py | 303 ++++++++++++++++++ scripts/classification/label.py | 375 ++++++++++++++++++++++ scripts/classification/pyproject.toml | 18 ++ scripts/classification/sample.py | 250 +++++++++++++++ scripts/classification/taxonomy.json | 114 +++++++ scripts/classification/train.py | 443 ++++++++++++++++++++++++++ 20 files changed, 2469 insertions(+), 29 deletions(-) create mode 120000 AGENTS.md create mode 100644 CLAUDE.md create mode 120000 packages/shared/AGENTS.md create mode 100644 
packages/shared/CLAUDE.md create mode 120000 scripts/classification/AGENTS.md create mode 100644 scripts/classification/CLAUDE.md create mode 100644 scripts/classification/README.md create mode 100644 scripts/classification/classify.py create mode 100644 scripts/classification/common.py create mode 100644 scripts/classification/evaluate.py create mode 100644 scripts/classification/label.py create mode 100644 scripts/classification/pyproject.toml create mode 100644 scripts/classification/sample.py create mode 100644 scripts/classification/taxonomy.json create mode 100644 scripts/classification/train.py diff --git a/.gitignore b/.gitignore index 0cff47c..7473c68 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Dependencies node_modules/ .venv/ +__pycache__/ # Build output dist/ @@ -23,4 +24,3 @@ coverage/ # OS .DS_Store -CLAUDE.md diff --git a/AGENTS.md b/AGENTS.md new file mode 120000 index 0000000..681311e --- /dev/null +++ b/AGENTS.md @@ -0,0 +1 @@ +CLAUDE.md \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..5511efb --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,64 @@ +# docx-corpus + +The largest open corpus of .docx files (~800K documents) for document processing research. Built by [SuperDoc](https://superdoc.dev). 
+ +## Architecture + +This is a **data pipeline monorepo** with two runtimes: + +- **TypeScript (Bun)** — infrastructure: scraping, extraction, embedding +- **Python** — data science: classification, export, publishing + +``` +apps/cli/ → corpus (scrape, extract, embed, status) +apps/cdx-filter/ → AWS Lambda for Common Crawl CDX filtering +packages/shared/ → DB client (Bun.sql), R2 storage, UI helpers +packages/scraper/ → Downloads .docx from Common Crawl WARC archives +packages/extractor/ → Text extraction via Docling +packages/embedder/ → Embeddings via Google gemini-embedding-001 +scripts/classification/ → ML classification pipeline (Python) +db/ → PostgreSQL schema + migrations +``` + +## Pipeline + +Each stage writes to the same PostgreSQL database (`documents` table): + +1. **Scrape** (TS) — Common Crawl → .docx files in R2 (`status = 'uploaded'`) +2. **Extract** (TS) — Docling → text in R2 (`extracted_at`, `word_count`, `language`) +3. **Embed** (TS) — Google API → pgvector (`embedding`, `embedded_at`) +4. **Classify** (Python) — ModernBERT → labels (`document_type`, `document_topic`) + +## Database + +Single `documents` table in PostgreSQL (NeonDB) with pgvector. All pipeline stages write to this table. + +- **Connection**: `DATABASE_URL` env var (Bun.sql for TS, psycopg2 for Python) +- **Schema**: `db/schema.sql` (canonical), `db/migrations/` (incremental) +- **Key columns**: `id` (SHA-256 hash), `status`, `extracted_at`, `embedded_at`, `document_type`, `document_topic` + +## Storage + +Documents and extracted text live in Cloudflare R2: +- `documents/{hash}.docx` — original files +- `extracted/{hash}.txt` — extracted text + +Text is also available at `https://docxcorp.us/extracted/{id}.txt`. 
+ +## Commands + +```bash +bun install # Install TS dependencies +bun run corpus scrape --crawl 3 # Scrape from Common Crawl +bun run corpus extract # Extract text +bun run corpus embed # Generate embeddings +bun run corpus status # Show pipeline stats +``` + +## Key conventions + +- Use `bun` for all TS tooling (not node/npm/pnpm) +- DB client is in `packages/shared/db.ts` — all TS pipeline stages use `DbClient` (Python scripts connect via psycopg2) +- Storage abstraction in `packages/shared/storage.ts` — R2 or local +- Environment: `.env` at project root (gitignored), see `.env.example` +- Python scripts manage their own deps via `pyproject.toml` diff --git a/README.md b/README.md index 532af78..94bd0a2 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,13 @@ Phase 4: Embed (corpus embed) │ extracted/ │ ──► │ transformers │ ──► │ (pgvector) │ │ {hash}.txt │ │ (Python) │ │ embedding │ └────────────────┘ └────────────────┘ └────────────────┘ + +Phase 5: Classify (Python ML pipeline) +┌────────────────┐ ┌────────────────┐ ┌────────────────┐ +│ LLM labels │ │ ModernBERT │ │ PostgreSQL │ +│ 3,500 sample │ ──► │ fine-tuning │ ──► │ document_type │ +│ (Claude) │ │ (2 models) │ │ document_topic│ +└────────────────┘ └────────────────┘ └────────────────┘ ``` ### Why Common Crawl? 
@@ -82,27 +89,29 @@ bun install ## Project Structure ``` -packages/ - shared/ # Shared utilities (DB client, storage, formatting) - scraper/ # Core scraper logic (downloads WARC, validates .docx) - extractor/ # Text extraction using Docling (Python) - embedder/ # Document embeddings apps/ - cli/ # Unified CLI - corpus - cdx-filter/ # AWS Lambda - filters CDX indexes for .docx URLs - web/ # Landing page - docxcorp.us + cli/ # Unified CLI — corpus + cdx-filter/ # AWS Lambda — filters CDX indexes for .docx URLs + web/ # Landing page — docxcorp.us +packages/ + shared/ # DB client, storage, formatting (Bun) + scraper/ # Downloads WARC, validates .docx (Bun) + extractor/ # Text extraction via Docling (Bun + Python) + embedder/ # Document embeddings (Bun) +scripts/ + classification/ # ML classification pipeline (Python) db/ - schema.sql # PostgreSQL schema (with pgvector) - migrations/ # Database migrations + schema.sql # PostgreSQL schema (with pgvector) + migrations/ # Database migrations ``` **Apps** (entry points) -| App | Purpose | Uses | -| -------------- | ------------------------------- | ------------------------ | -| **cli** | `corpus` command | scraper, extractor, embedder | -| **cdx-filter** | Filter CDX indexes (Lambda) | - | -| **web** | Landing page | - | +| App | Purpose | Runtime | +| -------------- | ------------------------------- | ------- | +| **cli** | `corpus` command | Bun | +| **cdx-filter** | Filter CDX indexes (Lambda) | Bun | +| **web** | Landing page | - | **Packages** (libraries) @@ -113,6 +122,12 @@ db/ | **extractor** | Extract text (Docling) | Bun + Python | | **embedder** | Generate embeddings | Bun | +**Scripts** (data science) + +| Script | Purpose | Runtime | +| ------------------------- | ------------------------------------------ | ------- | +| **scripts/classification** | Document type + topic classification (ML) | Python | + ## Usage ### 1. 
Run Lambda to filter CDX indexes @@ -173,6 +188,34 @@ bun run corpus embed --batch 100 --verbose Uses Google's `gemini-embedding-001` model (3072 dimensions, ~$0.006/1M tokens). Documents are chunked and embeddings are combined via weighted average. +### 5. Classify documents + +Classifies documents by **document type** (10 classes) and **topic** (9 classes) using the [FineWeb-Edu](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1) pattern: LLM labels a sample → train classifier → apply at scale. + +```bash +cd scripts/classification + +# Install Python dependencies +pip install -e . + +# Step 1: Sample 3,500 documents (stratified by language, word count, domain) +python sample.py --total 3500 --output sampled_docs.jsonl + +# Step 2: Label with Claude (~$3) +python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl + +# Step 3: Train ModernBERT classifiers (~30min GPU) +python train.py --input labeled_docs.jsonl --output-dir ./models + +# Step 4: Classify full corpus (~800K docs) +python classify.py --models-dir ./models + +# Check results +python evaluate.py corpus +``` + +See [scripts/classification/README.md](scripts/classification/README.md) for full details. 
+ ### Docker Run the CLI in a container: @@ -268,6 +311,9 @@ EMBED_INPUT_PREFIX=extracted EMBED_BATCH_SIZE=100 EMBED_CONCURRENCY=20 # Parallel API requests GOOGLE_API_KEY= # Required for embeddings + +# Classification (Python scripts only) +ANTHROPIC_API_KEY= # Required for LLM labeling step ``` ### Rate Limiting diff --git a/bun.lock b/bun.lock index a075957..568281e 100644 --- a/bun.lock +++ b/bun.lock @@ -11,7 +11,7 @@ }, "apps/cdx-filter": { "name": "@docx-corpus/cdx-filter", - "version": "0.14.3", + "version": "0.17.0", "dependencies": { "@aws-sdk/client-s3": "^3.966.0", }, @@ -27,11 +27,12 @@ }, "apps/cli": { "name": "@docx-corpus/cli", - "version": "0.9.3", + "version": "0.12.0", "bin": { "corpus": "./index.ts", }, "dependencies": { + "@docx-corpus/classifier": "workspace:*", "@docx-corpus/embedder": "workspace:*", "@docx-corpus/extractor": "workspace:*", "@docx-corpus/scraper": "workspace:*", @@ -47,9 +48,21 @@ "typescript": "^5.9.3", }, }, + "packages/classifier": { + "name": "@docx-corpus/classifier", + "version": "0.2.0", + "dependencies": { + "@anthropic-ai/sdk": "^0.39.0", + "@docx-corpus/shared": "workspace:*", + }, + "devDependencies": { + "@types/bun": "latest", + "typescript": "^5.9.3", + }, + }, "packages/embedder": { "name": "@docx-corpus/embedder", - "version": "0.2.0", + "version": "0.1.0", "dependencies": { "@docx-corpus/shared": "workspace:*", "@google/genai": "^1.38.0", @@ -104,6 +117,8 @@ "@actions/io": ["@actions/io@2.0.0", "", {}, "sha512-Jv33IN09XLO+0HS79aaODsvIRyduiF7NY/F6LYeK5oeUmrsz7aFdRphQjFoESF4jS7lMauDOttKALcpapVDIAg=="], + "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.39.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" } }, "sha512-eMyDIPRZbt1CCLErRCi3exlAvNkBtRe+kW5vvJyef93PmNr/clstYgHhtvmkxN82nlKgzyGPCyGxrm0JQ1ZIdg=="], + "@aws-crypto/crc32": 
["@aws-crypto/crc32@5.2.0", "", { "dependencies": { "@aws-crypto/util": "^5.2.0", "@aws-sdk/types": "^3.222.0", "tslib": "^2.6.2" } }, "sha512-nLbCWqQNgUiwwtFsen1AdzAtvuLRsQS8rYgMuxCrdKf9kOssamGLuPwyTY9wyYblNr9+1XM8v6zoDTPPSIeANg=="], "@aws-crypto/crc32c": ["@aws-crypto/crc32c@5.2.0", "", { "dependencies": { "@aws-crypto/util": "^5.2.0", "@aws-sdk/types": "^3.222.0", "tslib": "^2.6.2" } }, "sha512-+iWb8qaHLYKrNvGRbiYRHSdKRWhto5XlZUEBwDjYNf+ly5SVYG6zEoYIdxvf5R3zyeP16w4PLBn3rH1xc74Rag=="], @@ -212,6 +227,8 @@ "@docx-corpus/cdx-filter": ["@docx-corpus/cdx-filter@workspace:apps/cdx-filter"], + "@docx-corpus/classifier": ["@docx-corpus/classifier@workspace:packages/classifier"], + "@docx-corpus/cli": ["@docx-corpus/cli@workspace:apps/cli"], "@docx-corpus/embedder": ["@docx-corpus/embedder@workspace:packages/embedder"], @@ -400,14 +417,20 @@ "@smithy/uuid": ["@smithy/uuid@1.1.0", "", { "dependencies": { "tslib": "^2.6.2" } }, "sha512-4aUIteuyxtBUhVdiQqcDhKFitwfd9hqoSDYY2KRXiWtgoWJ9Bmise+KfEPDiVHWeJepvF8xJO9/9+WDIciMFFw=="], - "@types/bun": ["@types/bun@1.3.6", "", { "dependencies": { "bun-types": "1.3.6" } }, "sha512-uWCv6FO/8LcpREhenN1d1b6fcspAB+cefwD7uti8C8VffIv0Um08TKMn98FynpTiU38+y2dUO55T11NgDt8VAA=="], + "@types/bun": ["@types/bun@1.3.10", "", { "dependencies": { "bun-types": "1.3.10" } }, "sha512-0+rlrUrOrTSskibryHbvQkDOWRJwJZqZlxrUs1u4oOoTln8+WIXBPmAuCF35SWB2z4Zl3E84Nl/D0P7803nigQ=="], "@types/node": ["@types/node@25.0.6", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-NNu0sjyNxpoiW3YuVFfNz7mxSQ+S4X2G28uqg2s+CzoqoQjLPsWSbsFFyztIAqt2vb8kfEAsJNepMGPTxFDx3Q=="], + "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="], + "@types/normalize-package-data": ["@types/normalize-package-data@2.4.4", "", {}, 
"sha512-37i+OaWTh9qeK4LSHPsyRC7NahnGotNuZvjLSgcPzblpHB3rrCJxAOgI5gCdKm7coonsaX1Of0ILiTcnZjbfxA=="], + "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="], + "agent-base": ["agent-base@7.1.4", "", {}, "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ=="], + "agentkeepalive": ["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="], + "aggregate-error": ["aggregate-error@3.1.0", "", { "dependencies": { "clean-stack": "^2.0.0", "indent-string": "^4.0.0" } }, "sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA=="], "ansi-escapes": ["ansi-escapes@7.2.0", "", { "dependencies": { "environment": "^1.0.0" } }, "sha512-g6LhBsl+GBPRWGWsBtutpzBYuIIdBkLEvad5C/va/74Db018+5TZiyA26cZJAr3Rft5lprVqOIPxf5Vid6tqAw=="], @@ -424,6 +447,8 @@ "array-ify": ["array-ify@1.0.0", "", {}, "sha512-c5AMf34bKdvPhQ7tBGhqkgKNUzMr4WUs+WDtC2ZUGOUncbxKMTvqxYctiseW3+L4bA8ec+GcZ6/A/FW4m8ukng=="], + "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="], + "balanced-match": ["balanced-match@1.0.2", "", {}, "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="], "base64-js": ["base64-js@1.5.1", "", {}, "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA=="], @@ -442,7 +467,9 @@ "buffer-equal-constant-time": ["buffer-equal-constant-time@1.0.1", "", {}, "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA=="], - "bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, 
"sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="], + "bun-types": ["bun-types@1.3.10", "", { "dependencies": { "@types/node": "*" } }, "sha512-tcpfCCl6XWo6nCVnpcVrxQ+9AYN1iqMIzgrSKYMB/fjLtV2eyAVEg7AxQJuCq/26R6HpKWykQXuSOq/21RYcbg=="], + + "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="], "callsites": ["callsites@3.1.0", "", {}, "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ=="], @@ -462,6 +489,8 @@ "color-name": ["color-name@1.1.3", "", {}, "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw=="], + "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="], + "compare-func": ["compare-func@2.0.0", "", { "dependencies": { "array-ify": "^1.0.0", "dot-prop": "^5.1.0" } }, "sha512-zHig5N+tPWARooBnb0Zx1MFcdfpyJrfTJ3Y5L+IFvUm8rM74hHz66z0gw0x4tijh5CorKkKUCnW82R2vmpeCRA=="], "config-chain": ["config-chain@1.1.13", "", { "dependencies": { "ini": "^1.3.4", "proto-list": "~1.2.1" } }, "sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ=="], @@ -490,10 +519,14 @@ "deep-extend": ["deep-extend@0.6.0", "", {}, "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA=="], + "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="], + "dir-glob": ["dir-glob@3.0.1", "", { "dependencies": { "path-type": "^4.0.0" } }, "sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA=="], "dot-prop": ["dot-prop@5.3.0", "", { 
"dependencies": { "is-obj": "^2.0.0" } }, "sha512-QM8q3zDe58hqUqjraQOmzZ1LIH9SWQJTlEKCH4kJ2oQvLZk7RbQXvtDM2XEq3fwkV9CCvvH4LA0AV+ogFsBM2Q=="], + "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], + "duplexer2": ["duplexer2@0.1.4", "", { "dependencies": { "readable-stream": "^2.0.2" } }, "sha512-asLFVfWWtJ90ZyOUHMqk7/S2w2guQKxUI2itj3d92ADHhxUSbCMGi1f1cBcJ7xM1To+pE/Khbwo1yuNbMEPKeA=="], "eastasianwidth": ["eastasianwidth@0.2.0", "", {}, "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA=="], @@ -512,10 +545,20 @@ "error-ex": ["error-ex@1.3.4", "", { "dependencies": { "is-arrayish": "^0.2.1" } }, "sha512-sqQamAnR14VgCr1A618A3sGrygcpK+HEbenA/HiEAkkUwcZIIB/tgWqHFxWgOyDh4nB4JCRimh79dR5Ywc9MDQ=="], + "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], + + "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], + + "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="], + + "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="], + "escalade": ["escalade@3.2.0", "", {}, "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA=="], "escape-string-regexp": ["escape-string-regexp@5.0.0", "", {}, 
"sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw=="], + "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="], + "execa": ["execa@9.6.1", "", { "dependencies": { "@sindresorhus/merge-streams": "^4.0.0", "cross-spawn": "^7.0.6", "figures": "^6.1.0", "get-stream": "^9.0.0", "human-signals": "^8.0.1", "is-plain-obj": "^4.1.0", "is-stream": "^4.0.1", "npm-run-path": "^6.0.0", "pretty-ms": "^9.2.0", "signal-exit": "^4.1.0", "strip-final-newline": "^4.0.0", "yoctocolors": "^2.1.1" } }, "sha512-9Be3ZoN4LmYR90tUoVu2te2BsbzHfhJyfEiAVfz7N5/zv+jduIfLrV2xdQXOHbaD6KgpGdO9PRPM1Y4Q9QkPkA=="], "extend": ["extend@3.0.2", "", {}, "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g=="], @@ -540,12 +583,20 @@ "foreground-child": ["foreground-child@3.3.1", "", { "dependencies": { "cross-spawn": "^7.0.6", "signal-exit": "^4.0.1" } }, "sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw=="], + "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="], + + "form-data-encoder": ["form-data-encoder@1.7.2", "", {}, "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="], + + "formdata-node": ["formdata-node@4.4.1", "", { "dependencies": { "node-domexception": "1.0.0", "web-streams-polyfill": "4.0.0-beta.3" } }, "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ=="], + "formdata-polyfill": ["formdata-polyfill@4.0.10", "", { "dependencies": { "fetch-blob": "^3.1.2" } }, 
"sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g=="], "from2": ["from2@2.3.0", "", { "dependencies": { "inherits": "^2.0.1", "readable-stream": "^2.0.0" } }, "sha512-OMcX/4IC/uqEPVgGeyfN22LJk6AZrMkRZHxcHBMBvHScDGgwTm2GT2Wkgtocyd3JfZffjj2kYUDXXII0Fk9W0g=="], "fs-extra": ["fs-extra@11.3.3", "", { "dependencies": { "graceful-fs": "^4.2.0", "jsonfile": "^6.0.1", "universalify": "^2.0.0" } }, "sha512-VWSRii4t0AFm6ixFFmLLx1t7wS1gh+ckoa84aOeapGum0h+EZd1EhEumSB+ZdDLnEPuucsVB9oB7cxJHap6Afg=="], + "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], + "function-timeout": ["function-timeout@1.0.2", "", {}, "sha512-939eZS4gJ3htTHAldmyyuzlrD58P03fHG49v2JfFXbV6OhvZKRC9j2yAtdHw/zrp2zXHuv05zMIy40F0ge7spA=="], "gaxios": ["gaxios@7.1.3", "", { "dependencies": { "extend": "^3.0.2", "https-proxy-agent": "^7.0.1", "node-fetch": "^3.3.2", "rimraf": "^5.0.1" } }, "sha512-YGGyuEdVIjqxkxVH1pUTMY/XtmmsApXrCVv5EU25iX6inEPbV+VakJfLealkBtJN69AQmh1eGOdCl9Sm1UP6XQ=="], @@ -556,6 +607,10 @@ "get-east-asian-width": ["get-east-asian-width@1.4.0", "", {}, "sha512-QZjmEOC+IT1uk6Rx0sX22V6uHWVwbdbxf1faPqJ1QhLdGgsRGCZoyaQBm/piRdJy/D2um6hM1UP7ZEeQ4EkP+Q=="], + "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], + + "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="], + "get-stream": ["get-stream@6.0.1", "", {}, 
"sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg=="], "git-log-parser": ["git-log-parser@1.2.1", "", { "dependencies": { "argv-formatter": "~1.0.0", "spawn-error-forwarder": "~1.0.0", "split2": "~1.0.0", "stream-combiner2": "~1.1.1", "through2": "~2.0.0", "traverse": "0.6.8" } }, "sha512-PI+sPDvHXNPl5WNOErAK05s3j0lgwUzMN6o8cyQrDaKfT3qd7TmNJKeXX+SknI5I0QhG5fVPAEwSY4tRGDtYoQ=="], @@ -566,6 +621,8 @@ "google-logging-utils": ["google-logging-utils@1.1.3", "", {}, "sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA=="], + "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], + "graceful-fs": ["graceful-fs@4.2.11", "", {}, "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ=="], "gtoken": ["gtoken@8.0.0", "", { "dependencies": { "gaxios": "^7.0.0", "jws": "^4.0.0" } }, "sha512-+CqsMbHPiSTdtSO14O51eMNlrp9N79gmeqmXeouJOhfucAedHw9noVe/n5uJk3tbKE6a+6ZCQg3RPhVhHByAIw=="], @@ -574,6 +631,12 @@ "has-flag": ["has-flag@4.0.0", "", {}, "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ=="], + "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], + + "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="], + + "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], + "highlight.js": ["highlight.js@10.7.3", "", {}, "sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A=="], "hook-std": ["hook-std@4.0.0", "", {}, 
"sha512-IHI4bEVOt3vRUDJ+bFA9VUJlo7SzvFARPNLw75pqSmAOP2HmTWfFJtPvLBrDrlgjEYXY9zs7SFdHPQaJShkSCQ=="], @@ -586,6 +649,8 @@ "human-signals": ["human-signals@8.0.1", "", {}, "sha512-eKCa6bwnJhvxj14kZk5NCPc6Hb6BdsU9DZcOnmQKSnO1VKrfV0zCvtttPZUsBvjmNDn8rpcJfpwSYnHBjc95MQ=="], + "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="], + "husky": ["husky@9.1.7", "", { "bin": { "husky": "bin.js" } }, "sha512-5gs5ytaNjBrh5Ow3zrvdUUY+0VxIuWVL4i9irt6friV+BqdCfmV11CQTWMiBYWHbXhco+J1kHfTOUkePhCDvMA=="], "import-fresh": ["import-fresh@3.3.1", "", { "dependencies": { "parent-module": "^1.0.0", "resolve-from": "^4.0.0" } }, "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ=="], @@ -674,6 +739,8 @@ "marked-terminal": ["marked-terminal@7.3.0", "", { "dependencies": { "ansi-escapes": "^7.0.0", "ansi-regex": "^6.1.0", "chalk": "^5.4.1", "cli-highlight": "^2.1.11", "cli-table3": "^0.6.5", "node-emoji": "^2.2.0", "supports-hyperlinks": "^3.1.0" }, "peerDependencies": { "marked": ">=1 <16" } }, "sha512-t4rBvPsHc57uE/2nJOLmMbZCQ4tgAccAED3ngXQqW6g+TxA488JzJ+FK3lQkzBQOI1mRV/r/Kq+1ZlJ4D0owQw=="], + "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], + "meow": ["meow@13.2.0", "", {}, "sha512-pxQJQzB6djGPXh08dacEloMFopsOqGVRKFPYvPOt9XDZ1HasbgDZA74CJGreSU4G3Ak7EFJGoiH2auq+yXISgA=="], "merge-stream": ["merge-stream@2.0.0", "", {}, "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w=="], @@ -682,6 +749,10 @@ "mime": ["mime@4.1.0", "", { "bin": { "mime": "bin/cli.js" } }, "sha512-X5ju04+cAzsojXKes0B/S4tcYtFAJ6tTMuSPBEn9CPGlrWr8Fiw7qYeLT0XyH80HSoAoqWCaz+MWKh22P7G1cw=="], + "mime-db": ["mime-db@1.52.0", "", {}, 
"sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="], + + "mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="], + "mimic-fn": ["mimic-fn@2.1.0", "", {}, "sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg=="], "minimatch": ["minimatch@9.0.5", "", { "dependencies": { "brace-expansion": "^2.0.1" } }, "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow=="], @@ -702,7 +773,7 @@ "node-emoji": ["node-emoji@2.2.0", "", { "dependencies": { "@sindresorhus/is": "^4.6.0", "char-regex": "^1.0.2", "emojilib": "^2.4.0", "skin-tone": "^2.0.0" } }, "sha512-Z3lTE9pLaJF47NyMhd4ww1yFTAP8YhYI8SleJiHzM46Fgpm5cnNzSl9XfzFNqbaz+VlJrIj3fXQ4DeN1Rjm6cw=="], - "node-fetch": ["node-fetch@3.3.2", "", { "dependencies": { "data-uri-to-buffer": "^4.0.0", "fetch-blob": "^3.1.4", "formdata-polyfill": "^4.0.10" } }, "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA=="], + "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="], "normalize-package-data": ["normalize-package-data@8.0.0", "", { "dependencies": { "hosted-git-info": "^9.0.0", "semver": "^7.3.5", "validate-npm-package-license": "^3.0.4" } }, "sha512-RWk+PI433eESQ7ounYxIp67CYuVsS1uYSonX3kA6ps/3LWfjVQa/ptEg6Y3T6uAMq1mWpX9PQ+qx+QaHpsc7gQ=="], @@ -866,6 +937,8 @@ "to-regex-range": ["to-regex-range@5.0.1", "", { "dependencies": { "is-number": "^7.0.0" } }, "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ=="], + "tr46": ["tr46@0.0.3", "", {}, 
"sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="], + "traverse": ["traverse@0.6.8", "", {}, "sha512-aXJDbk6SnumuaZSANd21XAo15ucCDE38H4fkqiGsc3MhCK+wOlZvLP9cB/TvpHT0mOyWgC4Z8EwRlzqYSUzdsA=="], "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], @@ -898,10 +971,14 @@ "validate-npm-package-license": ["validate-npm-package-license@3.0.4", "", { "dependencies": { "spdx-correct": "^3.0.0", "spdx-expression-parse": "^3.0.0" } }, "sha512-DpKm2Ui/xN7/HQKCtpZxoRWBhZ9Z0kqtygG8XCgNQ8ZlDnxuQmWhj566j8fN4Cu3/JmbhsDo7fcAJq4s9h27Ew=="], - "web-streams-polyfill": ["web-streams-polyfill@3.3.3", "", {}, "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw=="], + "web-streams-polyfill": ["web-streams-polyfill@4.0.0-beta.3", "", {}, "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug=="], "web-worker": ["web-worker@1.2.0", "", {}, "sha512-PgF341avzqyx60neE9DD+XS26MMNMoUQRz9NOZwW32nPQrF6p77f1htcnjBSEV8BGMKZ16choqUG4hyI0Hx7mA=="], + "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="], + + "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="], + "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="], "wordwrap": ["wordwrap@1.0.0", "", {}, "sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q=="], @@ -926,6 +1003,8 @@ "@actions/http-client/undici": ["undici@5.29.0", "", { "dependencies": { "@fastify/busboy": "^2.0.0" } }, 
"sha512-raqeBD6NQK4SkWhQzeYKd1KmIG6dllBOTt55Rmkt4HtI9mwdWtJljnrXjAFUBLTSN67HWrOIZ3EPF4kjUw80Bg=="], + "@anthropic-ai/sdk/@types/node": ["@types/node@18.19.130", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg=="], + "@aws-crypto/sha1-browser/@smithy/util-utf8": ["@smithy/util-utf8@2.3.0", "", { "dependencies": { "@smithy/util-buffer-from": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A=="], "@aws-crypto/sha256-browser/@smithy/util-utf8": ["@smithy/util-utf8@2.3.0", "", { "dependencies": { "@smithy/util-buffer-from": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A=="], @@ -970,6 +1049,10 @@ "fdir/picomatch": ["picomatch@4.0.3", "", {}, "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q=="], + "fetch-blob/web-streams-polyfill": ["web-streams-polyfill@3.3.3", "", {}, "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw=="], + + "gaxios/node-fetch": ["node-fetch@3.3.2", "", { "dependencies": { "data-uri-to-buffer": "^4.0.0", "fetch-blob": "^3.1.4", "formdata-polyfill": "^4.0.10" } }, "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA=="], + "import-fresh/resolve-from": ["resolve-from@4.0.0", "", {}, "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g=="], "load-json-file/parse-json": ["parse-json@4.0.0", "", { "dependencies": { "error-ex": "^1.3.1", "json-parse-better-errors": "^1.0.1" } }, "sha512-aOIos8bujGN93/8Ox/jPLh7RwVnPEysynVFE+fQZyg6jKELEHwzgKdLRFHUgXJL6kylijVSBC4BvN9OmsB48Rw=="], @@ -1324,6 +1407,8 @@ "wrap-ansi-cjs/strip-ansi": ["strip-ansi@6.0.1", "", { "dependencies": { "ansi-regex": "^5.0.1" } }, 
"sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A=="], + "@anthropic-ai/sdk/@types/node/undici-types": ["undici-types@5.26.5", "", {}, "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="], + "@aws-crypto/sha1-browser/@smithy/util-utf8/@smithy/util-buffer-from": ["@smithy/util-buffer-from@2.2.0", "", { "dependencies": { "@smithy/is-array-buffer": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA=="], "@aws-crypto/sha256-browser/@smithy/util-utf8/@smithy/util-buffer-from": ["@smithy/util-buffer-from@2.2.0", "", { "dependencies": { "@smithy/is-array-buffer": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA=="], diff --git a/packages/shared/AGENTS.md b/packages/shared/AGENTS.md new file mode 120000 index 0000000..681311e --- /dev/null +++ b/packages/shared/AGENTS.md @@ -0,0 +1 @@ +CLAUDE.md \ No newline at end of file diff --git a/packages/shared/CLAUDE.md b/packages/shared/CLAUDE.md new file mode 100644 index 0000000..1c9697d --- /dev/null +++ b/packages/shared/CLAUDE.md @@ -0,0 +1,22 @@ +# @docx-corpus/shared + +Shared utilities used by all TypeScript packages. This is the foundation layer — every TS package depends on it. + +## What's here + +- **`db.ts`** — `DbClient` interface and `createDb()` factory. Uses `Bun.sql` (not pg/postgres.js). All pipeline stages (scrape, extract, embed, classify) read/write through this client. +- **`storage.ts`** — `Storage` interface with `createR2Storage()` and `createLocalStorage()`. Abstracts Cloudflare R2 vs local filesystem. +- **`ui.ts`** — Terminal formatting helpers (progress bars, headers, multi-line progress). +- **`index.ts`** — Barrel exports. + +## Key types + +- `DocumentRecord` — the full row from the `documents` table. Every pipeline stage adds columns to this. 
+- `DbClient` — interface with methods grouped by pipeline stage (scraping, extraction, embedding, classification). +- `LLMClassificationData` — `{ id, documentType, documentTopic, confidence, model }` for the classification pipeline. + +## When modifying + +- Adding a new pipeline stage? Add fields to `DocumentRecord`, add methods to `DbClient` interface AND the `createDb()` implementation. +- DB uses tagged template literals (`sql\`...\``) for parameterized queries. Use `sql.unsafe()` only when dynamic column names are needed. +- Don't add external dependencies — this package only depends on Bun built-ins and `@aws-sdk/client-s3`. diff --git a/packages/shared/db.ts b/packages/shared/db.ts index 61a66d0..97cb5ac 100644 --- a/packages/shared/db.ts +++ b/packages/shared/db.ts @@ -30,10 +30,16 @@ export interface DocumentRecord { embedding_model: string | null; embedding: number[] | null; - // Classification data + // Classification data (clustering) cluster_id: number | null; cluster_label: string | null; classified_at: string | null; + + // LLM classification data + document_type: string | null; + document_topic: string | null; + classification_confidence: number | null; + classification_model: string | null; } export interface ExtractionData { @@ -62,6 +68,14 @@ export interface ClassificationData { classified_at?: string; } +export interface LLMClassificationData { + id: string; + documentType: string; + documentTopic: string; + confidence: number; + model: string; +} + export interface DbClient { // Scraping methods (existing) upsertDocument(doc: Partial & { id: string }): Promise; @@ -82,18 +96,24 @@ export interface DbClient { // Embedding methods (new) updateEmbedding(data: EmbeddingData): Promise; + markEmbeddingSkipped(id: string, reason: string): Promise; getUnembeddedDocuments(limit: number): Promise; getEmbeddedDocuments(limit: number): Promise; getDocumentsWithEmbeddings(limit: number): Promise<{ id: string; embedding: number[] }[]>; - // Classification 
methods (new) + // Classification methods (clustering) updateClassification(data: ClassificationData): Promise; updateClassificationBatch(data: ClassificationData[]): Promise; getUnclassifiedDocuments(limit: number): Promise; + // LLM classification methods + updateLLMClassification(data: LLMClassificationData): Promise; + updateLLMClassificationBatch(ids: string[], data: Omit): Promise; + getLLMClassificationStats(): Promise<{ classified: number; pending: number; byType: Record }>; + // Stats getExtractionStats(): Promise<{ extracted: number; pending: number; errors: number }>; - getEmbeddingStats(): Promise<{ embedded: number; pending: number }>; + getEmbeddingStats(): Promise<{ embedded: number; pending: number; skipped: number }>; getClassificationStats(): Promise<{ classified: number; pending: number; clusters: number }>; close(): Promise; @@ -258,6 +278,15 @@ export async function createDb(databaseUrl: string): Promise { ); }, + async markEmbeddingSkipped(id: string, reason: string) { + await sql` + UPDATE documents SET + embedded_at = ${new Date().toISOString()}, + embedding_model = ${`skipped:${reason}`} + WHERE id = ${id} + `; + }, + async getUnembeddedDocuments(limit: number) { return sql` SELECT * FROM documents @@ -327,6 +356,59 @@ export async function createDb(databaseUrl: string): Promise { `; }, + // ==================== LLM Classification Methods ==================== + + async updateLLMClassification(data: LLMClassificationData) { + await sql` + UPDATE documents SET + document_type = ${data.documentType}, + document_topic = ${data.documentTopic}, + classification_confidence = ${data.confidence}, + classification_model = ${data.model} + WHERE id = ${data.id} + `; + }, + + async updateLLMClassificationBatch(ids: string[], data: Omit) { + for (const id of ids) { + await sql` + UPDATE documents SET + document_type = ${data.documentType}, + document_topic = ${data.documentTopic}, + classification_confidence = ${data.confidence}, + classification_model = 
${data.model} + WHERE id = ${id} + `; + } + }, + + async getLLMClassificationStats() { + const result = await sql<{ classified: number; pending: number }[]>` + SELECT + COUNT(*) FILTER (WHERE document_type IS NOT NULL)::int as classified, + COUNT(*) FILTER (WHERE extracted_at IS NOT NULL AND extraction_error IS NULL AND document_type IS NULL)::int as pending + FROM documents + `; + + const byTypeRows = await sql<{ type: string; count: number }[]>` + SELECT document_type as type, COUNT(*)::int as count + FROM documents + WHERE document_type IS NOT NULL + GROUP BY document_type + `; + + const byType: Record = {}; + for (const row of byTypeRows) { + byType[row.type] = row.count; + } + + return { + classified: result[0].classified, + pending: result[0].pending, + byType, + }; + }, + // ==================== Stats ==================== async getExtractionStats() { @@ -341,10 +423,11 @@ export async function createDb(databaseUrl: string): Promise { }, async getEmbeddingStats() { - const result = await sql<{ embedded: number; pending: number }[]>` + const result = await sql<{ embedded: number; pending: number; skipped: number }[]>` SELECT - COUNT(*) FILTER (WHERE embedded_at IS NOT NULL)::int as embedded, - COUNT(*) FILTER (WHERE extracted_at IS NOT NULL AND extraction_error IS NULL AND embedded_at IS NULL)::int as pending + COUNT(*) FILTER (WHERE embedded_at IS NOT NULL AND embedding_model NOT LIKE 'skipped:%')::int as embedded, + COUNT(*) FILTER (WHERE extracted_at IS NOT NULL AND extraction_error IS NULL AND embedded_at IS NULL)::int as pending, + COUNT(*) FILTER (WHERE embedding_model LIKE 'skipped:%')::int as skipped FROM documents `; return result[0]; diff --git a/packages/shared/index.ts b/packages/shared/index.ts index 2891613..cc0cd4c 100644 --- a/packages/shared/index.ts +++ b/packages/shared/index.ts @@ -29,4 +29,5 @@ export { type ExtractionData, type EmbeddingData, type ClassificationData, + type LLMClassificationData, } from "./db"; diff --git 
a/scripts/classification/AGENTS.md b/scripts/classification/AGENTS.md new file mode 120000 index 0000000..681311e --- /dev/null +++ b/scripts/classification/AGENTS.md @@ -0,0 +1 @@ +CLAUDE.md \ No newline at end of file diff --git a/scripts/classification/CLAUDE.md b/scripts/classification/CLAUDE.md new file mode 100644 index 0000000..9d9a0bf --- /dev/null +++ b/scripts/classification/CLAUDE.md @@ -0,0 +1,38 @@ +# Classification Pipeline + +Python ML pipeline that classifies ~800K .docx documents by **document type** (10 classes) and **topic** (9 classes). + +Uses the FineWeb-Edu pattern: LLM labels a small sample → train lightweight classifier → apply at scale. + +## Pipeline steps (run in order) + +1. **`sample.py`** — Stratified sampling from PostgreSQL. Samples proportionally across languages (en, ru, cs, pl, es), stratified by word count terciles and source domain diversity. +2. **`label.py`** — Async LLM labeling with Claude. Supports resume (appends to JSONL). Rate-limited with configurable parallelism. +3. **`train.py`** — Fine-tunes two independent ModernBERT classifiers (document_type and topic). Outputs models to `./models/`. +4. **`classify.py`** — Batch inference on the full corpus. Fetches text from R2, runs both models, writes results to PostgreSQL. +5. **`evaluate.py`** — Quality metrics. Two modes: `labels` (analyzes JSONL) and `corpus` (queries DB). + +## Key files + +- **`taxonomy.json`** — Single source of truth for the 2D taxonomy (10 document types × 9 topics). Both prompt building and model training reference this. +- **`common.py`** — Shared utilities: DB connection (`psycopg2`), text fetching from `https://docxcorp.us/extracted/`, taxonomy loading. +- **`pyproject.toml`** — Python dependencies. Install with `pip install -e .` or `uv pip install -e .`. + +## Database + +Writes to the same `documents` table as the TS pipeline: +- `document_type` — one of 10 types (legal, forms, reports, etc.) 
+- `document_topic` — one of 9 topics (government, education, healthcare, etc.) +- `classification_confidence` — min(type_confidence, topic_confidence) +- `classification_model` — e.g. "claude-haiku-4-5" or "modernbert-v2.0.0" + +Connection via `DATABASE_URL` env var loaded from `../../.env`. + +## Conventions + +- Python 3.11+, no type stubs needed +- Uses `psycopg2` for DB (not Bun.sql — this is Python) +- Uses `python-dotenv` to load `.env` from project root +- Text is fetched via HTTP from the public R2 endpoint, not direct R2 access +- All scripts support `--help` for usage +- JSONL files are the interchange format between steps diff --git a/scripts/classification/README.md b/scripts/classification/README.md new file mode 100644 index 0000000..bca3f5b --- /dev/null +++ b/scripts/classification/README.md @@ -0,0 +1,81 @@ +# Document Classification Pipeline + +Classifies ~800K .docx documents using the FineWeb-Edu / TnT-LLM pattern: +LLM labels a small sample → train ModernBERT classifier → apply at scale. + +## Two-Dimensional Taxonomy + +Each document gets classified on two independent dimensions: + +- **Document Type** (10 classes): legal, forms, reports, policies, educational, correspondence, technical, administrative, creative, reference +- **Topic** (9 classes): government, education, healthcare, finance, legal_judicial, technology, environment, nonprofit, general + +## Pipeline Steps + +### 1. Sample (`sample.py`) + +Stratified sampling across languages, word count, and source domains. + +```bash +python sample.py --total 3500 --output sampled_docs.jsonl +``` + +### 2. Label (`label.py`) + +LLM classification with Claude. Supports resume — safe to interrupt and restart. + +```bash +python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl +python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl --model claude-haiku-4-5 --parallel 5 +``` + +### 3. Evaluate Labels (`evaluate.py labels`) + +Check label quality before training. 
+ +```bash +python evaluate.py labels --input labeled_docs.jsonl +``` + +### 4. Train (`train.py`) + +Fine-tune ModernBERT on labeled data. Trains two independent classifiers. + +```bash +python train.py --input labeled_docs.jsonl +python train.py --input labeled_docs.jsonl --epochs 5 --lr 2e-5 --output-dir ./models +``` + +### 5. Classify (`classify.py`) + +Apply trained models to the full corpus. + +```bash +python classify.py --models-dir ./models +python classify.py --models-dir ./models --batch-size 256 --dry-run --limit 100 +``` + +### 6. Evaluate Corpus (`evaluate.py corpus`) + +Check full corpus classification distribution. + +```bash +python evaluate.py corpus +python evaluate.py corpus --languages en,ru +``` + +## Setup + +```bash +pip install -e . +``` + +Required environment variables (`.env` in project root): +- `DATABASE_URL` — PostgreSQL connection string +- `ANTHROPIC_API_KEY` — For LLM labeling step only + +## Cost Estimate + +- **Labeling**: ~3,500 docs × Claude Haiku ≈ $2-5 +- **Training**: ~30 min on GPU (or ~2h on CPU) +- **Inference**: ~800K docs, ~200-500 docs/sec on GPU diff --git a/scripts/classification/classify.py b/scripts/classification/classify.py new file mode 100644 index 0000000..411e743 --- /dev/null +++ b/scripts/classification/classify.py @@ -0,0 +1,384 @@ +#!/usr/bin/env python3 +""" +Phase 4: Apply trained classifiers to the full corpus. + +Loads the trained ModernBERT models and classifies all unclassified documents. +Fetches text from R2, runs inference, updates the database. + +Supports resume — already-classified documents are skipped. 
+ +Usage: + python classify.py --models-dir ./models + python classify.py --models-dir ./models --batch-size 256 --languages en,ru,cs,pl,es + python classify.py --models-dir ./models --dry-run --limit 100 +""" + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +import numpy as np +import torch +from tqdm import tqdm +from transformers import AutoModelForSequenceClassification, AutoTokenizer + +from common import ( + fetch_documents_text_parallel, + get_db_connection, + load_taxonomy, + save_labels_to_db, +) + +DEFAULT_BATCH_SIZE = 128 +DEFAULT_MAX_LENGTH = 512 +DEFAULT_MAX_CHARS = 2000 + + +def load_classifier(model_dir: str, device: torch.device): + """Load a trained classifier and tokenizer.""" + tokenizer = AutoTokenizer.from_pretrained(model_dir) + model = AutoModelForSequenceClassification.from_pretrained(model_dir) + model.to(device) + model.eval() + return tokenizer, model + + +def get_unclassified_documents( + languages: list[str] | None = None, + limit: int | None = None, +) -> list[dict]: + """Fetch documents that haven't been classified yet.""" + conn = get_db_connection() + try: + with conn.cursor() as cur: + query = """ + SELECT id, source_url, original_filename, word_count, language + FROM documents + WHERE extracted_at IS NOT NULL + AND extraction_error IS NULL + AND word_count > 0 + AND classification_model IS NULL + """ + params: list = [] + + if languages: + placeholders = ",".join(["%s"] * len(languages)) + query += f" AND language IN ({placeholders})" + params.extend(languages) + + query += " ORDER BY random()" + + if limit: + query += " LIMIT %s" + params.append(limit) + + cur.execute(query, params) + return [ + { + "id": row[0], + "source_url": row[1], + "original_filename": row[2], + "word_count": row[3], + "language": row[4], + } + for row in cur.fetchall() + ] + finally: + conn.close() + + +def get_classification_stats() -> dict: + """Get current classification progress.""" + conn = 
get_db_connection() + try: + with conn.cursor() as cur: + cur.execute(""" + SELECT + COUNT(*)::int as total, + COUNT(CASE WHEN classification_model IS NOT NULL THEN 1 END)::int as classified, + COUNT(CASE WHEN extracted_at IS NOT NULL AND extraction_error IS NULL AND word_count > 0 THEN 1 END)::int as classifiable + FROM documents + """) + row = cur.fetchone() + return { + "total": row[0], + "classified": row[1], + "classifiable": row[2], + "remaining": row[2] - row[1], + } + finally: + conn.close() + + +@torch.no_grad() +def classify_batch( + texts: list[str], + tokenizer, + model, + max_length: int, + device: torch.device, +) -> list[tuple[str, float]]: + """Classify a batch of texts. Returns list of (label, confidence).""" + inputs = tokenizer( + texts, + truncation=True, + max_length=max_length, + padding=True, + return_tensors="pt", + ).to(device) + + outputs = model(**inputs) + probs = torch.softmax(outputs.logits, dim=-1) + confidences, pred_ids = torch.max(probs, dim=-1) + + results = [] + for pred_id, conf in zip(pred_ids.cpu().numpy(), confidences.cpu().numpy()): + label = model.config.id2label[int(pred_id)] + results.append((label, float(conf))) + + return results + + +def process_batch( + docs: list[dict], + type_tokenizer, + type_model, + topic_tokenizer, + topic_model, + max_length: int, + max_chars: int, + device: torch.device, + model_name: str, +) -> list[dict]: + """Process a batch: fetch texts, classify, return label dicts.""" + # Fetch texts + doc_ids = [d["id"] for d in docs] + texts = fetch_documents_text_parallel(doc_ids, max_chars=max_chars) + + # Filter docs with text + valid_docs = [] + valid_texts = [] + for doc in docs: + text = texts.get(doc["id"], "") + if text: + valid_docs.append(doc) + valid_texts.append(text) + + if not valid_texts: + return [] + + # Classify with both models + type_results = classify_batch( + valid_texts, type_tokenizer, type_model, max_length, device + ) + topic_results = classify_batch( + valid_texts, 
topic_tokenizer, topic_model, max_length, device + ) + + # Build label dicts for DB update + labels = [] + for doc, (doc_type, type_conf), (topic, topic_conf) in zip( + valid_docs, type_results, topic_results + ): + labels.append( + { + "id": doc["id"], + "document_type": doc_type, + "document_topic": topic, + "confidence": min(type_conf, topic_conf), + "model": model_name, + } + ) + + return labels + + +def main(): + parser = argparse.ArgumentParser( + description="Classify full corpus with trained ModernBERT models" + ) + parser.add_argument( + "--models-dir", + type=str, + required=True, + help="Directory containing trained models (from train.py)", + ) + parser.add_argument( + "--batch-size", + type=int, + default=DEFAULT_BATCH_SIZE, + help=f"Inference batch size (default: {DEFAULT_BATCH_SIZE})", + ) + parser.add_argument( + "--max-length", + type=int, + default=DEFAULT_MAX_LENGTH, + help=f"Max token length (default: {DEFAULT_MAX_LENGTH})", + ) + parser.add_argument( + "--max-chars", + type=int, + default=DEFAULT_MAX_CHARS, + help=f"Max text characters to fetch (default: {DEFAULT_MAX_CHARS})", + ) + parser.add_argument( + "--languages", + type=str, + default=None, + help="Comma-separated language codes to classify (default: all)", + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Max documents to classify (default: all)", + ) + parser.add_argument( + "--db-batch-size", + type=int, + default=500, + help="DB update batch size (default: 500)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Classify but don't write to DB", + ) + args = parser.parse_args() + + # Validate model directories + type_model_dir = os.path.join(args.models_dir, "document_type", "best") + topic_model_dir = os.path.join(args.models_dir, "topic", "best") + + for d in [type_model_dir, topic_model_dir]: + if not os.path.exists(d): + print(f"ERROR: Model directory not found: {d}") + sys.exit(1) + + # Load training config for model name + 
config_path = os.path.join(args.models_dir, "training_config.json") + if os.path.exists(config_path): + with open(config_path) as f: + train_config = json.load(f) + model_name = f"modernbert-{train_config.get('taxonomy_version', 'v2')}" + else: + model_name = "modernbert-v2" + + # Device + if torch.cuda.is_available(): + device = torch.device("cuda") + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + print(f"Device: {device}") + + # Load models + print(f"\nLoading document_type model from {type_model_dir}...") + type_tokenizer, type_model = load_classifier(type_model_dir, device) + print(f"Loading topic model from {topic_model_dir}...") + topic_tokenizer, topic_model = load_classifier(topic_model_dir, device) + + # Stats + stats = get_classification_stats() + print(f"\nCorpus stats:") + print(f" Total documents: {stats['total']:,}") + print(f" Classifiable: {stats['classifiable']:,}") + print(f" Already classified: {stats['classified']:,}") + print(f" Remaining: {stats['remaining']:,}") + + # Get unclassified docs + languages = ( + [l.strip() for l in args.languages.split(",")] + if args.languages + else None + ) + print(f"\nFetching unclassified documents...") + docs = get_unclassified_documents(languages=languages, limit=args.limit) + print(f" Found {len(docs):,} documents to classify") + + if not docs: + print("Nothing to classify!") + return + + if args.dry_run: + print(" (DRY RUN — will not write to database)") + + # Process in batches + total_classified = 0 + total_errors = 0 + start_time = time.time() + + # Use smaller fetch batches for text retrieval + fetch_batch_size = min(args.batch_size, 100) + + pbar = tqdm(total=len(docs), desc="Classifying", unit="doc") + + for i in range(0, len(docs), fetch_batch_size): + batch_docs = docs[i : i + fetch_batch_size] + + labels = process_batch( + docs=batch_docs, + type_tokenizer=type_tokenizer, + type_model=type_model, + 
topic_tokenizer=topic_tokenizer, + topic_model=topic_model, + max_length=args.max_length, + max_chars=args.max_chars, + device=device, + model_name=model_name, + ) + + if labels and not args.dry_run: + save_labels_to_db(labels, batch_size=args.db_batch_size) + + total_classified += len(labels) + total_errors += len(batch_docs) - len(labels) + pbar.update(len(batch_docs)) + + # Show throughput + elapsed = time.time() - start_time + rate = total_classified / elapsed if elapsed > 0 else 0 + pbar.set_postfix_str(f"{rate:.0f} docs/s, {total_errors} errors") + + pbar.close() + + elapsed = time.time() - start_time + rate = total_classified / elapsed if elapsed > 0 else 0 + + print(f"\n{'=' * 60}") + print("Classification Complete") + print(f"{'=' * 60}") + print(f" Classified: {total_classified:,}") + print(f" Errors (no text): {total_errors:,}") + print(f" Time: {elapsed:.1f}s ({rate:.0f} docs/s)") + print(f" Model: {model_name}") + + if not args.dry_run: + final_stats = get_classification_stats() + print(f"\n Total classified in DB: {final_stats['classified']:,}") + print(f" Remaining: {final_stats['remaining']:,}") + + if args.dry_run: + print("\n (DRY RUN — no changes written to database)") + + # Print distribution of this batch + if total_classified > 0 and labels: + print(f"\nSample distribution (last batch):") + type_counts: dict[str, int] = {} + topic_counts: dict[str, int] = {} + for label in labels: + dt = label["document_type"] + tp = label["document_topic"] + type_counts[dt] = type_counts.get(dt, 0) + 1 + topic_counts[tp] = topic_counts.get(tp, 0) + 1 + + print(" Types:", dict(sorted(type_counts.items(), key=lambda x: -x[1]))) + print(" Topics:", dict(sorted(topic_counts.items(), key=lambda x: -x[1]))) + + +if __name__ == "__main__": + main() diff --git a/scripts/classification/common.py b/scripts/classification/common.py new file mode 100644 index 0000000..a83abdc --- /dev/null +++ b/scripts/classification/common.py @@ -0,0 +1,130 @@ +""" +Shared utilities 
for the classification pipeline. +DB connection, text fetching, and common helpers. +""" + +import json +import os +import urllib.error +import urllib.request +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import Optional + +import psycopg2 +from dotenv import load_dotenv + +# Load .env from project root +env_path = Path(__file__).parent.parent.parent / ".env" +load_dotenv(env_path) + +TEXT_BASE_URL = "https://docxcorp.us/extracted" + + +def get_db_connection(): + """Create a connection to the PostgreSQL database.""" + database_url = os.getenv("DATABASE_URL") + if not database_url: + raise ValueError("DATABASE_URL environment variable not set") + return psycopg2.connect(database_url) + + +def load_taxonomy(path: Optional[str] = None) -> dict: + """Load taxonomy from JSON file.""" + if path is None: + path = Path(__file__).parent / "taxonomy.json" + with open(path) as f: + return json.load(f) + + +def fetch_document_text(doc_id: str, max_chars: int = 2000) -> str: + """Fetch extracted text for a document from public URL.""" + url = f"{TEXT_BASE_URL}/{doc_id}.txt" + try: + req = urllib.request.Request(url, headers={"User-Agent": "docx-classifier/2.0"}) + with urllib.request.urlopen(req, timeout=15) as response: + text = response.read().decode("utf-8") + return text[:max_chars] + except urllib.error.HTTPError as e: + if e.code == 404: + return "" + return "" + except Exception: + return "" + + +def fetch_documents_text_parallel( + doc_ids: list[str], max_chars: int = 2000, max_workers: int = 20 +) -> dict[str, str]: + """Fetch text for multiple documents in parallel.""" + results = {} + + def fetch_one(doc_id: str) -> tuple[str, str]: + return doc_id, fetch_document_text(doc_id, max_chars) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + for doc_id, text in executor.map(fetch_one, doc_ids): + results[doc_id] = text + + return results + + +def get_extraction_stats_by_language() -> list[dict]: + """Get 
document counts per language for extracted docs.""" + conn = get_db_connection() + try: + with conn.cursor() as cur: + cur.execute(""" + SELECT + language, + COUNT(*)::int as count, + ROUND(AVG(word_count))::int as avg_words + FROM documents + WHERE extracted_at IS NOT NULL + AND extraction_error IS NULL + AND language IS NOT NULL + ORDER BY count DESC + """) + return [ + {"language": row[0], "count": row[1], "avg_words": row[2]} + for row in cur.fetchall() + ] + finally: + conn.close() + + +def save_labels_to_db(labels: list[dict], batch_size: int = 500) -> int: + """ + Save classification labels to the database. + + Each label dict: {id, document_type, document_topic, confidence, model} + """ + conn = get_db_connection() + total = 0 + try: + with conn.cursor() as cur: + for i in range(0, len(labels), batch_size): + batch = labels[i : i + batch_size] + for label in batch: + cur.execute( + """ + UPDATE documents SET + document_type = %s, + document_topic = %s, + classification_confidence = %s, + classification_model = %s + WHERE id = %s + """, + ( + label["document_type"], + label["document_topic"], + label["confidence"], + label["model"], + label["id"], + ), + ) + total += 1 + conn.commit() + finally: + conn.close() + return total diff --git a/scripts/classification/evaluate.py b/scripts/classification/evaluate.py new file mode 100644 index 0000000..e229dcf --- /dev/null +++ b/scripts/classification/evaluate.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +""" +Evaluate classification quality. + +Can evaluate: + 1. LLM labels (from label.py) — confidence distribution, class balance + 2. Trained models (from train.py) — val set metrics + 3. 
Full corpus (from classify.py) — distribution analysis from DB + +Usage: + python evaluate.py labels --input labeled_docs.jsonl + python evaluate.py corpus + python evaluate.py corpus --languages en,ru +""" + +import argparse +import json +import os +import sys + +from common import get_db_connection, load_taxonomy + + +def evaluate_labels(input_path: str, taxonomy: dict): + """Evaluate LLM-labeled data: distribution, confidence, quality signals.""" + docs = [] + with open(input_path) as f: + for line in f: + if line.strip(): + docs.append(json.loads(line)) + + print(f"\n{'=' * 60}") + print(f"LLM Label Evaluation ({len(docs)} documents)") + print(f"{'=' * 60}") + + # Class distribution - document types + type_counts: dict[str, int] = {} + topic_counts: dict[str, int] = {} + lang_counts: dict[str, int] = {} + + type_confs: dict[str, list[float]] = {} + topic_confs: dict[str, list[float]] = {} + + for doc in docs: + dt = doc.get("document_type", "unknown") + tp = doc.get("document_topic", "unknown") + lang = doc.get("language", "unknown") + + type_counts[dt] = type_counts.get(dt, 0) + 1 + topic_counts[tp] = topic_counts.get(tp, 0) + 1 + lang_counts[lang] = lang_counts.get(lang, 0) + 1 + + type_confs.setdefault(dt, []).append(doc.get("type_confidence", 0)) + topic_confs.setdefault(tp, []).append(doc.get("topic_confidence", 0)) + + # Document type distribution + print("\nDocument Type Distribution:") + print(f" {'Type':<20s} {'Count':>6s} {'%':>7s} {'Avg Conf':>10s}") + print(f" {'-' * 45}") + for t, c in sorted(type_counts.items(), key=lambda x: -x[1]): + pct = 100 * c / len(docs) + avg_conf = sum(type_confs[t]) / len(type_confs[t]) if type_confs.get(t) else 0 + print(f" {t:<20s} {c:>6d} {pct:>6.1f}% {avg_conf:>9.3f}") + + # Topic distribution + print("\nTopic Distribution:") + print(f" {'Topic':<20s} {'Count':>6s} {'%':>7s} {'Avg Conf':>10s}") + print(f" {'-' * 45}") + for t, c in sorted(topic_counts.items(), key=lambda x: -x[1]): + pct = 100 * c / len(docs) + 
def evaluate_corpus(languages: list[str] | None = None) -> None:
    """Print classification statistics for the corpus stored in the database.

    Reports overall extraction/classification counts, then (optionally
    restricted to ``languages``) the document-type, topic, language,
    confidence and model distributions of classified documents.

    Args:
        languages: Language codes to restrict the per-distribution queries to.
            ``None`` or an empty list means no language filter.

    Returns:
        None. All output is printed to stdout.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            # Overall stats
            cur.execute("""
                SELECT
                    COUNT(*)::int as total,
                    COUNT(CASE WHEN classification_model IS NOT NULL THEN 1 END)::int as classified,
                    COUNT(CASE WHEN extracted_at IS NOT NULL AND extraction_error IS NULL THEN 1 END)::int as extracted
                FROM documents
            """)
            row = cur.fetchone()
            total, classified, extracted = row

            print(f"\n{'=' * 60}")
            print(f"Corpus Classification Status")
            print(f"{'=' * 60}")
            print(f" Total documents: {total:,}")
            print(f" Extracted: {extracted:,}")
            # max(..., 1) avoids dividing by zero when nothing is extracted yet.
            print(f" Classified: {classified:,} ({100 * classified / max(extracted, 1):.1f}% of extracted)")

            if classified == 0:
                print("\n No classified documents yet.")
                return

            # Build language filter. Only "%s" placeholders are interpolated
            # into the SQL text; the language codes travel as bound parameters.
            lang_filter = ""
            params: list = []
            if languages:
                placeholders = ",".join(["%s"] * len(languages))
                lang_filter = f"AND language IN ({placeholders})"
                params = list(languages)

            # Document type distribution
            cur.execute(
                f"""
                SELECT document_type, COUNT(*)::int as count,
                       ROUND(AVG(classification_confidence)::numeric, 3) as avg_conf
                FROM documents
                WHERE classification_model IS NOT NULL {lang_filter}
                GROUP BY document_type
                ORDER BY count DESC
                """,
                params,
            )
            rows = cur.fetchall()

            scope = f" (languages: {','.join(languages)})" if languages else ""
            row_total = sum(r[1] for r in rows)

            # The corpus has classified documents, but none may match the
            # language filter. Every percentage below divides by row_total,
            # so stop here instead of raising ZeroDivisionError.
            if row_total == 0:
                print(f"\n No classified documents match the filter{scope}.")
                return

            print(f"\nDocument Type Distribution{scope}:")
            print(f" {'Type':<20s} {'Count':>8s} {'%':>7s} {'Avg Conf':>10s}")
            print(f" {'-' * 47}")
            for dt, count, avg_conf in rows:
                pct = 100 * count / row_total
                print(f" {dt or 'null':<20s} {count:>8,d} {pct:>6.1f}% {avg_conf or 0:>9.3f}")

            # Topic distribution (same WHERE clause, so row_total still applies)
            cur.execute(
                f"""
                SELECT document_topic, COUNT(*)::int as count,
                       ROUND(AVG(classification_confidence)::numeric, 3) as avg_conf
                FROM documents
                WHERE classification_model IS NOT NULL {lang_filter}
                GROUP BY document_topic
                ORDER BY count DESC
                """,
                params,
            )
            rows = cur.fetchall()

            print(f"\nTopic Distribution{scope}:")
            print(f" {'Topic':<20s} {'Count':>8s} {'%':>7s} {'Avg Conf':>10s}")
            print(f" {'-' * 47}")
            for tp, count, avg_conf in rows:
                pct = 100 * count / row_total
                print(f" {tp or 'null':<20s} {count:>8,d} {pct:>6.1f}% {avg_conf or 0:>9.3f}")

            # By language
            cur.execute(
                f"""
                SELECT language, COUNT(*)::int as count
                FROM documents
                WHERE classification_model IS NOT NULL {lang_filter}
                GROUP BY language
                ORDER BY count DESC
                LIMIT 20
                """,
                params,
            )
            rows = cur.fetchall()

            print(f"\nBy Language (top 20):")
            for lang, count in rows:
                pct = 100 * count / row_total
                print(f" {lang or '?':<5s}: {count:>8,d} ({pct:.1f}%)")

            # Confidence distribution
            cur.execute(
                f"""
                SELECT
                    COUNT(CASE WHEN classification_confidence >= 0.8 THEN 1 END)::int as high,
                    COUNT(CASE WHEN classification_confidence >= 0.5 AND classification_confidence < 0.8 THEN 1 END)::int as med,
                    COUNT(CASE WHEN classification_confidence < 0.5 THEN 1 END)::int as low,
                    ROUND(AVG(classification_confidence)::numeric, 3) as avg
                FROM documents
                WHERE classification_model IS NOT NULL {lang_filter}
                """,
                params,
            )
            row = cur.fetchone()
            print(f"\nConfidence Distribution:")
            print(f" Mean: {row[3]}")
            print(f" High (>=0.8): {row[0]:>8,d} ({100 * row[0] / row_total:.1f}%)")
            print(f" Medium (0.5-0.8): {row[1]:>8,d} ({100 * row[1] / row_total:.1f}%)")
            print(f" Low (<0.5): {row[2]:>8,d} ({100 * row[2] / row_total:.1f}%)")

            # Classification model used
            cur.execute(
                f"""
                SELECT classification_model, COUNT(*)::int
                FROM documents
                WHERE classification_model IS NOT NULL {lang_filter}
                GROUP BY classification_model
                """,
                params,
            )
            rows = cur.fetchall()
            print(f"\nModels Used:")
            for model, count in rows:
                print(f" {model}: {count:,}")

    finally:
        # Runs on the early returns above as well.
        conn.close()
argparse.ArgumentParser(description="Evaluate classification quality") + subparsers = parser.add_subparsers(dest="command", help="Evaluation mode") + + # Labels subcommand + labels_parser = subparsers.add_parser( + "labels", help="Evaluate LLM-labeled data" + ) + labels_parser.add_argument( + "--input", type=str, required=True, help="Labeled JSONL file" + ) + + # Corpus subcommand + corpus_parser = subparsers.add_parser( + "corpus", help="Evaluate corpus classification from DB" + ) + corpus_parser.add_argument( + "--languages", + type=str, + default=None, + help="Comma-separated language codes to filter", + ) + + args = parser.parse_args() + + if not args.command: + parser.print_help() + sys.exit(1) + + taxonomy = load_taxonomy() + + if args.command == "labels": + if not os.path.exists(args.input): + print(f"ERROR: File not found: {args.input}") + sys.exit(1) + evaluate_labels(args.input, taxonomy) + + elif args.command == "corpus": + languages = ( + [l.strip() for l in args.languages.split(",")] + if args.languages + else None + ) + evaluate_corpus(languages=languages) + + +if __name__ == "__main__": + main() diff --git a/scripts/classification/label.py b/scripts/classification/label.py new file mode 100644 index 0000000..8eb0575 --- /dev/null +++ b/scripts/classification/label.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python3 +""" +Phase 2, Step 2: LLM-label sampled documents using Claude. + +Reads a JSONL of sampled documents, fetches their text, sends to Claude +for classification, and saves labeled results. + +Supports resume — already-labeled documents are skipped. 
+ +Usage: + python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl + python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl --model claude-haiku-4-5 + python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl --parallel 5 +""" + +import argparse +import asyncio +import json +import os +import sys +import time +from pathlib import Path + +import anthropic +from tqdm import tqdm + +from common import fetch_documents_text_parallel, load_taxonomy + +# Rate limiting defaults +DEFAULT_PARALLEL = 5 +DEFAULT_RPM = 50 +DEFAULT_DELAY = 1.5 + + +def build_classification_prompt(taxonomy: dict, text: str, filename: str | None) -> str: + """Build the prompt for Claude to classify a single document.""" + doc_types = "\n".join( + f" - **{t['id']}**: {t['label']} — {t['description']}" + for t in taxonomy["document_types"] + ) + topics = "\n".join( + f" - **{t['id']}**: {t['label']} — {t['description']}" + for t in taxonomy["topics"] + ) + + filename_line = f"\n- Filename: {filename}" if filename else "" + + return f"""Classify this document along two independent dimensions. + +## Dimension 1: Document Type (what kind of document is this?) +{doc_types} + +## Dimension 2: Topic (what subject domain does it belong to?) +{topics} + +## Document{filename_line} +\"\"\" +{text} +\"\"\" + +## Instructions +1. Read the document text carefully +2. Choose the BEST matching document_type from Dimension 1 +3. Choose the BEST matching topic from Dimension 2 +4. These are INDEPENDENT — a "legal" type document can have any topic, and vice versa +5. 
def parse_llm_response(text: str, taxonomy: dict) -> dict:
    """Parse and validate an LLM classification response.

    The first ``{`` through the last ``}`` in ``text`` is decoded as JSON and
    expected to look like
    ``{"document_type": {"id", "confidence"}, "topic": {"id", "confidence"},
    "reasoning": ...}``.

    Args:
        text: Raw text returned by the model.
        taxonomy: Taxonomy dict with ``document_types`` and ``topics`` lists
            whose entries carry an ``id``.

    Returns:
        A dict with ``document_type``, ``document_topic``, ``type_confidence``,
        ``topic_confidence``, ``confidence`` (the smaller of the two) and
        ``reasoning``. Labels missing from the taxonomy are replaced by
        ``"general"`` when the dimension defines it, otherwise by the first id
        in taxonomy order. Confidences are bounded to ``[0.0, 1.0]``. When the
        response cannot be parsed, a zero-confidence record is returned whose
        ``reasoning`` is ``"Failed to parse LLM response"``.
    """
    import math
    import re

    type_ids = [t["id"] for t in taxonomy["document_types"]]
    topic_ids = [t["id"] for t in taxonomy["topics"]]

    def resolve(label, ordered_ids):
        # Set membership raises TypeError for unhashable labels (e.g. a nested
        # object), which is treated as a parse failure below.
        if label in set(ordered_ids):
            return label
        if "general" in ordered_ids:
            return "general"
        # Deterministic fallback: taxonomy order, not set iteration order
        # (which varies between interpreter runs).
        return ordered_ids[0]

    def bounded_confidence(section) -> float:
        value = float(section["confidence"])
        if math.isnan(value):
            # json.loads accepts the NaN literal; it is not a usable score.
            raise ValueError("confidence is NaN")
        return max(0.0, min(1.0, value))

    try:
        json_match = re.search(r"\{[\s\S]*\}", text)
        if json_match:
            result = json.loads(json_match.group())
            doc_type = resolve(result["document_type"]["id"], type_ids)
            topic = resolve(result["topic"]["id"], topic_ids)
            type_confidence = bounded_confidence(result["document_type"])
            topic_confidence = bounded_confidence(result["topic"])

            return {
                "document_type": doc_type,
                "document_topic": topic,
                "type_confidence": type_confidence,
                "topic_confidence": topic_confidence,
                "confidence": min(type_confidence, topic_confidence),
                "reasoning": result.get("reasoning", ""),
            }
    except (json.JSONDecodeError, KeyError, TypeError, ValueError):
        # Malformed or incomplete answer: fall through to the failure record.
        pass

    # The reasoning text is matched verbatim elsewhere to count parse
    # failures, so it must not change.
    return {
        "document_type": "general",
        "document_topic": "general",
        "type_confidence": 0.0,
        "topic_confidence": 0.0,
        "confidence": 0.0,
        "reasoning": "Failed to parse LLM response",
    }
async def label_documents(
    docs: list[dict],
    taxonomy: dict,
    output_path: str,
    model: str,
    max_parallel: int,
    rpm: int,
    delay: float,
) -> dict[str, dict]:
    """Label all documents with Claude, with rate limiting and resume.

    Documents whose ``id`` already appears in ``output_path`` are skipped.
    Each new result is appended to ``output_path`` as soon as it is produced,
    so an interrupted run can be resumed.

    Args:
        docs: Sampled documents; each needs an ``id``.
        taxonomy: Taxonomy used to build the prompt and validate answers.
        output_path: JSONL file that results are appended to.
        model: Claude model name.
        max_parallel: Maximum number of in-flight API requests.
        rpm: Requests-per-minute ceiling.
        delay: Minimum spacing between requests in seconds; the larger of this
            and ``60 / rpm`` is used.

    Returns:
        Mapping of document id to labeled record, covering previously stored
        results and the ones produced by this call.

    Exits the process with status 1 when ``ANTHROPIC_API_KEY`` is not set.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("ERROR: ANTHROPIC_API_KEY environment variable not set")
        sys.exit(1)

    client = anthropic.AsyncAnthropic(api_key=api_key)
    semaphore = asyncio.Semaphore(max_parallel)

    # Load existing results for resume
    existing = load_existing_results(output_path)
    remaining = [d for d in docs if d["id"] not in existing]

    if existing:
        print(f" Resuming: {len(existing)} already labeled, {len(remaining)} remaining")

    if not remaining:
        print(" All documents already labeled!")
        return existing

    # Prefetch text for all remaining documents
    print(f" Fetching text for {len(remaining)} documents...")
    doc_ids = [d["id"] for d in remaining]

    # Fetch in batches to avoid overwhelming the server
    all_texts = {}
    batch_size = 100
    for i in range(0, len(doc_ids), batch_size):
        batch = doc_ids[i : i + batch_size]
        texts = fetch_documents_text_parallel(batch, max_chars=2000)
        all_texts.update(texts)
        if i + batch_size < len(doc_ids):
            print(f" Fetched text: {min(i + batch_size, len(doc_ids))}/{len(doc_ids)}")

    print(f" Text fetched. Starting LLM classification with {model}...")

    # Rate limiting: requests are spaced by max(60 / rpm, delay) seconds.
    interval = 60.0 / rpm
    last_request = [0.0]
    lock = asyncio.Lock()

    async def rate_limit():
        # The lock is held while sleeping on purpose: it serialises callers so
        # consecutive requests are at least one interval apart.
        async with lock:
            now = time.time()
            wait = last_request[0] + max(interval, delay) - now
            if wait > 0:
                await asyncio.sleep(wait)
            last_request[0] = time.time()

    # One-element lists act as counters the nested coroutine can mutate.
    completed = [0]
    errors = [0]
    pbar = tqdm(total=len(remaining), desc="Labeling", unit="doc")

    async def classify_one(doc: dict) -> dict | None:
        text = all_texts.get(doc["id"], "")
        if not text:
            # No text available: count as an error and move on.
            errors[0] += 1
            pbar.update(1)
            return None

        prompt = build_classification_prompt(taxonomy, text, doc.get("original_filename"))

        max_retries = 5
        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            await rate_limit()
            async with semaphore:
                try:
                    response = await client.messages.create(
                        model=model,
                        max_tokens=256,
                        messages=[{"role": "user", "content": prompt}],
                    )
                    classification = parse_llm_response(
                        response.content[0].text, taxonomy
                    )

                    result = {
                        "id": doc["id"],
                        "language": doc.get("language"),
                        "word_count": doc.get("word_count"),
                        "original_filename": doc.get("original_filename"),
                        "source_url": doc.get("source_url"),
                        **classification,
                        "model": model,
                    }

                    append_result(output_path, result)
                    completed[0] += 1
                    pbar.update(1)
                    return result

                except anthropic.RateLimitError:
                    if is_last_attempt:
                        # Retries exhausted. Previously this path left the loop
                        # without counting the failure or advancing the bar.
                        errors[0] += 1
                        pbar.update(1)
                        pbar.set_postfix_str("error: rate limit retries exhausted")
                        return None
                    wait = min(10 * (2**attempt), 60)
                    pbar.set_postfix_str(f"rate limited, waiting {wait}s")
                    await asyncio.sleep(wait)
                except Exception as e:
                    # Best-effort labeling: any other failure is retried, then
                    # recorded as an error so one document cannot abort the run.
                    if is_last_attempt:
                        errors[0] += 1
                        pbar.update(1)
                        pbar.set_postfix_str(f"error: {str(e)[:50]}")
                        return None
                    await asyncio.sleep(2)

        return None

    # Process all documents
    tasks = [classify_one(doc) for doc in remaining]
    results = await asyncio.gather(*tasks)
    pbar.close()

    # Merge with existing
    all_results = dict(existing)
    for r in results:
        if r:
            all_results[r["id"]] = r

    print(f"\n Completed: {completed[0]}, Errors: {errors[0]}")
    return all_results
type=int, default=DEFAULT_PARALLEL, help="Max parallel requests" + ) + parser.add_argument( + "--rpm", type=int, default=DEFAULT_RPM, help="Requests per minute limit" + ) + parser.add_argument( + "--delay", type=float, default=DEFAULT_DELAY, help="Min delay between requests" + ) + args = parser.parse_args() + + if not os.path.exists(args.input): + print(f"ERROR: Input file not found: {args.input}") + sys.exit(1) + + # Load taxonomy + taxonomy = load_taxonomy() + print(f"Taxonomy: {taxonomy['name']} v{taxonomy['version']}") + print(f" Document types: {len(taxonomy['document_types'])}") + print(f" Topics: {len(taxonomy['topics'])}") + + # Load sampled documents + docs = [] + with open(args.input) as f: + for line in f: + if line.strip(): + docs.append(json.loads(line)) + + print(f"\nDocuments to label: {len(docs)}") + print(f"Model: {args.model}") + print(f"Parallel: {args.parallel}, RPM: {args.rpm}, Delay: {args.delay}s") + print() + + # Run labeling + asyncio.run( + label_documents( + docs=docs, + taxonomy=taxonomy, + output_path=args.output, + model=args.model, + max_parallel=args.parallel, + rpm=args.rpm, + delay=args.delay, + ) + ) + + # Print summary + print_summary(args.output, taxonomy) + + print(f"\nNext step: python train.py --input {args.output}") + + +if __name__ == "__main__": + main() diff --git a/scripts/classification/pyproject.toml b/scripts/classification/pyproject.toml new file mode 100644 index 0000000..9e2aa75 --- /dev/null +++ b/scripts/classification/pyproject.toml @@ -0,0 +1,18 @@ +[project] +name = "docx-corpus-classifier" +version = "0.1.0" +description = "ML classification pipeline for docx-corpus" +requires-python = ">=3.11" +dependencies = [ + "anthropic>=0.39.0", + "psycopg2-binary>=2.9.0", + "python-dotenv>=1.0.0", + "numpy>=1.26.0", + "tqdm>=4.66.0", + # ML / Training + "transformers>=4.47.0", + "torch>=2.5.0", + "datasets>=3.0.0", + "scikit-learn>=1.5.0", + "accelerate>=1.0.0", +] diff --git a/scripts/classification/sample.py 
b/scripts/classification/sample.py new file mode 100644 index 0000000..2b00ea6 --- /dev/null +++ b/scripts/classification/sample.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +""" +Phase 2, Step 1: Create a stratified sample of documents for LLM labeling. + +Samples documents across the top 5 languages, stratified by: +- Language (proportional to corpus representation) +- Word count (small/medium/large terciles) +- Source domain diversity + +Usage: + python sample.py --total 3500 --output sampled_docs.jsonl + python sample.py --total 3500 --output sampled_docs.jsonl --languages en,ru,cs,pl,es +""" + +import argparse +import json +import random +import sys +from urllib.parse import urlparse + +from common import get_db_connection + +# Default top 5 languages and their approximate sample allocation +DEFAULT_LANGUAGES = ["en", "ru", "cs", "pl", "es"] + + +def get_documents_for_language( + language: str, limit: int = 100000 +) -> list[dict]: + """Fetch extracted documents for a given language.""" + conn = get_db_connection() + try: + with conn.cursor() as cur: + cur.execute( + """ + SELECT id, source_url, original_filename, word_count, file_size_bytes + FROM documents + WHERE extracted_at IS NOT NULL + AND extraction_error IS NULL + AND language = %s + AND word_count > 0 + ORDER BY random() + LIMIT %s + """, + (language, limit), + ) + return [ + { + "id": row[0], + "source_url": row[1], + "original_filename": row[2], + "word_count": row[3], + "file_size_bytes": row[4], + "language": language, + } + for row in cur.fetchall() + ] + finally: + conn.close() + + +def get_language_counts(languages: list[str]) -> dict[str, int]: + """Get document counts for the specified languages.""" + conn = get_db_connection() + try: + with conn.cursor() as cur: + placeholders = ",".join(["%s"] * len(languages)) + cur.execute( + f""" + SELECT language, COUNT(*)::int as count + FROM documents + WHERE extracted_at IS NOT NULL + AND extraction_error IS NULL + AND language IN ({placeholders}) + 
def stratified_sample(
    docs: list[dict], n: int, seed: int = 42
) -> list[dict]:
    """
    Stratified sample by word count terciles and source domain diversity.

    Splits documents into 3 word-count bins (short/medium/long), takes an
    equal share from each (the first bins absorb the remainder of ``n / 3``),
    and prefers diverse source domains within a bin.

    Args:
        docs: Candidate documents; each needs ``word_count`` and should carry
            ``source_url``.
        n: Number of documents wanted.
        seed: Seed for the private random generator, so results are
            reproducible.

    Returns:
        A new list of ``min(n, len(docs))`` documents.
    """
    rng = random.Random(seed)

    if len(docs) <= n:
        return list(docs)

    # Sort by word count and split into terciles
    ordered = sorted(docs, key=lambda d: d["word_count"])
    third = len(ordered) // 3
    bins = [
        ordered[:third],  # short
        ordered[third : 2 * third],  # medium
        ordered[2 * third :],  # long
    ]

    per_bin, remainder = divmod(n, 3)

    picked: list[dict] = []
    for position, bin_docs in enumerate(bins):
        quota = per_bin + (1 if position < remainder else 0)
        picked.extend(_diverse_sample(bin_docs, quota, rng))

    # A bin can be smaller than its quota when len(docs) is only slightly
    # above n (e.g. 5 docs, n=4 gives bins of 1/1/3 with quotas 2/1/1). Top up
    # from the unused documents so the caller still receives n items.
    if len(picked) < n:
        taken = {id(doc) for doc in picked}
        unused = [doc for doc in ordered if id(doc) not in taken]
        picked.extend(_diverse_sample(unused, n - len(picked), rng))

    return picked


def _diverse_sample(
    docs: list[dict], n: int, rng: random.Random
) -> list[dict]:
    """Sample n documents, preferring diverse source domains.

    Documents are grouped by the network location of ``source_url`` and then
    drawn round-robin, one per domain per pass, in a shuffled domain order.
    Returns every document when ``len(docs) <= n``.
    """
    if len(docs) <= n:
        return list(docs)

    # Group by domain
    by_domain: dict[str, list[dict]] = {}
    for doc in docs:
        try:
            domain = urlparse(doc["source_url"]).netloc
        except (KeyError, TypeError, ValueError, AttributeError):
            # Missing key, non-string URL, or a URL urlparse rejects.
            domain = "unknown"
        by_domain.setdefault(domain, []).append(doc)

    # Shuffle the domain order first, then each domain's documents. The order
    # of these rng calls determines the sample for a given seed.
    domains = list(by_domain)
    rng.shuffle(domains)
    for domain in domains:
        rng.shuffle(by_domain[domain])

    # Round-robin: pass k takes the k-th document of every domain that still
    # has one, stopping as soon as n documents are collected.
    result: list[dict] = []
    depth = 0
    while len(result) < n:
        added_any = False
        for domain in domains:
            if len(result) >= n:
                break
            bucket = by_domain[domain]
            if depth < len(bucket):
                result.append(bucket[depth])
                added_any = True
        if not added_any:
            break
        depth += 1

    return result
print(f"\nSampling {n_samples} from {lang}...") + # Fetch more than needed to allow stratification + docs = get_documents_for_language(lang, limit=min(n_samples * 10, 100000)) + print(f" Fetched {len(docs):,} candidates") + + sampled = stratified_sample(docs, n_samples, seed=args.seed) + all_samples.extend(sampled) + print(f" Selected {len(sampled)} documents") + + # Show word count distribution of sample + word_counts = [d["word_count"] for d in sampled if d["word_count"]] + if word_counts: + print( + f" Word count: min={min(word_counts):,}, " + f"median={sorted(word_counts)[len(word_counts)//2]:,}, " + f"max={max(word_counts):,}" + ) + + # Save + with open(args.output, "w") as f: + for doc in all_samples: + f.write(json.dumps(doc) + "\n") + + print(f"\n{'=' * 60}") + print(f"Saved {len(all_samples)} documents to {args.output}") + print(f"{'=' * 60}") + print(f"\nNext step: python label.py --input {args.output} --output labeled_docs.jsonl") + + +if __name__ == "__main__": + main() diff --git a/scripts/classification/taxonomy.json b/scripts/classification/taxonomy.json new file mode 100644 index 0000000..27fc3fb --- /dev/null +++ b/scripts/classification/taxonomy.json @@ -0,0 +1,114 @@ +{ + "name": "docx-corpus-v2", + "version": "2.0.0", + "description": "Two-dimensional taxonomy for .docx document classification. Dimension 1 (document_type) describes the document's form/structure. Dimension 2 (topic) describes its subject domain. 
Each document gets one label per dimension.", + "document_types": [ + { + "id": "legal", + "label": "Legal Documents", + "description": "Contracts, agreements, legal notices, terms of service, regulations, statutes, and other legally binding or law-related documents", + "examples": ["employment agreement", "NDA", "terms and conditions", "legal notice", "legislation", "statute", "regulation"] + }, + { + "id": "forms", + "label": "Forms & Applications", + "description": "Fillable forms, applications, registration documents, surveys, questionnaires, and other documents designed for data collection", + "examples": ["application form", "registration form", "survey", "questionnaire", "ballot", "submission form"] + }, + { + "id": "reports", + "label": "Reports & Analysis", + "description": "Research reports, analysis documents, studies, assessments, evaluations, annual reports, and other documents presenting findings or data", + "examples": ["annual report", "research paper", "case study", "analysis", "assessment", "evaluation", "white paper"] + }, + { + "id": "policies", + "label": "Policies & Procedures", + "description": "Policy documents, procedures, guidelines, manuals, handbooks, and other documents establishing rules or processes", + "examples": ["privacy policy", "employee handbook", "procedure manual", "guidelines", "standard operating procedure"] + }, + { + "id": "educational", + "label": "Educational Materials", + "description": "Curricula, syllabi, lesson plans, educational content, worksheets, study guides, academic papers, theses, and other teaching or learning materials", + "examples": ["syllabus", "lesson plan", "course outline", "study guide", "worksheet", "thesis", "dissertation"] + }, + { + "id": "correspondence", + "label": "Correspondence", + "description": "Letters, memos, announcements, communications, press releases, newsletters, and other direct communications", + "examples": ["letter", "memo", "press release", "announcement", "notice", 
"circular", "newsletter"] + }, + { + "id": "technical", + "label": "Technical Documentation", + "description": "Technical specifications, manuals, documentation, standards, API docs, user guides, and other technical reference materials", + "examples": ["technical manual", "API documentation", "specifications", "user guide", "technical standard", "datasheet"] + }, + { + "id": "administrative", + "label": "Administrative Documents", + "description": "Meeting minutes, agendas, schedules, organizational documents, resolutions, records, and other administrative materials", + "examples": ["meeting minutes", "agenda", "organizational chart", "resolution", "schedule", "roster", "directory"] + }, + { + "id": "creative", + "label": "Creative & Marketing", + "description": "Marketing materials, brochures, proposals, presentations, promotional content, and other persuasive or creative documents", + "examples": ["brochure", "proposal", "marketing plan", "presentation script", "pitch deck content", "catalog"] + }, + { + "id": "reference", + "label": "Reference & Catalogs", + "description": "Directories, catalogs, glossaries, FAQs, indices, lists, and other reference or lookup documents", + "examples": ["product catalog", "directory", "glossary", "FAQ", "index", "bibliography", "inventory list"] + } + ], + "topics": [ + { + "id": "government", + "label": "Government & Public Sector", + "description": "Government agencies, public administration, civic affairs, municipal matters, public policy, elections" + }, + { + "id": "education", + "label": "Education & Academia", + "description": "Schools, universities, educational institutions, academic programs, student affairs, research" + }, + { + "id": "healthcare", + "label": "Healthcare & Medicine", + "description": "Medical, health services, hospitals, clinics, public health, pharmaceuticals, biotech" + }, + { + "id": "finance", + "label": "Finance & Business", + "description": "Banking, finance, corporate, business operations, 
commerce, investment, insurance, real estate" + }, + { + "id": "legal_judicial", + "label": "Legal & Judicial", + "description": "Law firms, courts, legal services, judicial matters, compliance, law enforcement" + }, + { + "id": "technology", + "label": "Technology & Engineering", + "description": "IT, software, tech companies, digital services, engineering, telecommunications, R&D" + }, + { + "id": "environment", + "label": "Environment & Energy", + "description": "Environmental protection, sustainability, conservation, climate, energy, natural resources, agriculture" + }, + { + "id": "nonprofit", + "label": "Nonprofit & NGO", + "description": "Charitable organizations, NGOs, community organizations, social services, international aid" + }, + { + "id": "general", + "label": "General / Other", + "description": "Documents that don't fit clearly into other topic categories, or span multiple domains equally" + } + ] +} diff --git a/scripts/classification/train.py b/scripts/classification/train.py new file mode 100644 index 0000000..821b423 --- /dev/null +++ b/scripts/classification/train.py @@ -0,0 +1,443 @@ +#!/usr/bin/env python3 +""" +Phase 3: Train ModernBERT classifiers on LLM-labeled documents. + +Trains two independent classifiers: + 1. Document type (10 classes) + 2. Topic (9 classes) + +Uses HuggingFace Transformers with the answerdotai/ModernBERT-base model. +Supports configurable train/val split, epochs, learning rate, etc. 
def load_labeled_data(input_path: str, min_confidence: float = 0.0) -> list[dict]:
    """Read labeled documents from a JSONL file, keeping confident ones.

    Blank lines are ignored. A record is kept when its ``confidence`` (taken
    as 0 when the key is absent) is at least ``min_confidence``.
    """
    with open(input_path) as handle:
        # Decode lazily, one non-blank line at a time, in file order.
        records = (json.loads(raw) for raw in handle if raw.strip())
        return [
            record
            for record in records
            if record.get("confidence", 0) >= min_confidence
        ]
def train_classifier(
    train_texts: list[str],
    train_labels: list[int],
    val_texts: list[str],
    val_labels: list[int],
    num_labels: int,
    id2label: dict[int, str],
    label2id: dict[str, int],
    output_dir: str,
    model_name: str,
    epochs: int,
    lr: float,
    batch_size: int,
    max_length: int,
    classifier_name: str,
):
    """Train a single classifier (type or topic).

    Fine-tunes ``model_name`` for sequence classification, evaluates once per
    epoch, keeps the checkpoint with the best macro F1, and writes the best
    model plus an ``eval_report.txt`` under ``output_dir/classifier_name``.

    Args:
        train_texts, train_labels: Training examples and integer class ids.
        val_texts, val_labels: Validation examples and integer class ids.
        num_labels: Number of classes in this dimension of the taxonomy.
        id2label, label2id: Class id/name mappings stored in the model config.
        output_dir: Root directory for all classifier outputs.
        model_name: Base checkpoint to fine-tune.
        epochs: Maximum number of training epochs (early stopping may end
            training sooner).
        lr: Learning rate.
        batch_size: Per-device training batch size; evaluation uses twice that.
        max_length: Token length every example is truncated/padded to.
        classifier_name: Sub-directory name and label used in log output.

    Returns:
        The metrics dict returned by ``trainer.evaluate()`` on the validation
        set.
    """
    print(f"\n{'=' * 60}")
    print(f"Training: {classifier_name}")
    print(f" Train: {len(train_texts)}, Val: {len(val_texts)}")
    print(f" Classes: {num_labels}")
    print(f" Model: {model_name}")
    print(f" Epochs: {epochs}, LR: {lr}, Batch: {batch_size}")
    print(f"{'=' * 60}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    def tokenize(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

    train_ds = Dataset.from_dict({"text": train_texts, "label": train_labels})
    val_ds = Dataset.from_dict({"text": val_texts, "label": val_labels})

    train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"])
    val_ds = val_ds.map(tokenize, batched=True, remove_columns=["text"])

    train_ds.set_format("torch")
    val_ds.set_format("torch")

    save_dir = os.path.join(output_dir, classifier_name)

    training_args = TrainingArguments(
        output_dir=save_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        learning_rate=lr,
        weight_decay=0.01,
        warmup_ratio=0.1,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        save_total_limit=2,
        logging_steps=50,
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=42,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print(f"\n Val accuracy: {eval_results['eval_accuracy']:.4f}")
    print(f" Val F1 macro: {eval_results['eval_f1_macro']:.4f}")

    # Save best model
    best_dir = os.path.join(save_dir, "best")
    trainer.save_model(best_dir)
    tokenizer.save_pretrained(best_dir)

    # Full classification report on val set
    preds = trainer.predict(val_ds)
    pred_labels = np.argmax(preds.predictions, axis=-1)
    report = classification_report(
        # Use the plain integer labels passed in, not the torch-formatted
        # dataset column.
        val_labels,
        pred_labels,
        # Pin the label set to the full taxonomy. Without `labels`, sklearn
        # infers classes from the data and raises ValueError when a class is
        # missing from both the validation labels and the predictions.
        labels=list(range(num_labels)),
        target_names=[id2label[i] for i in range(num_labels)],
        # Classes with no support or no predictions report 0 without warning.
        zero_division=0,
    )
    print(f"\nClassification Report ({classifier_name}):\n{report}")

    # Save report
    report_path = os.path.join(save_dir, "eval_report.txt")
    with open(report_path, "w") as f:
        f.write(f"Model: {model_name}\n")
        f.write(f"Classifier: {classifier_name}\n")
        f.write(f"Train size: {len(train_texts)}\n")
        f.write(f"Val size: {len(val_texts)}\n")
        f.write(f"Epochs: {epochs}\n")
        f.write(f"Val accuracy: {eval_results['eval_accuracy']:.4f}\n")
        f.write(f"Val F1 macro: {eval_results['eval_f1_macro']:.4f}\n\n")
        f.write(report)

    return eval_results
help="Max characters of document text (default: 2000)", + ) + parser.add_argument("--seed", type=int, default=42, help="Random seed") + args = parser.parse_args() + + if not os.path.exists(args.input): + print(f"ERROR: Input file not found: {args.input}") + sys.exit(1) + + # Load taxonomy and label maps + taxonomy = load_taxonomy() + type2id, id2type, topic2id, id2topic = build_label_maps(taxonomy) + print(f"Taxonomy: {taxonomy['name']} v{taxonomy['version']}") + print(f" Document types: {len(type2id)} classes") + print(f" Topics: {len(topic2id)} classes") + + # Load labeled data + docs = load_labeled_data(args.input, min_confidence=args.min_confidence) + print(f"\nLoaded {len(docs)} labeled documents") + if args.min_confidence > 0: + print(f" (filtered by confidence >= {args.min_confidence})") + + # Fetch texts + print(f"\nFetching document texts (max {args.max_chars} chars)...") + texts = fetch_texts(docs, max_chars=args.max_chars) + print(f" Got text for {sum(1 for t in texts.values() if t)}/{len(docs)} docs") + + # Prepare datasets for both classifiers + print("\nPreparing document_type dataset...") + type_texts, type_labels = prepare_dataset(docs, texts, type2id, "document_type") + print(f" {len(type_texts)} examples across {len(set(type_labels))} classes") + + print("Preparing topic dataset...") + topic_texts, topic_labels = prepare_dataset(docs, texts, topic2id, "document_topic") + print(f" {len(topic_texts)} examples across {len(set(topic_labels))} classes") + + # Split into train/val (same split for both to keep comparable) + print(f"\nSplitting: {1 - args.val_split:.0%} train / {args.val_split:.0%} val") + + type_train_texts, type_val_texts, type_train_labels, type_val_labels = ( + train_test_split( + type_texts, + type_labels, + test_size=args.val_split, + random_state=args.seed, + stratify=type_labels, + ) + ) + + topic_train_texts, topic_val_texts, topic_train_labels, topic_val_labels = ( + train_test_split( + topic_texts, + topic_labels, + 
test_size=args.val_split, + random_state=args.seed, + stratify=topic_labels, + ) + ) + + os.makedirs(args.output_dir, exist_ok=True) + + # Train document_type classifier + type_results = train_classifier( + train_texts=type_train_texts, + train_labels=type_train_labels, + val_texts=type_val_texts, + val_labels=type_val_labels, + num_labels=len(type2id), + id2label=id2type, + label2id=type2id, + output_dir=args.output_dir, + model_name=args.model, + epochs=args.epochs, + lr=args.lr, + batch_size=args.batch_size, + max_length=args.max_length, + classifier_name="document_type", + ) + + # Train topic classifier + topic_results = train_classifier( + train_texts=topic_train_texts, + train_labels=topic_train_labels, + val_texts=topic_val_texts, + val_labels=topic_val_labels, + num_labels=len(topic2id), + id2label=id2topic, + label2id=topic2id, + output_dir=args.output_dir, + model_name=args.model, + epochs=args.epochs, + lr=args.lr, + batch_size=args.batch_size, + max_length=args.max_length, + classifier_name="topic", + ) + + # Summary + print(f"\n{'=' * 60}") + print("Training Complete") + print(f"{'=' * 60}") + print(f" Document Type — Acc: {type_results['eval_accuracy']:.4f}, F1: {type_results['eval_f1_macro']:.4f}") + print(f" Topic — Acc: {topic_results['eval_accuracy']:.4f}, F1: {topic_results['eval_f1_macro']:.4f}") + print(f"\n Models saved to: {args.output_dir}/") + print(f" {args.output_dir}/document_type/best/") + print(f" {args.output_dir}/topic/best/") + + # Save training config + config = { + "base_model": args.model, + "taxonomy": taxonomy["name"], + "taxonomy_version": taxonomy["version"], + "input_file": args.input, + "total_docs": len(docs), + "min_confidence": args.min_confidence, + "max_chars": args.max_chars, + "max_length": args.max_length, + "epochs": args.epochs, + "learning_rate": args.lr, + "batch_size": args.batch_size, + "val_split": args.val_split, + "seed": args.seed, + "results": { + "document_type": { + "accuracy": 
type_results["eval_accuracy"], + "f1_macro": type_results["eval_f1_macro"], + }, + "topic": { + "accuracy": topic_results["eval_accuracy"], + "f1_macro": topic_results["eval_f1_macro"], + }, + }, + } + config_path = os.path.join(args.output_dir, "training_config.json") + with open(config_path, "w") as f: + json.dump(config, f, indent=2) + print(f" Config saved to: {config_path}") + + print(f"\nNext step: python classify.py --models-dir {args.output_dir}") + + +if __name__ == "__main__": + main() From eb0aaf9ecf5d10e9ade85b25e830647c1cc49797 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 9 Mar 2026 14:09:41 -0300 Subject: [PATCH 2/2] feat: add Modal cloud GPU support and clean up classification pipeline - Merge train_modal.py into train.py (--modal flag) - Merge classify_modal.py into classify.py (--modal --workers N) - Switch base model to xlm-roberta-base (multilingual) - Add class-weighted loss for imbalanced classes - Add --exclude flag to sample.py for iterative sampling - Gitignore models/ and *.jsonl artifacts - Update docs for Modal setup and cost estimates --- .gitignore | 4 + scripts/classification/CLAUDE.md | 15 +- scripts/classification/README.md | 64 ++- scripts/classification/classify.py | 468 +++++++++++++--------- scripts/classification/sample.py | 17 + scripts/classification/train.py | 603 +++++++++++++++++++---------- 6 files changed, 743 insertions(+), 428 deletions(-) diff --git a/.gitignore b/.gitignore index 7473c68..450ae9f 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,7 @@ coverage/ # OS .DS_Store + +# Classification pipeline artifacts +scripts/classification/models/ +scripts/classification/*.jsonl diff --git a/scripts/classification/CLAUDE.md b/scripts/classification/CLAUDE.md index 9d9a0bf..7b20bc0 100644 --- a/scripts/classification/CLAUDE.md +++ b/scripts/classification/CLAUDE.md @@ -8,8 +8,8 @@ Uses the FineWeb-Edu pattern: LLM labels a small sample → train lightweight cl 1. 
**`sample.py`** — Stratified sampling from PostgreSQL. Samples proportionally across languages (en, ru, cs, pl, es), stratified by word count terciles and source domain diversity. 2. **`label.py`** — Async LLM labeling with Claude. Supports resume (appends to JSONL). Rate-limited with configurable parallelism. -3. **`train.py`** — Fine-tunes two independent ModernBERT classifiers (document_type and topic). Outputs models to `./models/`. -4. **`classify.py`** — Batch inference on the full corpus. Fetches text from R2, runs both models, writes results to PostgreSQL. +3. **`train.py`** — Fine-tunes two independent xlm-roberta-base classifiers (document_type and topic). Supports `--modal` for cloud GPU training. Outputs models to `./models/`. +4. **`classify.py`** — Batch inference on the full corpus. Supports `--modal` for parallel cloud workers (20x speedup). Fetches text from R2, runs both models, writes results to PostgreSQL. 5. **`evaluate.py`** — Quality metrics. Two modes: `labels` (analyzes JSONL) and `corpus` (queries DB). ## Key files @@ -24,10 +24,18 @@ Writes to the same `documents` table as the TS pipeline: - `document_type` — one of 10 types (legal, forms, reports, etc.) - `document_topic` — one of 9 topics (government, education, healthcare, etc.) - `classification_confidence` — min(type_confidence, topic_confidence) -- `classification_model` — e.g. "claude-haiku-4-5" or "modernbert-v2.0.0" +- `classification_model` — e.g. "claude-haiku-4-5" or "modernbert-2.0.0" Connection via `DATABASE_URL` env var loaded from `../../.env`. 
+## Modal (cloud GPU) + +Both `train.py` and `classify.py` support a `--modal` flag for cloud execution: +- Training uses a single GPU (T4 default, configurable with `--gpu`) +- Classification fans out across `--workers` parallel containers for ~160 docs/s aggregate +- Models are persisted in a Modal Volume (`classifier-models`) +- DB credentials are stored in a Modal Secret (`docx-db`) + ## Conventions - Python 3.11+, no type stubs needed @@ -36,3 +44,4 @@ Connection via `DATABASE_URL` env var loaded from `../../.env`. - Text is fetched via HTTP from the public R2 endpoint, not direct R2 access - All scripts support `--help` for usage - JSONL files are the interchange format between steps +- Data files (*.jsonl, models/) are gitignored — store locally in `~/data/docx-corpus/classification/` diff --git a/scripts/classification/README.md b/scripts/classification/README.md index bca3f5b..5ef6db5 100644 --- a/scripts/classification/README.md +++ b/scripts/classification/README.md @@ -1,7 +1,7 @@ # Document Classification Pipeline -Classifies ~800K .docx documents using the FineWeb-Edu / TnT-LLM pattern: -LLM labels a small sample → train ModernBERT classifier → apply at scale. +Classifies ~800K .docx documents using the [FineWeb-Edu / TnT-LLM](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1) pattern: +LLM labels a small sample → train classifier → apply at scale. ## Two-Dimensional Taxonomy @@ -10,6 +10,8 @@ Each document gets classified on two independent dimensions: - **Document Type** (10 classes): legal, forms, reports, policies, educational, correspondence, technical, administrative, creative, reference - **Topic** (9 classes): government, education, healthcare, finance, legal_judicial, technology, environment, nonprofit, general +See [`taxonomy.json`](taxonomy.json) for full definitions and examples. + ## Pipeline Steps ### 1. 
Sample (`sample.py`) @@ -29,37 +31,37 @@ python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl python label.py --input sampled_docs.jsonl --output labeled_docs.jsonl --model claude-haiku-4-5 --parallel 5 ``` -### 3. Evaluate Labels (`evaluate.py labels`) - -Check label quality before training. - -```bash -python evaluate.py labels --input labeled_docs.jsonl -``` - -### 4. Train (`train.py`) +### 3. Train (`train.py`) -Fine-tune ModernBERT on labeled data. Trains two independent classifiers. +Fine-tune xlm-roberta-base on labeled data. Trains two independent classifiers with class-weighted loss. ```bash +# Local training (CPU/MPS/CUDA) python train.py --input labeled_docs.jsonl -python train.py --input labeled_docs.jsonl --epochs 5 --lr 2e-5 --output-dir ./models + +# Cloud training on Modal GPU +python train.py --input labeled_docs.jsonl --modal +python train.py --input labeled_docs.jsonl --modal --gpu a10g ``` -### 5. Classify (`classify.py`) +### 4. Classify (`classify.py`) -Apply trained models to the full corpus. +Apply trained models to the full corpus. Supports parallel cloud workers via Modal. ```bash +# Local inference python classify.py --models-dir ./models -python classify.py --models-dir ./models --batch-size 256 --dry-run --limit 100 + +# Cloud inference with 20 parallel GPU workers +python classify.py --models-dir ./models --modal --workers 20 ``` -### 6. Evaluate Corpus (`evaluate.py corpus`) +### 5. Evaluate (`evaluate.py`) -Check full corpus classification distribution. +Check label quality or corpus classification distribution. ```bash +python evaluate.py labels --input labeled_docs.jsonl python evaluate.py corpus python evaluate.py corpus --languages en,ru ``` @@ -67,15 +69,35 @@ python evaluate.py corpus --languages en,ru ## Setup ```bash -pip install -r requirements.txt +pip install -e . 
``` Required environment variables (`.env` in project root): - `DATABASE_URL` — PostgreSQL connection string - `ANTHROPIC_API_KEY` — For LLM labeling step only +### Modal Setup (optional, for cloud training/inference) + +```bash +pip install modal +python -m modal setup +modal secret create docx-db DATABASE_URL="postgres://..." +``` + +## Key Files + +| File | Purpose | +|------|---------| +| `taxonomy.json` | Two-dimensional taxonomy definition (source of truth) | +| `common.py` | Shared utilities: DB, text fetching, taxonomy loading | +| `sample.py` | Stratified document sampling from PostgreSQL | +| `label.py` | Async LLM labeling with Claude | +| `train.py` | Fine-tune classifiers (local or Modal) | +| `classify.py` | Batch inference on full corpus (local or Modal) | +| `evaluate.py` | Quality metrics and distribution analysis | + ## Cost Estimate - **Labeling**: ~3,500 docs × Claude Haiku ≈ $2-5 -- **Training**: ~30 min on GPU (or ~2h on CPU) -- **Inference**: ~800K docs, ~200-500 docs/sec on GPU +- **Training**: ~30 min on T4 GPU (~$0.30 on Modal, free tier covers it) +- **Inference**: ~800K docs with 20 Modal workers ≈ 75 min (~$12 or within free tier) diff --git a/scripts/classification/classify.py b/scripts/classification/classify.py index 411e743..067826a 100644 --- a/scripts/classification/classify.py +++ b/scripts/classification/classify.py @@ -1,20 +1,24 @@ #!/usr/bin/env python3 """ -Phase 4: Apply trained classifiers to the full corpus. +Apply trained classifiers to the full corpus. -Loads the trained ModernBERT models and classifies all unclassified documents. +Loads the trained models and classifies all unclassified documents. Fetches text from R2, runs inference, updates the database. - Supports resume — already-classified documents are skipped. 
Usage: + # Local classification python classify.py --models-dir ./models - python classify.py --models-dir ./models --batch-size 256 --languages en,ru,cs,pl,es - python classify.py --models-dir ./models --dry-run --limit 100 + python classify.py --models-dir ./models --batch-size 256 --languages en,ru + + # Cloud classification on Modal (parallel GPU workers) + python classify.py --models-dir ./models --modal + python classify.py --models-dir ./models --modal --workers 20 --gpu a10g """ import argparse import json +import math import os import sys import time @@ -37,6 +41,11 @@ DEFAULT_MAX_CHARS = 2000 +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + def load_classifier(model_dir: str, device: torch.device): """Load a trained classifier and tokenizer.""" tokenizer = AutoTokenizer.from_pretrained(model_dir) @@ -63,28 +72,19 @@ def get_unclassified_documents( AND classification_model IS NULL """ params: list = [] - if languages: placeholders = ",".join(["%s"] * len(languages)) query += f" AND language IN ({placeholders})" params.extend(languages) - query += " ORDER BY random()" - if limit: query += " LIMIT %s" params.append(limit) - cur.execute(query, params) return [ - { - "id": row[0], - "source_url": row[1], - "original_filename": row[2], - "word_count": row[3], - "language": row[4], - } - for row in cur.fetchall() + {"id": r[0], "source_url": r[1], "original_filename": r[2], + "word_count": r[3], "language": r[4]} + for r in cur.fetchall() ] finally: conn.close() @@ -104,10 +104,8 @@ def get_classification_stats() -> dict: """) row = cur.fetchone() return { - "total": row[0], - "classified": row[1], - "classifiable": row[2], - "remaining": row[2] - row[1], + "total": row[0], "classified": row[1], + "classifiable": row[2], "remaining": row[2] - row[1], } finally: conn.close() @@ -115,50 +113,29 @@ def get_classification_stats() -> 
dict: @torch.no_grad() def classify_batch( - texts: list[str], - tokenizer, - model, - max_length: int, - device: torch.device, + texts: list[str], tokenizer, model, max_length: int, device: torch.device, ) -> list[tuple[str, float]]: """Classify a batch of texts. Returns list of (label, confidence).""" inputs = tokenizer( - texts, - truncation=True, - max_length=max_length, - padding=True, - return_tensors="pt", + texts, truncation=True, max_length=max_length, + padding=True, return_tensors="pt", ).to(device) - outputs = model(**inputs) probs = torch.softmax(outputs.logits, dim=-1) confidences, pred_ids = torch.max(probs, dim=-1) - - results = [] - for pred_id, conf in zip(pred_ids.cpu().numpy(), confidences.cpu().numpy()): - label = model.config.id2label[int(pred_id)] - results.append((label, float(conf))) - - return results + return [ + (model.config.id2label[int(pid)], float(conf)) + for pid, conf in zip(pred_ids.cpu().numpy(), confidences.cpu().numpy()) + ] def process_batch( - docs: list[dict], - type_tokenizer, - type_model, - topic_tokenizer, - topic_model, - max_length: int, - max_chars: int, - device: torch.device, - model_name: str, + docs, type_tokenizer, type_model, topic_tokenizer, topic_model, + max_length, max_chars, device, model_name, ) -> list[dict]: """Process a batch: fetch texts, classify, return label dicts.""" - # Fetch texts doc_ids = [d["id"] for d in docs] texts = fetch_documents_text_parallel(doc_ids, max_chars=max_chars) - - # Filter docs with text valid_docs = [] valid_texts = [] for doc in docs: @@ -166,99 +143,33 @@ def process_batch( if text: valid_docs.append(doc) valid_texts.append(text) - if not valid_texts: return [] + type_results = classify_batch(valid_texts, type_tokenizer, type_model, max_length, device) + topic_results = classify_batch(valid_texts, topic_tokenizer, topic_model, max_length, device) + return [ + { + "id": doc["id"], "document_type": dt, "document_topic": tp, + "confidence": min(tc, tpc), "model": model_name, 
+ } + for doc, (dt, tc), (tp, tpc) in zip(valid_docs, type_results, topic_results) + ] - # Classify with both models - type_results = classify_batch( - valid_texts, type_tokenizer, type_model, max_length, device - ) - topic_results = classify_batch( - valid_texts, topic_tokenizer, topic_model, max_length, device - ) - # Build label dicts for DB update - labels = [] - for doc, (doc_type, type_conf), (topic, topic_conf) in zip( - valid_docs, type_results, topic_results - ): - labels.append( - { - "id": doc["id"], - "document_type": doc_type, - "document_topic": topic, - "confidence": min(type_conf, topic_conf), - "model": model_name, - } - ) - - return labels +# --------------------------------------------------------------------------- +# Local classification +# --------------------------------------------------------------------------- -def main(): - parser = argparse.ArgumentParser( - description="Classify full corpus with trained ModernBERT models" - ) - parser.add_argument( - "--models-dir", - type=str, - required=True, - help="Directory containing trained models (from train.py)", - ) - parser.add_argument( - "--batch-size", - type=int, - default=DEFAULT_BATCH_SIZE, - help=f"Inference batch size (default: {DEFAULT_BATCH_SIZE})", - ) - parser.add_argument( - "--max-length", - type=int, - default=DEFAULT_MAX_LENGTH, - help=f"Max token length (default: {DEFAULT_MAX_LENGTH})", - ) - parser.add_argument( - "--max-chars", - type=int, - default=DEFAULT_MAX_CHARS, - help=f"Max text characters to fetch (default: {DEFAULT_MAX_CHARS})", - ) - parser.add_argument( - "--languages", - type=str, - default=None, - help="Comma-separated language codes to classify (default: all)", - ) - parser.add_argument( - "--limit", - type=int, - default=None, - help="Max documents to classify (default: all)", - ) - parser.add_argument( - "--db-batch-size", - type=int, - default=500, - help="DB update batch size (default: 500)", - ) - parser.add_argument( - "--dry-run", - action="store_true", 
- help="Classify but don't write to DB", - ) - args = parser.parse_args() - - # Validate model directories +def run_local(args): + """Classify documents locally using available device.""" type_model_dir = os.path.join(args.models_dir, "document_type", "best") topic_model_dir = os.path.join(args.models_dir, "topic", "best") - for d in [type_model_dir, topic_model_dir]: if not os.path.exists(d): print(f"ERROR: Model directory not found: {d}") sys.exit(1) - # Load training config for model name config_path = os.path.join(args.models_dir, "training_config.json") if os.path.exists(config_path): with open(config_path) as f: @@ -267,7 +178,6 @@ def main(): else: model_name = "modernbert-v2" - # Device if torch.cuda.is_available(): device = torch.device("cuda") elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): @@ -276,108 +186,282 @@ def main(): device = torch.device("cpu") print(f"Device: {device}") - # Load models print(f"\nLoading document_type model from {type_model_dir}...") type_tokenizer, type_model = load_classifier(type_model_dir, device) print(f"Loading topic model from {topic_model_dir}...") topic_tokenizer, topic_model = load_classifier(topic_model_dir, device) - # Stats stats = get_classification_stats() - print(f"\nCorpus stats:") - print(f" Total documents: {stats['total']:,}") - print(f" Classifiable: {stats['classifiable']:,}") - print(f" Already classified: {stats['classified']:,}") - print(f" Remaining: {stats['remaining']:,}") - - # Get unclassified docs - languages = ( - [l.strip() for l in args.languages.split(",")] - if args.languages - else None - ) - print(f"\nFetching unclassified documents...") - docs = get_unclassified_documents(languages=languages, limit=args.limit) - print(f" Found {len(docs):,} documents to classify") + print(f"\nCorpus: {stats['total']:,} total, {stats['classifiable']:,} classifiable, " + f"{stats['classified']:,} done, {stats['remaining']:,} remaining") + languages = [l.strip() for l in 
args.languages.split(",")] if args.languages else None + docs = get_unclassified_documents(languages=languages, limit=args.limit) + print(f"Found {len(docs):,} documents to classify") if not docs: print("Nothing to classify!") return - if args.dry_run: print(" (DRY RUN — will not write to database)") - # Process in batches total_classified = 0 total_errors = 0 start_time = time.time() - - # Use smaller fetch batches for text retrieval fetch_batch_size = min(args.batch_size, 100) - pbar = tqdm(total=len(docs), desc="Classifying", unit="doc") for i in range(0, len(docs), fetch_batch_size): batch_docs = docs[i : i + fetch_batch_size] - labels = process_batch( - docs=batch_docs, - type_tokenizer=type_tokenizer, - type_model=type_model, - topic_tokenizer=topic_tokenizer, - topic_model=topic_model, - max_length=args.max_length, - max_chars=args.max_chars, - device=device, - model_name=model_name, + batch_docs, type_tokenizer, type_model, topic_tokenizer, topic_model, + args.max_length, args.max_chars, device, model_name, ) - if labels and not args.dry_run: save_labels_to_db(labels, batch_size=args.db_batch_size) - total_classified += len(labels) total_errors += len(batch_docs) - len(labels) pbar.update(len(batch_docs)) - - # Show throughput elapsed = time.time() - start_time rate = total_classified / elapsed if elapsed > 0 else 0 pbar.set_postfix_str(f"{rate:.0f} docs/s, {total_errors} errors") - pbar.close() elapsed = time.time() - start_time rate = total_classified / elapsed if elapsed > 0 else 0 - print(f"\n{'=' * 60}") print("Classification Complete") print(f"{'=' * 60}") - print(f" Classified: {total_classified:,}") - print(f" Errors (no text): {total_errors:,}") + print(f" Classified: {total_classified:,}, Errors: {total_errors:,}") print(f" Time: {elapsed:.1f}s ({rate:.0f} docs/s)") - print(f" Model: {model_name}") - if not args.dry_run: - final_stats = get_classification_stats() - print(f"\n Total classified in DB: {final_stats['classified']:,}") - print(f" 
Remaining: {final_stats['remaining']:,}") + final = get_classification_stats() + print(f" DB classified: {final['classified']:,}, remaining: {final['remaining']:,}") + if args.dry_run: + print(" (DRY RUN — no changes written)") + + +# --------------------------------------------------------------------------- +# Modal cloud classification (parallel workers) +# --------------------------------------------------------------------------- + +def run_modal(args): + """Classify on Modal with parallel GPU workers.""" + import modal + + app = modal.App("docx-classifier-inference") + inference_image = ( + modal.Image.debian_slim(python_version="3.11") + .pip_install("torch", "transformers", "numpy", "psycopg2-binary") + ) + model_volume = modal.Volume.from_name("classifier-models") + db_secret = modal.Secret.from_name("docx-db") + + gpu_map = {"t4": "T4", "a10g": "a10g", "l4": "l4", "a100": "a100"} + gpu = gpu_map.get(args.gpu.lower(), args.gpu) + + @app.function(image=inference_image, timeout=300, secrets=[db_secret]) + def fetch_unclassified_ids(languages: list[str] | None, limit: int | None) -> list[str]: + import os + import psycopg2 + conn = psycopg2.connect(os.environ["DATABASE_URL"]) + try: + with conn.cursor() as cur: + query = """ + SELECT id FROM documents + WHERE extracted_at IS NOT NULL AND extraction_error IS NULL + AND word_count > 0 AND classification_model IS NULL + """ + params = [] + if languages: + query += f" AND language IN ({','.join(['%s'] * len(languages))})" + params.extend(languages) + query += " ORDER BY random()" + if limit: + query += " LIMIT %s" + params.append(limit) + cur.execute(query, params) + return [r[0] for r in cur.fetchall()] + finally: + conn.close() + + @app.function( + image=inference_image, gpu=gpu, timeout=7200, + volumes={"/models": model_volume}, secrets=[db_secret], + ) + def classify_chunk( + doc_ids: list[str], worker_id: int, total_workers: int, + max_length: int, max_chars: int, dry_run: bool, + ) -> dict: + import os + 
import time + import urllib.request + from concurrent.futures import ThreadPoolExecutor + import psycopg2 + import torch + from transformers import AutoModelForSequenceClassification, AutoTokenizer + + TEXT_BASE_URL = "https://docxcorp.us/extracted" + + def fetch_text(did, mc=2000): + try: + req = urllib.request.Request(f"{TEXT_BASE_URL}/{did}.txt", headers={"User-Agent": "docx-classifier/2.0"}) + with urllib.request.urlopen(req, timeout=15) as r: + return r.read().decode("utf-8")[:mc] + except Exception: + return "" + + def fetch_parallel(ids, mc): + res = {} + with ThreadPoolExecutor(max_workers=50) as ex: + for did, txt in ex.map(lambda d: (d, fetch_text(d, mc)), ids): + res[did] = txt + return res + + def save_batch(labels): + conn = psycopg2.connect(os.environ["DATABASE_URL"]) + try: + with conn.cursor() as cur: + for l in labels: + cur.execute(""" + UPDATE documents SET document_type=%s, document_topic=%s, + classification_confidence=%s, classification_model=%s + WHERE id=%s + """, (l["document_type"], l["document_topic"], l["confidence"], l["model"], l["id"])) + conn.commit() + finally: + conn.close() + + model_volume.reload() + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu" + print(f"[Worker {worker_id}/{total_workers}] {len(doc_ids):,} docs, device={gpu_name}") + + cfg_path = "/models/training_config.json" + model_name = "modernbert-v2" + if os.path.exists(cfg_path): + with open(cfg_path) as f: + model_name = f"modernbert-{json.load(f).get('taxonomy_version', 'v2')}" + + type_tok = AutoTokenizer.from_pretrained("/models/document_type/best") + type_mdl = AutoModelForSequenceClassification.from_pretrained("/models/document_type/best").to(device).eval() + topic_tok = AutoTokenizer.from_pretrained("/models/topic/best") + topic_mdl = AutoModelForSequenceClassification.from_pretrained("/models/topic/best").to(device).eval() + + @torch.no_grad() + def 
infer(texts, tok, mdl): + inp = tok(texts, truncation=True, max_length=max_length, padding=True, return_tensors="pt").to(device) + probs = torch.softmax(mdl(**inp).logits, dim=-1) + confs, preds = torch.max(probs, dim=-1) + return [(mdl.config.id2label[int(p)], float(c)) for p, c in zip(preds.cpu().numpy(), confs.cpu().numpy())] + + total_classified = 0 + total_errors = 0 + start = time.time() + + for i in range(0, len(doc_ids), 100): + batch = doc_ids[i:i+100] + texts = fetch_parallel(batch, max_chars) + valid = [(did, texts[did]) for did in batch if texts.get(did)] + if not valid: + total_errors += len(batch) + continue + vids, vtxts = zip(*valid) + vtxts = list(vtxts) + tr = infer(vtxts, type_tok, type_mdl) + tpr = infer(vtxts, topic_tok, topic_mdl) + labels = [ + {"id": did, "document_type": dt, "document_topic": tp, + "confidence": min(tc, tpc), "model": model_name} + for did, (dt, tc), (tp, tpc) in zip(vids, tr, tpr) + ] + if labels and not dry_run: + save_batch(labels) + total_classified += len(labels) + total_errors += len(batch) - len(labels) + if (i // 100) % 10 == 0: + elapsed = time.time() - start + rate = total_classified / elapsed if elapsed > 0 else 0 + pct = (i + len(batch)) / len(doc_ids) * 100 + print(f" [Worker {worker_id}] [{pct:5.1f}%] {total_classified:,} done, {rate:.0f} docs/s") + + elapsed = time.time() - start + rate = total_classified / elapsed if elapsed > 0 else 0 + print(f" [Worker {worker_id}] DONE — {total_classified:,} in {elapsed:.0f}s ({rate:.0f} docs/s)") + return {"worker_id": worker_id, "classified": total_classified, "errors": total_errors, + "elapsed_seconds": round(elapsed, 1), "docs_per_second": round(rate, 1)} + + languages = [l.strip() for l in args.languages.split(",")] if args.languages else None + n_workers = args.workers + + print(f"Modal parallel classification ({gpu} GPU, {n_workers} workers)") if args.dry_run: - print("\n (DRY RUN — no changes written to database)") - - # Print distribution of this batch - if 
total_classified > 0 and labels: - print(f"\nSample distribution (last batch):") - type_counts: dict[str, int] = {} - topic_counts: dict[str, int] = {} - for label in labels: - dt = label["document_type"] - tp = label["document_topic"] - type_counts[dt] = type_counts.get(dt, 0) + 1 - topic_counts[tp] = topic_counts.get(tp, 0) + 1 - - print(" Types:", dict(sorted(type_counts.items(), key=lambda x: -x[1]))) - print(" Topics:", dict(sorted(topic_counts.items(), key=lambda x: -x[1]))) + print(" DRY RUN mode") + print() + + with app.run(): + print("Fetching unclassified document IDs...") + all_ids = fetch_unclassified_ids.remote(languages=languages, limit=args.limit) + print(f" Found {len(all_ids):,} documents to classify") + if not all_ids: + print("Nothing to classify!") + return + + n_workers = min(n_workers, len(all_ids)) + chunk_size = math.ceil(len(all_ids) / n_workers) + chunks = [all_ids[i:i+chunk_size] for i in range(0, len(all_ids), chunk_size)] + print(f" Split into {len(chunks)} chunks of ~{chunk_size:,} docs") + print(f" Estimated: ~{len(all_ids) / (n_workers * 8) / 60:.0f} minutes\n") + + results = list(classify_chunk.map( + chunks, + [i for i in range(len(chunks))], + [len(chunks)] * len(chunks), + [args.max_length] * len(chunks), + [args.max_chars] * len(chunks), + [args.dry_run] * len(chunks), + )) + + total_classified = sum(r["classified"] for r in results) + total_errors = sum(r["errors"] for r in results) + max_elapsed = max(r["elapsed_seconds"] for r in results) + agg_rate = total_classified / max_elapsed if max_elapsed > 0 else 0 + + print(f"\n{'=' * 60}") + print("Classification Complete") + print(f"{'=' * 60}") + print(f" Workers: {len(results)}") + print(f" Classified: {total_classified:,}, Errors: {total_errors:,}") + print(f" Wall time: {max_elapsed:.0f}s ({max_elapsed/60:.1f} min)") + print(f" Aggregate: {agg_rate:.0f} docs/s") + if args.dry_run: + print(" (DRY RUN — no changes written)") + + +# 
--------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser(description="Classify corpus with trained models") + parser.add_argument("--models-dir", type=str, required=True) + parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE) + parser.add_argument("--max-length", type=int, default=DEFAULT_MAX_LENGTH) + parser.add_argument("--max-chars", type=int, default=DEFAULT_MAX_CHARS) + parser.add_argument("--languages", type=str, default=None) + parser.add_argument("--limit", type=int, default=None) + parser.add_argument("--db-batch-size", type=int, default=500) + parser.add_argument("--dry-run", action="store_true") + parser.add_argument("--modal", action="store_true", help="Run on Modal cloud GPUs") + parser.add_argument("--workers", type=int, default=20, help="Modal parallel workers (default: 20)") + parser.add_argument("--gpu", type=str, default="T4", help="Modal GPU type: T4, a10g, l4, a100") + args = parser.parse_args() + + if args.modal: + run_modal(args) + else: + run_local(args) if __name__ == "__main__": diff --git a/scripts/classification/sample.py b/scripts/classification/sample.py index 2b00ea6..b0cb5e5 100644 --- a/scripts/classification/sample.py +++ b/scripts/classification/sample.py @@ -175,11 +175,26 @@ def main(): default=",".join(DEFAULT_LANGUAGES), help="Comma-separated language codes (default: en,ru,cs,pl,es)", ) + parser.add_argument( + "--exclude", + type=str, + default=None, + help="JSONL file of docs to exclude (already sampled)", + ) parser.add_argument("--seed", type=int, default=42, help="Random seed") args = parser.parse_args() languages = [l.strip() for l in args.languages.split(",")] + # Load exclusion set + exclude_ids: set[str] = set() + if args.exclude: + with open(args.exclude) as f: + for line in f: + if line.strip(): + exclude_ids.add(json.loads(line)["id"]) + 
print(f"Excluding {len(exclude_ids)} already-sampled documents") + print("=" * 60) print("Stratified Document Sampling") print("=" * 60) @@ -220,6 +235,8 @@ def main(): print(f"\nSampling {n_samples} from {lang}...") # Fetch more than needed to allow stratification docs = get_documents_for_language(lang, limit=min(n_samples * 10, 100000)) + if exclude_ids: + docs = [d for d in docs if d["id"] not in exclude_ids] print(f" Fetched {len(docs):,} candidates") sampled = stratified_sample(docs, n_samples, seed=args.seed) diff --git a/scripts/classification/train.py b/scripts/classification/train.py index 821b423..75a1533 100644 --- a/scripts/classification/train.py +++ b/scripts/classification/train.py @@ -1,18 +1,22 @@ #!/usr/bin/env python3 """ -Phase 3: Train ModernBERT classifiers on LLM-labeled documents. +Train classifiers on LLM-labeled documents. Trains two independent classifiers: 1. Document type (10 classes) 2. Topic (9 classes) -Uses HuggingFace Transformers with the answerdotai/ModernBERT-base model. -Supports configurable train/val split, epochs, learning rate, etc. +Uses HuggingFace Transformers with xlm-roberta-base (multilingual). +Supports class-weighted loss, configurable train/val split, epochs, etc. 
Usage: + # Local training python train.py --input labeled_docs.jsonl python train.py --input labeled_docs.jsonl --epochs 5 --lr 2e-5 - python train.py --input labeled_docs.jsonl --output-dir ./models + + # Cloud training on Modal (GPU) + python train.py --input labeled_docs.jsonl --modal + python train.py --input labeled_docs.jsonl --modal --gpu a10g """ import argparse @@ -24,11 +28,7 @@ import numpy as np import torch from datasets import Dataset -from sklearn.metrics import ( - accuracy_score, - classification_report, - f1_score, -) +from sklearn.metrics import accuracy_score, classification_report, f1_score from sklearn.model_selection import train_test_split from transformers import ( AutoModelForSequenceClassification, @@ -40,7 +40,7 @@ from common import fetch_documents_text_parallel, load_taxonomy -DEFAULT_MODEL = "answerdotai/ModernBERT-base" +DEFAULT_MODEL = "xlm-roberta-base" DEFAULT_EPOCHS = 5 DEFAULT_LR = 2e-5 DEFAULT_BATCH_SIZE = 16 @@ -49,6 +49,11 @@ DEFAULT_OUTPUT_DIR = "./models" +# --------------------------------------------------------------------------- +# Shared helpers (used by both local and Modal paths) +# --------------------------------------------------------------------------- + + def load_labeled_data(input_path: str, min_confidence: float = 0.0) -> list[dict]: """Load labeled documents, optionally filtering by confidence.""" docs = [] @@ -61,31 +66,14 @@ def load_labeled_data(input_path: str, min_confidence: float = 0.0) -> list[dict return docs -def fetch_texts(docs: list[dict], max_chars: int = 2000) -> dict[str, str]: - """Fetch document texts in parallel batches.""" - doc_ids = [d["id"] for d in docs] - all_texts = {} - batch_size = 100 - for i in range(0, len(doc_ids), batch_size): - batch = doc_ids[i : i + batch_size] - texts = fetch_documents_text_parallel(batch, max_chars=max_chars) - all_texts.update(texts) - fetched = min(i + batch_size, len(doc_ids)) - if fetched < len(doc_ids): - print(f" Fetched text: 
{fetched}/{len(doc_ids)}") - return all_texts - - def build_label_maps(taxonomy: dict) -> tuple[dict, dict, dict, dict]: """Build label-to-id and id-to-label mappings for both dimensions.""" type_labels = [t["id"] for t in taxonomy["document_types"]] topic_labels = [t["id"] for t in taxonomy["topics"]] - type2id = {label: i for i, label in enumerate(type_labels)} id2type = {i: label for label, i in type2id.items()} topic2id = {label: i for i, label in enumerate(topic_labels)} id2topic = {i: label for label, i in topic2id.items()} - return type2id, id2type, topic2id, id2topic @@ -116,9 +104,39 @@ def compute_metrics(eval_pred): """Compute accuracy and macro F1 for evaluation.""" predictions, labels = eval_pred preds = np.argmax(predictions, axis=-1) - acc = accuracy_score(labels, preds) - f1 = f1_score(labels, preds, average="macro") - return {"accuracy": acc, "f1_macro": f1} + return { + "accuracy": accuracy_score(labels, preds), + "f1_macro": f1_score(labels, preds, average="macro"), + } + + +def compute_class_weights(labels: list[int], num_classes: int) -> torch.Tensor: + """Compute inverse-frequency class weights.""" + from collections import Counter + + counts = Counter(labels) + total = len(labels) + weights = [total / (num_classes * counts.get(i, 1)) for i in range(num_classes)] + return torch.tensor(weights, dtype=torch.float32) + + +class WeightedTrainer(Trainer): + """Trainer with class-weighted cross-entropy loss.""" + + def __init__(self, class_weights: torch.Tensor | None = None, **kwargs): + super().__init__(**kwargs) + self.class_weights = class_weights + + def compute_loss(self, model, inputs, return_outputs=False, **kwargs): + labels = inputs.pop("labels") + outputs = model(**inputs) + logits = outputs.logits + if self.class_weights is not None: + weight = self.class_weights.to(logits.device) + loss = torch.nn.functional.cross_entropy(logits, labels, weight=weight) + else: + loss = torch.nn.functional.cross_entropy(logits, labels) + return (loss, 
outputs) if return_outputs else loss def train_classifier( @@ -141,33 +159,25 @@ def train_classifier( print(f"\n{'=' * 60}") print(f"Training: {classifier_name}") print(f" Train: {len(train_texts)}, Val: {len(val_texts)}") - print(f" Classes: {num_labels}") - print(f" Model: {model_name}") + print(f" Classes: {num_labels}, Model: {model_name}") print(f" Epochs: {epochs}, LR: {lr}, Batch: {batch_size}") print(f"{'=' * 60}") tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForSequenceClassification.from_pretrained( - model_name, - num_labels=num_labels, - id2label=id2label, - label2id=label2id, + model_name, num_labels=num_labels, id2label=id2label, label2id=label2id, ) def tokenize(examples): return tokenizer( - examples["text"], - truncation=True, - max_length=max_length, - padding="max_length", + examples["text"], truncation=True, + max_length=max_length, padding="max_length", ) train_ds = Dataset.from_dict({"text": train_texts, "label": train_labels}) val_ds = Dataset.from_dict({"text": val_texts, "label": val_labels}) - train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"]) val_ds = val_ds.map(tokenize, batched=True, remove_columns=["text"]) - train_ds.set_format("torch") val_ds.set_format("torch") @@ -193,44 +203,38 @@ def tokenize(examples): seed=42, ) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_ds, - eval_dataset=val_ds, + class_weights = compute_class_weights(train_labels, num_labels) + print(f" Class weights: {[f'{w:.2f}' for w in class_weights.tolist()]}") + + trainer = WeightedTrainer( + class_weights=class_weights, model=model, args=training_args, + train_dataset=train_ds, eval_dataset=val_ds, compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=2)], ) trainer.train() - # Evaluate on validation set eval_results = trainer.evaluate() print(f"\n Val accuracy: {eval_results['eval_accuracy']:.4f}") print(f" Val F1 macro: 
{eval_results['eval_f1_macro']:.4f}") - # Save best model best_dir = os.path.join(save_dir, "best") trainer.save_model(best_dir) tokenizer.save_pretrained(best_dir) - # Full classification report on val set preds = trainer.predict(val_ds) pred_labels = np.argmax(preds.predictions, axis=-1) report = classification_report( - val_ds["label"], - pred_labels, + val_ds["label"], pred_labels, target_names=[id2label[i] for i in range(num_labels)], ) print(f"\nClassification Report ({classifier_name}):\n{report}") - # Save report report_path = os.path.join(save_dir, "eval_report.txt") with open(report_path, "w") as f: - f.write(f"Model: {model_name}\n") - f.write(f"Classifier: {classifier_name}\n") - f.write(f"Train size: {len(train_texts)}\n") - f.write(f"Val size: {len(val_texts)}\n") + f.write(f"Model: {model_name}\nClassifier: {classifier_name}\n") + f.write(f"Train size: {len(train_texts)}\nVal size: {len(val_texts)}\n") f.write(f"Epochs: {epochs}\n") f.write(f"Val accuracy: {eval_results['eval_accuracy']:.4f}\n") f.write(f"Val F1 macro: {eval_results['eval_f1_macro']:.4f}\n\n") @@ -239,93 +243,12 @@ def tokenize(examples): return eval_results -def main(): - parser = argparse.ArgumentParser( - description="Train ModernBERT classifiers on labeled documents" - ) - parser.add_argument( - "--input", type=str, required=True, help="Labeled JSONL from label.py" - ) - parser.add_argument( - "--output-dir", - type=str, - default=DEFAULT_OUTPUT_DIR, - help=f"Output directory for models (default: {DEFAULT_OUTPUT_DIR})", - ) - parser.add_argument( - "--model", - type=str, - default=DEFAULT_MODEL, - help=f"Base model (default: {DEFAULT_MODEL})", - ) - parser.add_argument( - "--epochs", - type=int, - default=DEFAULT_EPOCHS, - help=f"Training epochs (default: {DEFAULT_EPOCHS})", - ) - parser.add_argument( - "--lr", - type=float, - default=DEFAULT_LR, - help=f"Learning rate (default: {DEFAULT_LR})", - ) - parser.add_argument( - "--batch-size", - type=int, - default=DEFAULT_BATCH_SIZE, 
- help=f"Batch size (default: {DEFAULT_BATCH_SIZE})", - ) - parser.add_argument( - "--max-length", - type=int, - default=DEFAULT_MAX_LENGTH, - help=f"Max token length (default: {DEFAULT_MAX_LENGTH})", - ) - parser.add_argument( - "--val-split", - type=float, - default=DEFAULT_VAL_SPLIT, - help=f"Validation split ratio (default: {DEFAULT_VAL_SPLIT})", - ) - parser.add_argument( - "--min-confidence", - type=float, - default=0.0, - help="Minimum LLM confidence to include (default: 0.0)", - ) - parser.add_argument( - "--max-chars", - type=int, - default=2000, - help="Max characters of document text (default: 2000)", - ) - parser.add_argument("--seed", type=int, default=42, help="Random seed") - args = parser.parse_args() - - if not os.path.exists(args.input): - print(f"ERROR: Input file not found: {args.input}") - sys.exit(1) - - # Load taxonomy and label maps - taxonomy = load_taxonomy() - type2id, id2type, topic2id, id2topic = build_label_maps(taxonomy) - print(f"Taxonomy: {taxonomy['name']} v{taxonomy['version']}") - print(f" Document types: {len(type2id)} classes") - print(f" Topics: {len(topic2id)} classes") - - # Load labeled data - docs = load_labeled_data(args.input, min_confidence=args.min_confidence) - print(f"\nLoaded {len(docs)} labeled documents") - if args.min_confidence > 0: - print(f" (filtered by confidence >= {args.min_confidence})") - - # Fetch texts - print(f"\nFetching document texts (max {args.max_chars} chars)...") - texts = fetch_texts(docs, max_chars=args.max_chars) - print(f" Got text for {sum(1 for t in texts.values() if t)}/{len(docs)} docs") - - # Prepare datasets for both classifiers +def run_training_pipeline( + docs, texts, taxonomy, type2id, id2type, topic2id, id2topic, + output_dir, model_name, epochs, lr, batch_size, max_length, val_split, seed, +): + """Shared training pipeline used by both local and Modal paths.""" + # Prepare datasets print("\nPreparing document_type dataset...") type_texts, type_labels = prepare_dataset(docs, 
texts, type2id, "document_type") print(f" {len(type_texts)} examples across {len(set(type_labels))} classes") @@ -334,92 +257,45 @@ def main(): topic_texts, topic_labels = prepare_dataset(docs, texts, topic2id, "document_topic") print(f" {len(topic_texts)} examples across {len(set(topic_labels))} classes") - # Split into train/val (same split for both to keep comparable) - print(f"\nSplitting: {1 - args.val_split:.0%} train / {args.val_split:.0%} val") + print(f"\nSplitting: {1 - val_split:.0%} train / {val_split:.0%} val") - type_train_texts, type_val_texts, type_train_labels, type_val_labels = ( - train_test_split( - type_texts, - type_labels, - test_size=args.val_split, - random_state=args.seed, - stratify=type_labels, - ) + type_train_t, type_val_t, type_train_l, type_val_l = train_test_split( + type_texts, type_labels, test_size=val_split, + random_state=seed, stratify=type_labels, ) - - topic_train_texts, topic_val_texts, topic_train_labels, topic_val_labels = ( - train_test_split( - topic_texts, - topic_labels, - test_size=args.val_split, - random_state=args.seed, - stratify=topic_labels, - ) + topic_train_t, topic_val_t, topic_train_l, topic_val_l = train_test_split( + topic_texts, topic_labels, test_size=val_split, + random_state=seed, stratify=topic_labels, ) - os.makedirs(args.output_dir, exist_ok=True) + os.makedirs(output_dir, exist_ok=True) - # Train document_type classifier type_results = train_classifier( - train_texts=type_train_texts, - train_labels=type_train_labels, - val_texts=type_val_texts, - val_labels=type_val_labels, - num_labels=len(type2id), - id2label=id2type, - label2id=type2id, - output_dir=args.output_dir, - model_name=args.model, - epochs=args.epochs, - lr=args.lr, - batch_size=args.batch_size, - max_length=args.max_length, - classifier_name="document_type", + type_train_t, type_train_l, type_val_t, type_val_l, + len(type2id), id2type, type2id, output_dir, + model_name, epochs, lr, batch_size, max_length, "document_type", ) - # 
Train topic classifier topic_results = train_classifier( - train_texts=topic_train_texts, - train_labels=topic_train_labels, - val_texts=topic_val_texts, - val_labels=topic_val_labels, - num_labels=len(topic2id), - id2label=id2topic, - label2id=topic2id, - output_dir=args.output_dir, - model_name=args.model, - epochs=args.epochs, - lr=args.lr, - batch_size=args.batch_size, - max_length=args.max_length, - classifier_name="topic", + topic_train_t, topic_train_l, topic_val_t, topic_val_l, + len(topic2id), id2topic, topic2id, output_dir, + model_name, epochs, lr, batch_size, max_length, "topic", ) - # Summary print(f"\n{'=' * 60}") print("Training Complete") print(f"{'=' * 60}") print(f" Document Type — Acc: {type_results['eval_accuracy']:.4f}, F1: {type_results['eval_f1_macro']:.4f}") print(f" Topic — Acc: {topic_results['eval_accuracy']:.4f}, F1: {topic_results['eval_f1_macro']:.4f}") - print(f"\n Models saved to: {args.output_dir}/") - print(f" {args.output_dir}/document_type/best/") - print(f" {args.output_dir}/topic/best/") - # Save training config config = { - "base_model": args.model, + "base_model": model_name, "taxonomy": taxonomy["name"], "taxonomy_version": taxonomy["version"], - "input_file": args.input, "total_docs": len(docs), - "min_confidence": args.min_confidence, - "max_chars": args.max_chars, - "max_length": args.max_length, - "epochs": args.epochs, - "learning_rate": args.lr, - "batch_size": args.batch_size, - "val_split": args.val_split, - "seed": args.seed, + "epochs": epochs, + "learning_rate": lr, + "batch_size": batch_size, "results": { "document_type": { "accuracy": type_results["eval_accuracy"], @@ -431,12 +307,315 @@ def main(): }, }, } - config_path = os.path.join(args.output_dir, "training_config.json") + config_path = os.path.join(output_dir, "training_config.json") with open(config_path, "w") as f: json.dump(config, f, indent=2) - print(f" Config saved to: {config_path}") + print(f" Config: {config_path}") + + return config + + +# 
--------------------------------------------------------------------------- +# Local training +# --------------------------------------------------------------------------- + + +def run_local(args): + """Train locally using available device (CPU/MPS/CUDA).""" + taxonomy = load_taxonomy() + type2id, id2type, topic2id, id2topic = build_label_maps(taxonomy) + print(f"Taxonomy: {taxonomy['name']} v{taxonomy['version']}") + print(f" Document types: {len(type2id)}, Topics: {len(topic2id)}") + + docs = load_labeled_data(args.input, min_confidence=args.min_confidence) + print(f"\nLoaded {len(docs)} labeled documents") + + print(f"\nFetching document texts (max {args.max_chars} chars)...") + doc_ids = [d["id"] for d in docs] + all_texts = {} + for i in range(0, len(doc_ids), 100): + batch = doc_ids[i : i + 100] + all_texts.update(fetch_documents_text_parallel(batch, max_chars=args.max_chars)) + print(f" Got text for {sum(1 for t in all_texts.values() if t)}/{len(docs)} docs") + + config = run_training_pipeline( + docs, all_texts, taxonomy, type2id, id2type, topic2id, id2topic, + args.output_dir, args.model, args.epochs, args.lr, + args.batch_size, args.max_length, args.val_split, args.seed, + ) + + print(f"\n Models saved to: {args.output_dir}/") + print(f" Next step: python classify.py --models-dir {args.output_dir}") + return config + + +# --------------------------------------------------------------------------- +# Modal cloud training +# --------------------------------------------------------------------------- + + +def run_modal(args): + """Train on Modal with a cloud GPU. 
Downloads models to --output-dir.""" + import modal + + app = modal.App("docx-classifier-training") + + training_image = ( + modal.Image.debian_slim(python_version="3.11") + .pip_install("torch", "transformers", "datasets", "scikit-learn", "accelerate", "numpy") + ) + model_volume = modal.Volume.from_name("classifier-models", create_if_missing=True) + + # Read local files to send to Modal + labeled_jsonl = Path(args.input).read_text() + taxonomy_path = Path(__file__).parent / "taxonomy.json" + with open(taxonomy_path) as f: + taxonomy = json.load(f) + + gpu_map = {"t4": "T4", "a10g": "a10g", "l4": "l4", "a100": "a100"} + gpu = gpu_map.get(args.gpu.lower(), args.gpu) + + @app.function(image=training_image, gpu=gpu, timeout=3600, volumes={"/models": model_volume}) + def train_remote(labeled_jsonl: str, taxonomy: dict, **kwargs): + """Self-contained training function running on Modal GPU.""" + import json + import os + import urllib.request + from collections import Counter + from concurrent.futures import ThreadPoolExecutor + + import numpy as np + import torch + from datasets import Dataset + from sklearn.metrics import accuracy_score, classification_report, f1_score + from sklearn.model_selection import train_test_split + from transformers import ( + AutoModelForSequenceClassification, AutoTokenizer, + EarlyStoppingCallback, Trainer, TrainingArguments, + ) + + TEXT_BASE_URL = "https://docxcorp.us/extracted" + + def fetch_text(doc_id, max_chars=2000): + try: + req = urllib.request.Request( + f"{TEXT_BASE_URL}/{doc_id}.txt", + headers={"User-Agent": "docx-classifier/2.0"}, + ) + with urllib.request.urlopen(req, timeout=15) as resp: + return resp.read().decode("utf-8")[:max_chars] + except Exception: + return "" + + def fetch_texts_parallel(docs, max_chars): + results = {} + def fetch_one(did): + return did, fetch_text(did, max_chars) + with ThreadPoolExecutor(max_workers=40) as ex: + for did, text in ex.map(fetch_one, [d["id"] for d in docs]): + results[did] = text + 
print(f" Fetched text for {sum(1 for t in results.values() if t)}/{len(docs)} docs") + return results + + class _WeightedTrainer(Trainer): + def __init__(self, class_weights=None, **kw): + super().__init__(**kw) + self.class_weights = class_weights + + def compute_loss(self, model, inputs, return_outputs=False, **kw): + labels = inputs.pop("labels") + outputs = model(**inputs) + logits = outputs.logits + if self.class_weights is not None: + w = self.class_weights.to(logits.device) + loss = torch.nn.functional.cross_entropy(logits, labels, weight=w) + else: + loss = torch.nn.functional.cross_entropy(logits, labels) + return (loss, outputs) if return_outputs else loss + + def _compute_metrics(eval_pred): + preds = np.argmax(eval_pred.predictions, axis=-1) + return { + "accuracy": accuracy_score(eval_pred.label_ids, preds), + "f1_macro": f1_score(eval_pred.label_ids, preds, average="macro"), + } + + def _class_weights(labels, n): + counts = Counter(labels) + total = len(labels) + return torch.tensor([total / (n * counts.get(i, 1)) for i in range(n)], dtype=torch.float32) + + def _train_one(train_t, train_l, val_t, val_l, n_labels, id2l, l2id, out, mname, ep, lr, bs, ml, name): + print(f"\n{'='*60}\nTraining: {name}\n Train: {len(train_t)}, Val: {len(val_t)}, Classes: {n_labels}") + print(f" Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}") + + tok = AutoTokenizer.from_pretrained(mname) + mdl = AutoModelForSequenceClassification.from_pretrained(mname, num_labels=n_labels, id2label=id2l, label2id=l2id) + + def tokenize(ex): + return tok(ex["text"], truncation=True, max_length=ml, padding="max_length") + + tds = Dataset.from_dict({"text": train_t, "label": train_l}).map(tokenize, batched=True, remove_columns=["text"]) + vds = Dataset.from_dict({"text": val_t, "label": val_l}).map(tokenize, batched=True, remove_columns=["text"]) + tds.set_format("torch"); vds.set_format("torch") + + sd = os.path.join(out, name) + args = TrainingArguments( + 
output_dir=sd, num_train_epochs=ep, per_device_train_batch_size=bs, + per_device_eval_batch_size=bs*2, learning_rate=lr, weight_decay=0.01, + warmup_ratio=0.1, eval_strategy="epoch", save_strategy="epoch", + load_best_model_at_end=True, metric_for_best_model="f1_macro", + greater_is_better=True, save_total_limit=2, logging_steps=50, + fp16=torch.cuda.is_available(), report_to="none", seed=42, + ) + cw = _class_weights(train_l, n_labels) + trainer = _WeightedTrainer( + class_weights=cw, model=mdl, args=args, + train_dataset=tds, eval_dataset=vds, compute_metrics=_compute_metrics, + callbacks=[EarlyStoppingCallback(early_stopping_patience=2)], + ) + trainer.train() + res = trainer.evaluate() + best = os.path.join(sd, "best") + trainer.save_model(best); tok.save_pretrained(best) + + preds = trainer.predict(vds) + pl = np.argmax(preds.predictions, axis=-1) + report = classification_report(vds["label"], pl, target_names=[id2l[i] for i in range(n_labels)]) + print(f"\nClassification Report ({name}):\n{report}") + return res + + # --- Main pipeline --- + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Device: {device} ({torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'})") + + docs = [json.loads(l) for l in labeled_jsonl.strip().split("\n") if l.strip()] + docs = [d for d in docs if d.get("confidence", 0) >= kwargs.get("min_confidence", 0)] + print(f"Loaded {len(docs)} documents") + + texts = fetch_texts_parallel(docs, kwargs.get("max_chars", 2000)) + + type_labels = [t["id"] for t in taxonomy["document_types"]] + topic_labels = [t["id"] for t in taxonomy["topics"]] + type2id = {l: i for i, l in enumerate(type_labels)} + id2type = {i: l for l, i in type2id.items()} + topic2id = {l: i for i, l in enumerate(topic_labels)} + id2topic = {i: l for l, i in topic2id.items()} + + def prep(field, lmap): + it, il = [], [] + for d in docs: + t, lab = texts.get(d["id"], ""), d.get(field, "") + if t and lab in lmap: + it.append(t); 
il.append(lmap[lab]) + return it, il + + tt, tl = prep("document_type", type2id) + tpt, tpl = prep("document_topic", topic2id) + + vs = kwargs.get("val_split", 0.15) + sd = kwargs.get("seed", 42) + ttt, tvt, ttl, tvl = train_test_split(tt, tl, test_size=vs, random_state=sd, stratify=tl) + tptt, tpvt, tptl, tpvl = train_test_split(tpt, tpl, test_size=vs, random_state=sd, stratify=tpl) + + out = "/models" + os.makedirs(out, exist_ok=True) + mn = kwargs.get("model_name", "xlm-roberta-base") + ep = kwargs.get("epochs", 5) + lr = kwargs.get("lr", 2e-5) + bs = kwargs.get("batch_size", 16) + ml = kwargs.get("max_length", 512) + + tr = _train_one(ttt, ttl, tvt, tvl, len(type2id), id2type, type2id, out, mn, ep, lr, bs, ml, "document_type") + tpr = _train_one(tptt, tptl, tpvt, tpvl, len(topic2id), id2topic, topic2id, out, mn, ep, lr, bs, ml, "topic") + + config = { + "base_model": mn, "taxonomy": taxonomy["name"], + "taxonomy_version": taxonomy["version"], "total_docs": len(docs), + "epochs": ep, "learning_rate": lr, "batch_size": bs, + "results": { + "document_type": {"accuracy": tr["eval_accuracy"], "f1_macro": tr["eval_f1_macro"]}, + "topic": {"accuracy": tpr["eval_accuracy"], "f1_macro": tpr["eval_f1_macro"]}, + }, + } + with open(os.path.join(out, "training_config.json"), "w") as f: + json.dump(config, f, indent=2) + return config + + @app.function(image=training_image, volumes={"/models": model_volume}) + def collect_models() -> dict[str, bytes]: + model_volume.reload() + files = {} + for root, _dirs, filenames in os.walk("/models"): + if "/best" in root or root == "/models": + for fname in filenames: + full = os.path.join(root, fname) + files[full.replace("/models/", "")] = open(full, "rb").read() + return files + + print(f"Submitting training job to Modal ({gpu} GPU)...") + print(f" Input: {args.input} ({labeled_jsonl.count(chr(10))} lines)") + print(f" Model: {args.model}, Epochs: {args.epochs}") + print() + + with app.run(): + config = train_remote.remote( + 
def main():
    """Command-line entry point for classifier training.

    Parses the CLI options, verifies that the ``--input`` path exists, and
    then hands the parsed arguments to ``run_modal`` when ``--modal`` is set
    or to ``run_local`` otherwise.

    Raises:
        SystemExit: with status 1 (message on stderr) when ``--input`` does
            not exist; argparse itself exits on invalid or missing options.
    """
    parser = argparse.ArgumentParser(description="Train classifiers on labeled documents")
    parser.add_argument("--input", type=str, required=True, help="Labeled JSONL from label.py")
    parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR)
    parser.add_argument("--model", type=str, default=DEFAULT_MODEL)
    parser.add_argument("--epochs", type=int, default=DEFAULT_EPOCHS)
    parser.add_argument("--lr", type=float, default=DEFAULT_LR)
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--max-length", type=int, default=DEFAULT_MAX_LENGTH)
    parser.add_argument("--val-split", type=float, default=DEFAULT_VAL_SPLIT)
    parser.add_argument("--min-confidence", type=float, default=0.0)
    parser.add_argument("--max-chars", type=int, default=2000)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--modal", action="store_true", help="Train on Modal cloud GPU")
    parser.add_argument("--gpu", type=str, default="T4", help="Modal GPU type: T4, a10g, l4, a100")
    args = parser.parse_args()

    if not os.path.exists(args.input):
        # sys.exit(<str>) prints the message to stderr and exits with
        # status 1, so the error no longer pollutes stdout.
        sys.exit(f"ERROR: Input file not found: {args.input}")

    if args.modal:
        run_modal(args)
    else:
        run_local(args)