diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3031477..4dc01e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,6 +16,7 @@ jobs: python-version: "3.12" - run: pip install ruff - run: ruff check . + - run: ruff format --check . test: runs-on: ubuntu-latest diff --git a/CLAUDE.md b/CLAUDE.md index 6a25086..6589826 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -14,17 +14,18 @@ ETL pipeline for building and maintaining a PostgreSQL cache of Discogs release 1. **Download** Discogs monthly data dumps (XML) from https://discogs-data-dumps.s3.us-west-2.amazonaws.com/index.html 2. **Convert** XML to CSV using [discogs-xml2db](https://github.com/philipmat/discogs-xml2db) (not a PyPI package; must be cloned separately) 3. **Fix newlines** in CSV fields (`scripts/fix_csv_newlines.py`) -4. **Filter** CSVs to library-matching artists only (`scripts/filter_csv.py`) -- ~70% data reduction -5. **Create schema** (`schema/create_database.sql`) -6. **Import** filtered CSVs into PostgreSQL (`scripts/import_csv.py`) -7. **Create indexes** including trigram GIN indexes (`schema/create_indexes.sql`) -8. **Deduplicate** by master_id (`scripts/dedup_releases.py`) -9. **Prune** to library matches (`scripts/verify_cache.py --prune`) -- ~89% data reduction (3 GB -> 340 MB) -10. **Vacuum** to reclaim disk space (`VACUUM FULL`) +4. **Enrich** `library_artists.txt` with WXYC cross-references (`scripts/enrich_library_artists.py`, optional) +5. **Filter** CSVs to library-matching artists only (`scripts/filter_csv.py`) -- ~70% data reduction +6. **Create schema** (`schema/create_database.sql`) +7. **Import** filtered CSVs into PostgreSQL (`scripts/import_csv.py`) +8. **Create indexes** including trigram GIN indexes (`schema/create_indexes.sql`) +9. **Deduplicate** by master_id (`scripts/dedup_releases.py`) +10. **Prune** to library matches (`scripts/verify_cache.py --prune`) -- ~89% data reduction (3 GB -> 340 MB) +11. **Vacuum** to reclaim disk space (`VACUUM FULL`) `scripts/run_pipeline.py` supports two modes: -- `--xml` mode: runs steps 2-10 (XML conversion through vacuum) -- `--csv-dir` mode: runs steps 5-10 (database build from pre-filtered CSVs) +- `--xml` mode: runs steps 2-11 (XML conversion through vacuum) +- `--csv-dir` mode: runs steps 6-11 (database build from pre-filtered CSVs) Step 1 (download) is always manual. @@ -54,7 +55,8 @@ docker compose up db -d # just the database (for tests) ### Key Files -- `scripts/run_pipeline.py` -- Pipeline orchestrator (--xml for steps 2-10, --csv-dir for steps 5-10) +- `scripts/run_pipeline.py` -- Pipeline orchestrator (--xml for steps 2-11, --csv-dir for steps 6-11) +- `scripts/enrich_library_artists.py` -- Enrich artist list with WXYC cross-references (pymysql) - `scripts/filter_csv.py` -- Filter Discogs CSVs to library artists - `scripts/import_csv.py` -- Import CSVs into PostgreSQL (psycopg COPY) - `scripts/dedup_releases.py` -- Deduplicate releases by master_id (copy-swap with `DROP CASCADE`) @@ -86,12 +88,15 @@ pytest tests/unit/ -v DATABASE_URL_TEST=postgresql://discogs:discogs@localhost:5433/postgres \ pytest -m postgres -v +# MySQL integration tests (needs WXYC MySQL on port 3307) +pytest -m mysql -v + # E2E tests (runs full pipeline as subprocess against test Postgres) DATABASE_URL_TEST=postgresql://discogs:discogs@localhost:5433/postgres \ pytest -m e2e -v ``` -Markers: `postgres` (needs PostgreSQL), `e2e` (full pipeline), `integration` (needs library.db). 
Integration and E2E tests are excluded from the default `pytest` run via `addopts` in `pyproject.toml`. +Markers: `postgres` (needs PostgreSQL), `mysql` (needs WXYC MySQL), `e2e` (full pipeline), `integration` (needs library.db). Integration and E2E tests are excluded from the default `pytest` run via `addopts` in `pyproject.toml`. Test fixtures are in `tests/fixtures/` (CSV files, library.db, library_artists.txt). Regenerate with `python tests/fixtures/create_fixtures.py`. diff --git a/Dockerfile b/Dockerfile index b6f60f3..ba17ea2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,8 @@ RUN pip install --no-cache-dir \ "psycopg[binary]>=3.1.0" \ "asyncpg>=0.29.0" \ "rapidfuzz>=3.0.0" \ - "lxml>=4.9.0" + "lxml>=4.9.0" \ + "pymysql>=1.0.0" # Copy application code COPY scripts/ scripts/ diff --git a/README.md b/README.md index 75139e5..a4e244f 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ All 9 steps are automated by `run_pipeline.py` (or Docker Compose). The script s |------|--------|-------------| | 1. Convert | discogs-xml2db | XML data dump to CSV | | 2. Fix newlines | `scripts/fix_csv_newlines.py` | Clean embedded newlines in CSV fields | +| 2.5. Enrich | `scripts/enrich_library_artists.py` | Enrich artist list with cross-references (optional) | | 3. Filter | `scripts/filter_csv.py` | Keep only library artists (~70% reduction) | | 4. Create schema | `schema/create_database.sql` | Set up tables and constraints | | 5. Import | `scripts/import_csv.py` | Bulk load CSVs via psycopg COPY | @@ -40,6 +41,8 @@ All 9 steps are automated by `run_pipeline.py` (or Docker Compose). The script s | 8. Prune | `scripts/verify_cache.py --prune` | Remove non-library releases (~89% reduction) | | 9. Vacuum | `VACUUM FULL` | Reclaim disk space | +Step 2.5 generates `library_artists.txt` from `library.db` and optionally enriches it with alternate artist names and cross-references from the WXYC MySQL catalog database. This reduces false negatives at the filtering stage for artists known by multiple names (e.g., "Body Count" filed under Ice-T). + ### Docker Compose The easiest way to run the full pipeline: @@ -70,6 +73,18 @@ python scripts/run_pipeline.py \ --database-url postgresql://localhost:5432/discogs ``` +To enrich `library_artists.txt` with alternate names and cross-references from the WXYC catalog database, add `--wxyc-db-url`: + +```bash +python scripts/run_pipeline.py \ + --xml /path/to/releases.xml.gz \ + --xml2db /path/to/discogs-xml2db/ \ + --library-artists /path/to/library_artists.txt \ + --library-db /path/to/library.db \ + --wxyc-db-url mysql://user:pass@host:port/wxycmusic \ + --database-url postgresql://localhost:5432/discogs +``` + Database build from pre-filtered CSVs (steps 4-9): ```bash @@ -161,6 +176,9 @@ pytest tests/unit/ -v DATABASE_URL_TEST=postgresql://discogs:discogs@localhost:5433/postgres \ pytest -m postgres -v +# MySQL integration tests (needs WXYC MySQL on port 3307) +pytest -m mysql -v + # E2E tests (needs PostgreSQL, runs full pipeline as subprocess) DATABASE_URL_TEST=postgresql://discogs:discogs@localhost:5433/postgres \ pytest -m e2e -v diff --git a/docs/discogs-cache-technical-overview.md b/docs/discogs-cache-technical-overview.md new file mode 100644 index 0000000..4839074 --- /dev/null +++ b/docs/discogs-cache-technical-overview.md @@ -0,0 +1,221 @@ +# Discogs Cache: Technical Overview + +Several WXYC tools — song search, catalog lookup, flowsheet entries, and the request bot — need artist and album metadata from Discogs. 
Previously, every lookup hit the Discogs API in real time. A single request could require 2-22 API calls depending on the search path, and the API rate limit is 60 requests/minute. This made searches slow (6-30 seconds) and bulk operations impractical.
+
+## Solution
+
+The Discogs cache is an ETL pipeline that produces a local PostgreSQL database from Discogs' monthly data dumps. It filters the full 48 GB Discogs database down to a 340 MB database containing only releases by artists in the WXYC library.
+
+**Key results** (benchmarked on staging, 2026-02-10):
+
+- **52x average speedup** — response time, averaged across the five benchmark paths, drops from 19.6 seconds (uncached) to 379 ms (cached)
+- **Worst case**: 30s song lookup down to 454 ms (66x)
+- **Best case**: artist-only search down from 6.3s to 216 ms (29x)
+- **Zero Discogs API calls** on cached requests (previously 2-22 per request)
+
+Full benchmark methodology and results are in the [Performance](#performance) section.
+
+## Pipeline
+
+The pipeline runs monthly when Discogs publishes new data dumps. It can be run via Docker Compose or the orchestration script.
+
+| Step | Description | Tool |
+|------|-------------|------|
+| 1. Download | Fetch monthly XML dump from Discogs | Manual |
+| 2. Convert | XML to CSV | discogs-xml2db |
+| 3. Fix newlines | Clean embedded newlines in CSV fields | `fix_csv_newlines.py` |
+| 4. Enrich artists | Add alternate names and cross-references to artist list | `enrich_library_artists.py` |
+| 5. Filter | Keep only releases by library artists (~70% reduction) | `filter_csv.py` |
+| 6. Create schema | Set up PostgreSQL tables and constraints | `create_database.sql` |
+| 7. Import | Bulk load CSVs via psycopg COPY | `import_csv.py` |
+| 8. Create indexes | Trigram GIN indexes for fuzzy text search | `create_indexes.sql` |
+| 9. Deduplicate | Keep best release per master_id | `dedup_releases.py` |
+| 10. Prune | Remove releases that don't match library entries (~89% reduction) | `verify_cache.py` |
+| 11. Vacuum | Reclaim disk space | `VACUUM FULL` |
+
+### Two-stage filtering
+
+The pipeline filters data in two stages to make the 48 GB dump manageable:
+
+**Stage 1 (step 5):** Filters by artist name. If an artist in the Discogs data matches an artist in the WXYC library, all of that artist's releases are kept. This is a coarse cut that removes ~70% of the data.
+
+**Stage 2 (step 10):** Filters by release. Uses multi-index fuzzy matching to compare each remaining release against the WXYC library catalog. Releases that don't match any library entry are pruned. This is a fine-grained cut that removes another ~89% of what survived Stage 1.
+
+### Artist name enrichment (step 4)
+
+Stage 1 uses exact name matching, which misses releases credited under alternate names. The enrichment step addresses this by expanding the artist list with data from the WXYC catalog database:
+
+#### Alternate artist names
+Releases filed under one artist but credited to another (e.g., "Body Count" filed under Ice-T, "Bobby Digital" filed under RZA). Source: `LIBRARY_RELEASE.ALTERNATE_ARTIST_NAME` (~3,935 names).
+
+#### Artist cross-references
+Links between related artists: solo projects, band members, name variations (e.g., "Crooked Fingers" cross-referenced with Eric Bachmann). Source: `LIBRARY_CODE_CROSS_REFERENCE` (~189 names).
+
+#### Release cross-references
+Artists linked to specific releases filed under other artists, such as collaborations and remixes. Source: `RELEASE_CROSS_REFERENCE` (~29 names).
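+
+As a concrete sketch, the first of these sources boils down to a single `SELECT DISTINCT` against the WXYC MySQL catalog. The query below mirrors the one in `scripts/enrich_library_artists.py`; the host, port, user, and password are placeholders.
+
+```python
+# Alternate-name extraction (enrichment step). Connection parameters are
+# placeholders; the query is the one used by enrich_library_artists.py.
+import pymysql
+
+conn = pymysql.connect(host="localhost", port=3306, user="wxyc",
+                       password="...", database="wxycmusic", charset="utf8")
+with conn.cursor() as cur:
+    cur.execute("""
+        SELECT DISTINCT ALTERNATE_ARTIST_NAME
+        FROM LIBRARY_RELEASE
+        WHERE ALTERNATE_ARTIST_NAME IS NOT NULL
+          AND ALTERNATE_ARTIST_NAME != ''
+    """)
+    # e.g. {"Body Count", "Bobby Digital", ...}
+    alternates = {row[0].strip() for row in cur if row[0] and row[0].strip()}
+conn.close()
+```
+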
+ +### Fuzzy text search + +The database uses PostgreSQL's `pg_trgm` extension with GIN indexes for fuzzy matching. This handles: + +- Spelling variations ("Thee Oh Sees" vs "OHSEES") +- Data entry inconsistencies in the WXYC catalog +- Partial matches and typos + +Four trigram indexes cover track titles, artist names on releases, artist names on tracks (for compilations), and release titles. + +## Database schema + +| Table | Description | +|-------|-------------| +| `release` | Release metadata: id, title, year, artwork URL | +| `release_artist` | Artists credited on releases | +| `release_track` | Tracks with position and duration | +| `release_track_artist` | Artists on specific tracks (compilations) | +| `cache_metadata` | Data freshness tracking | + +Consumers connect via the `DATABASE_URL_DISCOGS` environment variable. + +## Performance + +### What's being measured + +The benchmarks measure end-to-end request latency through the full `/request` pipeline: AI parsing, library search, Discogs lookups, and artwork fetching. Each query exercises a different search path, and each path makes a different number of Discogs API calls. + +Two modes are compared: + +- **Cached**: Normal operation. The in-memory TTL cache and PostgreSQL cache serve repeat queries without hitting the Discogs API. +- **Uncached** (`skip_cache=True`): Bypasses all caches, forcing every Discogs lookup through the API. This simulates a cold start or first-time query. + +### Network flow + +#### Cached request + +When caches are warm, most Discogs data is served from the in-memory TTL cache. No external API calls are needed for repeat queries. + +```mermaid +sequenceDiagram + participant Client + participant FastAPI + participant Groq as Groq AI + participant MemCache as In-Memory Cache + participant Library as Library DB (SQLite) + + Client->>FastAPI: POST /request + FastAPI->>Groq: Parse message + Groq-->>FastAPI: {artist, song, album} + + FastAPI->>MemCache: Album lookup? + MemCache-->>FastAPI: Cached result + + FastAPI->>Library: Search (artist + album) + Library-->>FastAPI: Library results + + FastAPI->>MemCache: Artwork search + MemCache-->>FastAPI: Cached artwork + + FastAPI-->>Client: Response (~300ms) +``` + +#### Uncached request (`skip_cache=True`) + +With caches bypassed, every Discogs lookup hits the external API. A single request can make 2-22 API calls depending on the search path, each subject to network latency and rate limiting. 
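+
+To put the rate limit in perspective: at 60 requests/minute, a worst-case request needing 22 Discogs calls consumes roughly 22 seconds of API budget once throttling kicks in, before counting network latency, which lines up with the 6-30 second uncached latencies measured below.
+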
+ +```mermaid +sequenceDiagram + participant Client + participant FastAPI + participant Groq as Groq AI + participant Discogs as Discogs API + participant Library as Library DB (SQLite) + + Client->>FastAPI: POST /request (skip_cache=true) + FastAPI->>Groq: Parse message + Groq-->>FastAPI: {artist, song, album} + + rect rgb(255, 240, 240) + note right of Discogs: 1-2 API calls + FastAPI->>Discogs: Search releases by track + Discogs-->>FastAPI: Release list + end + + FastAPI->>Library: Search (artist + album) + Library-->>FastAPI: Library results + + rect rgb(255, 240, 240) + note right of Discogs: 1-5 API calls per result + loop Each library result + FastAPI->>Discogs: Search for artwork + Discogs-->>FastAPI: Artwork URL + end + end + + rect rgb(255, 240, 240) + note right of Discogs: 2-3 API calls per result (Path C only) + loop Track validation (if fallback) + FastAPI->>Discogs: Search release + Discogs-->>FastAPI: Release ID + FastAPI->>Discogs: Get release tracklist + Discogs-->>FastAPI: Tracklist + end + end + + FastAPI-->>Client: Response (~6-30 sec) +``` + +### Search paths + +| Path | Description | Trigger | Discogs API Calls | +|------|-------------|---------|-------------------| +| **A** | Artist + Album | Album provided in query | 1-5 (artwork only) | +| **B** | Song lookup | Song without album; Discogs resolves album | 2-7 | +| **C** | Track validation | Library falls back to artist-only; validates each album's tracklist | 12-22 | +| **D** | Compilation search | Primary search finds nothing; cross-references Discogs tracklists | 3-9 | +| **E** | Artist only | No song or album parsed | 1-5 (artwork only) | + +### Results + +Server: `https://request-o-matic-staging.up.railway.app` +Date: 2026-02-10 + +| Path | Label | Uncached | Cached (median) | Cached (p95) | Speedup | API Calls | +|------|-------|----------|-----------------|--------------|---------|-----------| +| A | Artist + Album | 18,804 ms | 273 ms | 308 ms | 68.8x | 0 (1-5) | +| B | Song lookup | 30,137 ms | 454 ms | 492 ms | 66.4x | 23 (2-7) | +| C | Track validation | 19,331 ms | 551 ms | 580 ms | 35.1x | 18 (12-22) | +| D | Compilation | 23,624 ms | 402 ms | 461 ms | 58.8x | 22 (3-9) | +| E | Artist only | 6,335 ms | 216 ms | 219 ms | 29.3x | 7 (1-5) | +| | **Average** | **19,646 ms** | **379 ms** | | **51.8x** | | + +Cached iterations per query: 5. Uncached iterations per query: 1 (to preserve API rate limits). + +#### Notes + +- **API calls column** shows actual calls observed (uncached), with the expected range in parentheses. Some observed values exceed the expected range because the Discogs API sometimes returns no results on the strict search, triggering a fuzzy fallback (a second API call per lookup). +- **Path A shows 0 API calls** because staging does not have the PostgreSQL cache connected (`discogs_cache: unavailable`), so the telemetry counters undercount in some code paths. With PG cache enabled, this column would be more accurate. +- **Path C** is marked `xfail` in integration tests due to a known inconsistency in the track validation fallback. Despite this, the benchmark completes successfully. 
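+- **Speedup** is the uncached time divided by the cached median for each path (e.g., Path B: 30,137 ms / 454 ms ≈ 66.4x); the Average row's 51.8x is the mean uncached time over the mean cached median (19,646 ms / 379 ms).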
+ +### Reproducing + +```bash +# Against staging (default 10 cached iterations) +venv/bin/python scripts/benchmark_requests.py --staging + +# More iterations for tighter confidence +venv/bin/python scripts/benchmark_requests.py --staging -n 50 + +# Against local server +venv/bin/python scripts/benchmark_requests.py --local + +# Skip warmup if caches are already populated +venv/bin/python scripts/benchmark_requests.py --staging --skip-warmup +``` + +## Consumers + +- **request-parser** (Python/FastAPI) — `discogs/cache_service.py` queries the cache for album lookups, track validation, and artwork +- **Backend-Service** (TypeScript/Node.js) — planned + +## Repository + +https://github.com/WXYC/discogs-cache diff --git a/hooks/pre-commit b/hooks/pre-commit new file mode 100755 index 0000000..11cbab9 --- /dev/null +++ b/hooks/pre-commit @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# Pre-commit hook: run ruff check and ruff format on staged Python files. +# Install: ln -sf ../../hooks/pre-commit .git/hooks/pre-commit + +set -euo pipefail + +# Collect staged .py files (added, copied, modified, renamed) +staged_files=$(git diff --cached --name-only --diff-filter=ACMR -- '*.py') + +if [ -z "$staged_files" ]; then + exit 0 +fi + +# Use venv ruff if available, otherwise fall back to system ruff +if [ -x ".venv/bin/ruff" ]; then + RUFF=".venv/bin/ruff" +elif command -v ruff &>/dev/null; then + RUFF="ruff" +else + echo "ruff not found. Install it or create a .venv with ruff." + exit 1 +fi + +# shellcheck disable=SC2086 +$RUFF check $staged_files || { + echo "" + echo "ruff check failed. Fix the issues above before committing." + exit 1 +} + +# shellcheck disable=SC2086 +$RUFF format --check $staged_files || { + echo "" + echo "ruff format check failed. Run 'ruff format .' to fix." + exit 1 +} diff --git a/pyproject.toml b/pyproject.toml index 06fcb13..50bf7f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,16 +10,23 @@ dependencies = [ ] [project.optional-dependencies] +mysql = [ + "pymysql>=1.0.0", +] dev = [ "pytest>=7.0.0", "pytest-asyncio>=0.21.0", "ruff>=0.4.0", + "pymysql>=1.0.0", ] [build-system] requires = ["setuptools>=65.0", "wheel"] build-backend = "setuptools.build_meta" +[tool.setuptools] +packages = [] + [tool.ruff] line-length = 100 target-version = "py39" @@ -41,7 +48,8 @@ ignore = [ markers = [ "integration: marks tests as integration tests (needs library.db)", "postgres: marks tests that need a PostgreSQL database", + "mysql: marks tests that need a MySQL database", "e2e: full pipeline end-to-end test", ] -addopts = "-m 'not integration and not postgres and not e2e'" +addopts = "-m 'not integration and not postgres and not mysql and not e2e'" testpaths = ["tests"] diff --git a/scripts/enrich_library_artists.py b/scripts/enrich_library_artists.py new file mode 100644 index 0000000..94d2af4 --- /dev/null +++ b/scripts/enrich_library_artists.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 +"""Generate an enriched library_artists.txt from library.db and WXYC MySQL data. + +Extracts base artist names from the SQLite library database, then optionally +enriches with alternate names, cross-references, and release cross-references +from the WXYC MySQL catalog database. 
+ +Usage: + # From library.db only (no MySQL enrichment): + python scripts/enrich_library_artists.py \\ + --library-db library.db \\ + --output library_artists.txt + + # With WXYC MySQL enrichment: + python scripts/enrich_library_artists.py \\ + --library-db library.db \\ + --wxyc-db-url mysql://user:pass@host:port/dbname \\ + --output library_artists.txt +""" + +from __future__ import annotations + +import argparse +import logging +import sqlite3 +import sys +from pathlib import Path +from urllib.parse import urlparse + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + +# Import compilation detection from lib/matching.py +_LIB_DIR = Path(__file__).parent.parent / "lib" +sys.path.insert(0, str(_LIB_DIR.parent)) +from lib.matching import is_compilation_artist # noqa: E402 + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--library-db", + type=Path, + required=True, + metavar="FILE", + help="Path to library.db (SQLite database with artist/title pairs).", + ) + parser.add_argument( + "--wxyc-db-url", + type=str, + default=None, + metavar="URL", + help="MySQL connection URL for WXYC catalog database " + "(e.g. mysql://user:pass@host:port/dbname). " + "If omitted, only base artists from library.db are extracted.", + ) + parser.add_argument( + "--output", + type=Path, + required=True, + metavar="FILE", + help="Output path for library_artists.txt.", + ) + return parser.parse_args(argv) + + +def extract_base_artists(library_db_path: Path) -> set[str]: + """Extract unique artist names from library.db, excluding compilations. + + Args: + library_db_path: Path to the SQLite library database. + + Returns: + Set of distinct artist names (original case, no compilations). + """ + logger.info("Extracting base artists from %s", library_db_path) + conn = sqlite3.connect(library_db_path) + try: + cur = conn.execute("SELECT DISTINCT artist FROM library") + artists = set() + for (name,) in cur: + if name and name.strip() and not is_compilation_artist(name): + artists.add(name.strip()) + finally: + conn.close() + logger.info("Extracted %d base artists", len(artists)) + return artists + + +def connect_mysql(db_url: str): + """Connect to MySQL using a URL string. + + Args: + db_url: MySQL connection URL (mysql://user:pass@host:port/dbname). + + Returns: + A pymysql connection object. + """ + import pymysql + + parsed = urlparse(db_url) + return pymysql.connect( + host=parsed.hostname or "localhost", + port=parsed.port or 3306, + user=parsed.username or "root", + password=parsed.password or "", + database=parsed.path.lstrip("/"), + charset="utf8", + ) + + +def extract_alternate_names(conn) -> set[str]: + """Extract alternate artist names from LIBRARY_RELEASE. + + These are releases filed under one artist but credited to a different name + (e.g., "Body Count" filed under Ice-T). + + Args: + conn: MySQL connection. + + Returns: + Set of alternate artist name strings. 
+ """ + logger.info("Extracting alternate artist names from LIBRARY_RELEASE") + with conn.cursor() as cur: + cur.execute(""" + SELECT DISTINCT ALTERNATE_ARTIST_NAME + FROM LIBRARY_RELEASE + WHERE ALTERNATE_ARTIST_NAME IS NOT NULL + AND ALTERNATE_ARTIST_NAME != '' + """) + names = {row[0].strip() for row in cur if row[0] and row[0].strip()} + logger.info("Found %d alternate artist names", len(names)) + return names + + +def extract_cross_referenced_artists(conn) -> set[str]: + """Extract artist names from both sides of LIBRARY_CODE_CROSS_REFERENCE. + + These link related artists (solo projects, band members, name variants). + + Args: + conn: MySQL connection. + + Returns: + Set of cross-referenced artist names. + """ + logger.info("Extracting cross-referenced artists from LIBRARY_CODE_CROSS_REFERENCE") + with conn.cursor() as cur: + cur.execute(""" + SELECT DISTINCT lc.PRESENTATION_NAME + FROM LIBRARY_CODE_CROSS_REFERENCE cr + JOIN LIBRARY_CODE lc ON lc.ID = cr.CROSS_REFERENCING_ARTIST_ID + UNION + SELECT DISTINCT lc.PRESENTATION_NAME + FROM LIBRARY_CODE_CROSS_REFERENCE cr + JOIN LIBRARY_CODE lc ON lc.ID = cr.CROSS_REFERENCED_LIBRARY_CODE_ID + """) + names = {row[0].strip() for row in cur if row[0] and row[0].strip()} + logger.info("Found %d cross-referenced artist names", len(names)) + return names + + +def extract_release_cross_ref_artists(conn) -> set[str]: + """Extract artist names linked via RELEASE_CROSS_REFERENCE. + + These are artists associated with specific releases filed under other artists + (collaborations, featured artists, remixers). + + Args: + conn: MySQL connection. + + Returns: + Set of artist names from release cross-references. + """ + logger.info("Extracting artists from RELEASE_CROSS_REFERENCE") + with conn.cursor() as cur: + cur.execute(""" + SELECT DISTINCT lc.PRESENTATION_NAME + FROM RELEASE_CROSS_REFERENCE rcr + JOIN LIBRARY_CODE lc ON lc.ID = rcr.CROSS_REFERENCING_ARTIST_ID + """) + names = {row[0].strip() for row in cur if row[0] and row[0].strip()} + logger.info("Found %d release cross-reference artist names", len(names)) + return names + + +def merge_and_write( + base: set[str], + alternates: set[str], + cross_refs: set[str], + release_cross_refs: set[str], + output: Path, +) -> None: + """Merge all artist name sources and write to output file. + + Names are sorted alphabetically for stable diffs. Empty strings and + compilation artist names are filtered out. Original case is preserved. + + Args: + base: Artist names from library.db. + alternates: Alternate artist names from LIBRARY_RELEASE. + cross_refs: Artist names from LIBRARY_CODE_CROSS_REFERENCE. + release_cross_refs: Artist names from RELEASE_CROSS_REFERENCE. + output: Path to write the output file. 
+ """ + all_names = base | alternates | cross_refs | release_cross_refs + filtered = sorted( + name for name in all_names if name and name.strip() and not is_compilation_artist(name) + ) + + new_from_alternates = len(alternates - base) + new_from_cross_refs = len(cross_refs - base - alternates) + new_from_release_xrefs = len(release_cross_refs - base - alternates - cross_refs) + + logger.info( + "Merged: %d base + %d new from alternates + %d new from cross-refs " + "+ %d new from release cross-refs = %d total", + len(base), + new_from_alternates, + new_from_cross_refs, + new_from_release_xrefs, + len(filtered), + ) + + with open(output, "w", encoding="utf-8") as f: + for name in filtered: + f.write(name + "\n") + + logger.info("Wrote %d artist names to %s", len(filtered), output) + + +def main() -> None: + args = parse_args() + + if not args.library_db.exists(): + logger.error("library.db not found: %s", args.library_db) + sys.exit(1) + + # Source 1: Base names from library.db + base = extract_base_artists(args.library_db) + + # Source 2: WXYC MySQL enrichment (optional) + alternates: set[str] = set() + cross_refs: set[str] = set() + release_cross_refs: set[str] = set() + + if args.wxyc_db_url: + conn = connect_mysql(args.wxyc_db_url) + try: + alternates = extract_alternate_names(conn) + cross_refs = extract_cross_referenced_artists(conn) + release_cross_refs = extract_release_cross_ref_artists(conn) + finally: + conn.close() + + # Merge and write output + args.output.parent.mkdir(parents=True, exist_ok=True) + merge_and_write(base, alternates, cross_refs, release_cross_refs, args.output) + + +if __name__ == "__main__": + main() diff --git a/scripts/filter_csv.py b/scripts/filter_csv.py index 1ba5406..0e714fd 100644 --- a/scripts/filter_csv.py +++ b/scripts/filter_csv.py @@ -1,6 +1,4 @@ #!/usr/bin/env python3 -from __future__ import annotations - """Filter Discogs CSV exports to only include releases by library artists. This script significantly reduces the data size by only keeping releases @@ -10,6 +8,8 @@ python filter_discogs_csv.py /path/to/library_artists.txt /path/to/csv_output/ /path/to/filtered_output/ """ +from __future__ import annotations + import csv import logging import sys diff --git a/scripts/import_csv.py b/scripts/import_csv.py index 6b837aa..cde4f16 100644 --- a/scripts/import_csv.py +++ b/scripts/import_csv.py @@ -1,6 +1,4 @@ #!/usr/bin/env python3 -from __future__ import annotations - """Import Discogs CSV files into PostgreSQL with proper multiline handling. Imports only the columns needed by the optimized schema (see 04-create-database.sql). @@ -8,6 +6,8 @@ The release_image.csv is processed separately to populate artwork_url on release. """ +from __future__ import annotations + import csv import logging import re diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 9e29b20..e4e560b 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -9,6 +9,7 @@ --xml2db \\ --library-artists \\ [--library-db ] \\ + [--wxyc-db-url ] \\ [--database-url ] Database build from pre-filtered CSVs (steps 4-9): @@ -88,6 +89,16 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: help="Path to library.db for KEEP/PRUNE classification " "(optional; if omitted, the prune step is skipped).", ) + parser.add_argument( + "--wxyc-db-url", + type=str, + default=None, + metavar="URL", + help="MySQL connection URL for WXYC catalog database " + "(e.g. mysql://user:pass@host:port/dbname). 
" + "Enriches library_artists.txt with alternate names and cross-references. " + "Requires --library-db.", + ) parser.add_argument( "--database-url", type=str, @@ -105,6 +116,9 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: if args.library_artists is None: parser.error("--library-artists is required when using --xml") + if args.wxyc_db_url and not args.library_db: + parser.error("--library-db is required when using --wxyc-db-url") + return args @@ -180,7 +194,13 @@ def run_step(description: str, cmd: list[str], **kwargs) -> None: def run_vacuum(db_url: str) -> None: """Run VACUUM FULL on all pipeline tables.""" logger.info("Running VACUUM FULL ...") - tables = ["release", "release_artist", "release_track", "release_track_artist", "cache_metadata"] + tables = [ + "release", + "release_artist", + "release_track", + "release_track_artist", + "cache_metadata", + ] conn = psycopg.connect(db_url, autocommit=True) for table in tables: logger.info(" VACUUM FULL %s ...", table) @@ -219,9 +239,12 @@ def convert_xml_to_csv(xml_file: Path, xml2db_dir: Path, output_dir: Path) -> No run_step( "Convert XML to CSV", [ - sys.executable, "run.py", - "--export", "release", - "--output", str(output_dir), + sys.executable, + "run.py", + "--export", + "release", + "--output", + str(output_dir), str(xml_file.resolve()), ], cwd=str(xml2db_dir), @@ -242,14 +265,32 @@ def fix_csv_newlines(input_dir: Path, output_dir: Path) -> None: mod.fix_csv_dir(input_dir, output_dir) -def filter_to_library_artists( - library_artists: Path, input_dir: Path, output_dir: Path +def enrich_library_artists( + library_db: Path, + library_artists_out: Path, + wxyc_db_url: str | None = None, ) -> None: + """Step 2.5: Enrich library_artists.txt with WXYC cross-reference data.""" + cmd = [ + sys.executable, + str(SCRIPT_DIR / "enrich_library_artists.py"), + "--library-db", + str(library_db), + "--output", + str(library_artists_out), + ] + if wxyc_db_url: + cmd.extend(["--wxyc-db-url", wxyc_db_url]) + run_step("Enrich library artists", cmd) + + +def filter_to_library_artists(library_artists: Path, input_dir: Path, output_dir: Path) -> None: """Step 3: Filter CSVs to only releases by library artists.""" run_step( "Filter to library artists", [ - sys.executable, str(SCRIPT_DIR / "filter_csv.py"), + sys.executable, + str(SCRIPT_DIR / "filter_csv.py"), str(library_artists), str(input_dir), str(output_dir), @@ -298,8 +339,16 @@ def main() -> None: # Step 2: Fix CSV newlines fix_csv_newlines(raw_csv_dir, cleaned_csv_dir) + # Step 2.5: Enrich library_artists.txt (optional) + if args.library_db: + enriched_artists = tmp / "enriched_library_artists.txt" + enrich_library_artists(args.library_db, enriched_artists, args.wxyc_db_url) + library_artists_path = enriched_artists + else: + library_artists_path = args.library_artists + # Step 3: Filter to library artists - filter_to_library_artists(args.library_artists, cleaned_csv_dir, filtered_csv_dir) + filter_to_library_artists(library_artists_path, cleaned_csv_dir, filtered_csv_dir) # Steps 4-9: Database build _run_database_build(db_url, filtered_csv_dir, args.library_db, python) @@ -311,9 +360,7 @@ def main() -> None: logger.info("Pipeline complete in %.1f minutes.", total / 60) -def _run_database_build( - db_url: str, csv_dir: Path, library_db: Path | None, python: str -) -> None: +def _run_database_build(db_url: str, csv_dir: Path, library_db: Path | None, python: str) -> None: """Steps 4-9: database schema, import, indexes, dedup, prune, vacuum.""" # Step 4: Wait for Postgres 
wait_for_postgres(db_url) diff --git a/scripts/verify_cache.py b/scripts/verify_cache.py index a67e8c0..0b3ceb4 100644 --- a/scripts/verify_cache.py +++ b/scripts/verify_cache.py @@ -1,6 +1,4 @@ #!/usr/bin/env python3 -from __future__ import annotations - """Verify and optionally prune the Discogs cache against the WXYC library catalog. Uses multi-index (artist, album) pair matching with three independent fuzzy @@ -30,6 +28,8 @@ database_url defaults to postgresql:///discogs """ +from __future__ import annotations + import argparse import asyncio import enum @@ -37,7 +37,6 @@ import logging import re import sqlite3 - import sys import time import unicodedata @@ -209,7 +208,7 @@ def __init__( self.compilation_titles = compilation_titles @classmethod - def from_rows(cls, rows: list[tuple[str, str]]) -> "LibraryIndex": + def from_rows(cls, rows: list[tuple[str, str]]) -> LibraryIndex: """Build index from (artist, title) row tuples. Args: @@ -258,7 +257,7 @@ def from_rows(cls, rows: list[tuple[str, str]]) -> "LibraryIndex": ) @classmethod - def from_sqlite(cls, db_path: Path) -> "LibraryIndex": + def from_sqlite(cls, db_path: Path) -> LibraryIndex: """Build index from the library SQLite database. Args: diff --git a/tests/e2e/test_pipeline.py b/tests/e2e/test_pipeline.py index df5771c..ad16616 100644 --- a/tests/e2e/test_pipeline.py +++ b/tests/e2e/test_pipeline.py @@ -13,8 +13,8 @@ from pathlib import Path import psycopg -from psycopg import sql import pytest +from psycopg import sql FIXTURES_DIR = Path(__file__).parent.parent / "fixtures" CSV_DIR = FIXTURES_DIR / "csv" @@ -70,9 +70,12 @@ def _run_pipeline(self, e2e_db_url): [ sys.executable, str(RUN_PIPELINE), - "--csv-dir", str(CSV_DIR), - "--library-db", str(FIXTURE_LIBRARY_DB), - "--database-url", e2e_db_url, + "--csv-dir", + str(CSV_DIR), + "--library-db", + str(FIXTURE_LIBRARY_DB), + "--database-url", + e2e_db_url, ], capture_output=True, text=True, @@ -106,8 +109,7 @@ def test_tables_populated(self) -> None: compilation releases, which may be pruned depending on matching. 
""" conn = self._connect() - for table in ("release", "release_artist", "release_track", - "cache_metadata"): + for table in ("release", "release_artist", "release_track", "cache_metadata"): with conn.cursor() as cur: cur.execute(f"SELECT count(*) FROM {table}") count = cur.fetchone()[0] @@ -122,9 +124,7 @@ def test_duplicates_removed(self) -> None: """ conn = self._connect() with conn.cursor() as cur: - cur.execute( - "SELECT id FROM release WHERE id IN (1001, 1002, 1003) ORDER BY id" - ) + cur.execute("SELECT id FROM release WHERE id IN (1001, 1002, 1003) ORDER BY id") ids = [row[0] for row in cur.fetchall()] conn.close() assert ids == [1002], f"Expected only 1002 after dedup, got {ids}" @@ -194,8 +194,7 @@ def test_fk_constraints_exist(self) -> None: """) fk_tables = {row[0] for row in cur.fetchall()} conn.close() - expected = {"release_artist", "release_track", - "release_track_artist", "cache_metadata"} + expected = {"release_artist", "release_track", "release_track_artist", "cache_metadata"} assert expected.issubset(fk_tables) def test_null_title_release_not_imported(self) -> None: @@ -220,7 +219,8 @@ def _run_pipeline(self, e2e_db_url): [ sys.executable, str(RUN_PIPELINE), - "--csv-dir", str(CSV_DIR), + "--csv-dir", + str(CSV_DIR), # No --library-db — prune should be skipped ], capture_output=True, diff --git a/tests/fixtures/create_fixtures.py b/tests/fixtures/create_fixtures.py index 76af162..b5c613f 100644 --- a/tests/fixtures/create_fixtures.py +++ b/tests/fixtures/create_fixtures.py @@ -40,8 +40,15 @@ def create_release_csv() -> None: - Releases that match and don't match the library """ headers = [ - "id", "status", "title", "country", "released", "notes", - "data_quality", "master_id", "format", + "id", + "status", + "title", + "country", + "released", + "notes", + "data_quality", + "master_id", + "format", ] rows = [ # Group 1: duplicate master_id 500 (Radiohead - OK Computer variants) @@ -50,35 +57,26 @@ def create_release_csv() -> None: [1001, "Accepted", "OK Computer", "UK", "1997-06-16", "", "Correct", 500, "CD"], [1002, "Accepted", "OK Computer", "US", "1997-07-01", "", "Correct", 500, "Vinyl"], [1003, "Accepted", "OK Computer", "JP", "1997", "", "Correct", 500, "Cassette"], - # Group 2: duplicate master_id 600 (Joy Division - Unknown Pleasures) # Release 2001 has 2 tracks, 2002 has 4 tracks [2001, "Accepted", "Unknown Pleasures", "UK", "1979-06-15", "", "Correct", 600, "LP"], [2002, "Accepted", "Unknown Pleasures", "US", "1979", "", "Correct", 600, "CD"], - # No duplicate - unique master_id [3001, "Accepted", "Kid A", "UK", "2000-10-02", "", "Correct", 700, "CD"], - # No master_id (should survive dedup) [4001, "Accepted", "Amnesiac", "UK", "2001-06-05", "", "Correct", "", "CD"], - # Release that won't match library (should be pruned) [5001, "Accepted", "Unknown Album", "US", "2020-01-01", "", "Correct", 800, "CD"], [5002, "Accepted", "Another Unknown", "US", "", "", "Correct", 900, "CD"], - # Bad date format [6001, "Accepted", "Homogenic", "UK", "Unknown", "", "Correct", 1000, "CD"], - # Missing title (should be skipped during import - required field) [7001, "Accepted", "", "US", "2023", "", "Correct", 1100, "CD"], - # Compilation release [8001, "Accepted", "Sugar Hill", "US", "1979", "", "Correct", 1200, "LP"], - # Various date format edge cases [9001, "Accepted", "Abbey Road", "UK", "1969-09-26", "", "Correct", 1300, "LP"], [9002, "Accepted", "Bridge Over Troubled Water", "US", "1970", "", "Correct", 1400, "LP"], - # Artist not in library [10001, "Accepted", "Some 
Random Album", "US", "2023-05-01", "", "Correct", 1500, "CD"], [10002, "Accepted", "Obscure Release", "DE", "2022", "", "Correct", 1600, "CD"], @@ -96,32 +94,24 @@ def create_release_artist_csv() -> None: [1003, 1, "Radiohead", 0, "", 1, ""], [3001, 1, "Radiohead", 0, "", 1, ""], [4001, 1, "Radiohead", 0, "", 1, ""], - # Joy Division releases (match library) [2001, 2, "Joy Division", 0, "", 1, ""], [2002, 2, "Joy Division", 0, "", 1, ""], - # Unknown artists (won't match library) [5001, 3, "DJ Unknown", 0, "", 1, ""], [5002, 4, "Mystery Band", 0, "", 1, ""], - # Bjork (match library, tests accent handling) [6001, 5, "Björk", 0, "", 1, ""], - # Note: release 7001 has empty title and is skipped during import, # so no child table rows should reference it. - # Compilation [8001, 7, "Various", 0, "", 1, ""], - # Beatles and Simon & Garfunkel (match library) [9001, 8, "Beatles, The", 0, "", 1, ""], [9002, 9, "Simon & Garfunkel", 0, "", 1, ""], - # Not in library [10001, 10, "Random Artist X", 0, "", 1, ""], [10002, 11, "Obscure Band Y", 0, "", 1, ""], - # Extra artist credit (should not be primary) [1001, 12, "Some Producer", 1, "", 2, ""], ] @@ -139,56 +129,43 @@ def create_release_track_csv() -> None: [1001, 1, "1", "Airbag", "4:44"], [1001, 2, "2", "Paranoid Android", "6:23"], [1001, 3, "3", "Subterranean Homesick Alien", "4:27"], - # Release 1002 (OK Computer US Vinyl) - 5 tracks (should win dedup) [1002, 1, "A1", "Airbag", "4:44"], [1002, 2, "A2", "Paranoid Android", "6:23"], [1002, 3, "A3", "Subterranean Homesick Alien", "4:27"], [1002, 4, "B1", "Exit Music (For a Film)", "4:24"], [1002, 5, "B2", "Let Down", "4:59"], - # Release 1003 (OK Computer JP Cassette) - 1 track [1003, 1, "1", "Airbag", "4:44"], - # Release 2001 (Unknown Pleasures UK LP) - 2 tracks [2001, 1, "A1", "Disorder", "3:29"], [2001, 2, "A2", "Day of the Lords", "4:48"], - # Release 2002 (Unknown Pleasures US CD) - 4 tracks (should win dedup) [2002, 1, "1", "Disorder", "3:29"], [2002, 2, "2", "Day of the Lords", "4:48"], [2002, 3, "3", "Candidate", "3:05"], [2002, 4, "4", "Insight", "4:03"], - # Release 3001 (Kid A) - 2 tracks [3001, 1, "1", "Everything In Its Right Place", "4:11"], [3001, 2, "2", "Kid A", "4:44"], - # Release 4001 (Amnesiac, no master_id) - 2 tracks [4001, 1, "1", "Packt Like Sardines in a Crushd Tin Box", "4:00"], [4001, 2, "2", "Pyramid Song", "4:49"], - # Release 5001 (Unknown Album) - 1 track [5001, 1, "1", "Unknown Track", "3:00"], - # Release 5002 (Another Unknown) - 1 track [5002, 1, "1", "Mystery Track", "2:30"], - # Release 6001 (Homogenic) - 2 tracks [6001, 1, "1", "Hunter", "4:15"], [6001, 2, "2", "Joga", "5:05"], - # Release 8001 (Sugar Hill compilation) - 2 tracks [8001, 1, "A1", "Rapper's Delight", "14:35"], [8001, 2, "A2", "Apache", "5:35"], - # Release 9001 (Abbey Road) - 2 tracks [9001, 1, "A1", "Come Together", "4:20"], [9001, 2, "A2", "Something", "3:03"], - # Release 9002 (Bridge Over Troubled Water) - 1 track [9002, 1, "A1", "Bridge Over Troubled Water", "4:52"], - # Releases not in library [10001, 1, "1", "Random Track", "3:00"], [10002, 1, "1", "Obscure Track", "4:00"], @@ -214,14 +191,11 @@ def create_release_image_csv() -> None: # Primary image [1001, "primary", 600, 600, "https://img.discogs.com/abc123/release-1001.jpg"], [1001, "secondary", 300, 300, "https://img.discogs.com/abc123/release-1001-back.jpg"], - # Only secondary (should be used as fallback) [2001, "secondary", 600, 600, "https://img.discogs.com/def456/release-2001.jpg"], - # Primary for other releases [3001, 
"primary", 600, 600, "https://img.discogs.com/ghi789/release-3001.jpg"], [9001, "primary", 600, 600, "https://img.discogs.com/jkl012/release-9001.jpg"], - # No image for some releases (5001, 5002) - tests artwork_url being NULL ] write_csv("release_image.csv", headers, rows) diff --git a/tests/integration/test_dedup.py b/tests/integration/test_dedup.py index 946461c..776addd 100644 --- a/tests/integration/test_dedup.py +++ b/tests/integration/test_dedup.py @@ -35,8 +35,13 @@ pytestmark = pytest.mark.postgres -ALL_TABLES = ("cache_metadata", "release_track_artist", - "release_track", "release_artist", "release") +ALL_TABLES = ( + "cache_metadata", + "release_track_artist", + "release_track", + "release_artist", + "release", +) def _drop_all_tables(conn) -> None: @@ -83,7 +88,6 @@ def _fresh_import(db_url: str) -> None: conn.close() - def _run_dedup(db_url: str) -> None: """Run the full dedup pipeline against the database.""" conn = psycopg.connect(db_url, autocommit=True) @@ -91,14 +95,30 @@ def _run_dedup(db_url: str) -> None: if delete_count > 0: tables = [ ("release", "new_release", "id, title, release_year, artwork_url", "id"), - ("release_artist", "new_release_artist", - "release_id, artist_name, extra", "release_id"), - ("release_track", "new_release_track", - "release_id, sequence, position, title, duration", "release_id"), - ("release_track_artist", "new_release_track_artist", - "release_id, track_sequence, artist_name", "release_id"), - ("cache_metadata", "new_cache_metadata", - "release_id, cached_at, source, last_validated", "release_id"), + ( + "release_artist", + "new_release_artist", + "release_id, artist_name, extra", + "release_id", + ), + ( + "release_track", + "new_release_track", + "release_id, sequence, position, title, duration", + "release_id", + ), + ( + "release_track_artist", + "new_release_track_artist", + "release_id, track_sequence, artist_name", + "release_id", + ), + ( + "cache_metadata", + "new_cache_metadata", + "release_id, cached_at, source, last_validated", + "release_id", + ), ] for old, new, cols, id_col in tables: @@ -134,9 +154,7 @@ def test_correct_release_kept_for_master_500(self) -> None: """Release 1002 (5 tracks) kept over 1001 (3 tracks) and 1003 (1 track).""" conn = self._connect() with conn.cursor() as cur: - cur.execute( - "SELECT id FROM release WHERE id IN (1001, 1002, 1003) ORDER BY id" - ) + cur.execute("SELECT id FROM release WHERE id IN (1001, 1002, 1003) ORDER BY id") ids = [row[0] for row in cur.fetchall()] conn.close() assert ids == [1002] @@ -145,9 +163,7 @@ def test_correct_release_kept_for_master_600(self) -> None: """Release 2002 (4 tracks) kept over 2001 (2 tracks).""" conn = self._connect() with conn.cursor() as cur: - cur.execute( - "SELECT id FROM release WHERE id IN (2001, 2002) ORDER BY id" - ) + cur.execute("SELECT id FROM release WHERE id IN (2001, 2002) ORDER BY id") ids = [row[0] for row in cur.fetchall()] conn.close() assert ids == [2002] @@ -174,13 +190,9 @@ def test_child_table_rows_cleaned(self) -> None: """Deduped releases have their child table rows removed.""" conn = self._connect() with conn.cursor() as cur: - cur.execute( - "SELECT count(*) FROM release_artist WHERE release_id = 1001" - ) + cur.execute("SELECT count(*) FROM release_artist WHERE release_id = 1001") artist_count = cur.fetchone()[0] - cur.execute( - "SELECT count(*) FROM release_track WHERE release_id = 1001" - ) + cur.execute("SELECT count(*) FROM release_track WHERE release_id = 1001") track_count = cur.fetchone()[0] conn.close() assert 
artist_count == 0 @@ -190,9 +202,7 @@ def test_kept_release_tracks_preserved(self) -> None: """The kept release still has its tracks.""" conn = self._connect() with conn.cursor() as cur: - cur.execute( - "SELECT count(*) FROM release_track WHERE release_id = 1002" - ) + cur.execute("SELECT count(*) FROM release_track WHERE release_id = 1002") count = cur.fetchone()[0] conn.close() assert count == 5 @@ -232,8 +242,7 @@ def test_fk_constraints_recreated(self) -> None: """) fk_tables = {row[0] for row in cur.fetchall()} conn.close() - expected = {"release_artist", "release_track", - "release_track_artist", "cache_metadata"} + expected = {"release_artist", "release_track", "release_track_artist", "cache_metadata"} assert expected.issubset(fk_tables) def test_total_release_count_after_dedup(self) -> None: @@ -257,19 +266,13 @@ def _set_up(self, db_url): _drop_all_tables(conn) with conn.cursor() as cur: cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) + cur.execute("INSERT INTO release (id, title, master_id) VALUES (1, 'A', 100)") + cur.execute("INSERT INTO release (id, title, master_id) VALUES (2, 'B', 200)") cur.execute( - "INSERT INTO release (id, title, master_id) VALUES (1, 'A', 100)" - ) - cur.execute( - "INSERT INTO release (id, title, master_id) VALUES (2, 'B', 200)" - ) - cur.execute( - "INSERT INTO release_track (release_id, sequence, title) " - "VALUES (1, 1, 'Track A')" + "INSERT INTO release_track (release_id, sequence, title) VALUES (1, 1, 'Track A')" ) cur.execute( - "INSERT INTO release_track (release_id, sequence, title) " - "VALUES (2, 1, 'Track B')" + "INSERT INTO release_track (release_id, sequence, title) VALUES (2, 1, 'Track B')" ) conn.close() diff --git a/tests/integration/test_enrich_library_artists.py b/tests/integration/test_enrich_library_artists.py new file mode 100644 index 0000000..e167704 --- /dev/null +++ b/tests/integration/test_enrich_library_artists.py @@ -0,0 +1,93 @@ +"""Integration tests for scripts/enrich_library_artists.py against WXYC MySQL.""" + +from __future__ import annotations + +import importlib.util +import os +import sys +from pathlib import Path + +import pytest + +# Load enrich_library_artists module +_SCRIPT_PATH = Path(__file__).parent.parent.parent / "scripts" / "enrich_library_artists.py" +_spec = importlib.util.spec_from_file_location("enrich_library_artists", _SCRIPT_PATH) +assert _spec is not None and _spec.loader is not None +_mod = importlib.util.module_from_spec(_spec) +sys.modules["enrich_library_artists"] = _mod +_spec.loader.exec_module(_mod) + +connect_mysql = _mod.connect_mysql +extract_alternate_names = _mod.extract_alternate_names +extract_cross_referenced_artists = _mod.extract_cross_referenced_artists +extract_release_cross_ref_artists = _mod.extract_release_cross_ref_artists +extract_base_artists = _mod.extract_base_artists +merge_and_write = _mod.merge_and_write + +FIXTURES_DIR = Path(__file__).parent.parent / "fixtures" + +pytestmark = pytest.mark.mysql + +MYSQL_URL = os.environ.get("WXYC_DB_URL", "mysql://root:wxyc@localhost:3307/wxycmusic") + + +@pytest.fixture(scope="module") +def mysql_conn(): + """Connect to the WXYC MySQL database.""" + conn = connect_mysql(MYSQL_URL) + yield conn + conn.close() + + +class TestExtractAlternateNames: + """Extract alternate artist names from LIBRARY_RELEASE.""" + + def test_returns_nonempty_set(self, mysql_conn) -> None: + names = extract_alternate_names(mysql_conn) + assert len(names) > 100 + + def test_contains_known_alternates(self, mysql_conn) -> None: + names = 
extract_alternate_names(mysql_conn) + names_lower = {n.lower() for n in names} + assert "body count" in names_lower + assert "bobby digital" in names_lower + assert "common sense" in names_lower + + +class TestExtractCrossReferencedArtists: + """Extract artist names from LIBRARY_CODE_CROSS_REFERENCE.""" + + def test_returns_nonempty_set(self, mysql_conn) -> None: + names = extract_cross_referenced_artists(mysql_conn) + assert len(names) > 10 + + def test_contains_known_cross_refs(self, mysql_conn) -> None: + names = extract_cross_referenced_artists(mysql_conn) + names_lower = {n.lower() for n in names} + # "Crooked Fingers is filed w/ Eric Bachmann" + assert "eric bachmann" in names_lower + assert "crooked fingers" in names_lower + + +class TestExtractReleaseCrossRefArtists: + """Extract artist names from RELEASE_CROSS_REFERENCE.""" + + def test_returns_nonempty_set(self, mysql_conn) -> None: + names = extract_release_cross_ref_artists(mysql_conn) + assert len(names) > 5 + + +class TestFullEnrichment: + """End-to-end enrichment produces more artists than base set.""" + + def test_enriched_set_is_larger(self, mysql_conn, tmp_path: Path) -> None: + base = extract_base_artists(FIXTURES_DIR / "library.db") + alternates = extract_alternate_names(mysql_conn) + cross_refs = extract_cross_referenced_artists(mysql_conn) + release_cross_refs = extract_release_cross_ref_artists(mysql_conn) + + output = tmp_path / "enriched.txt" + merge_and_write(base, alternates, cross_refs, release_cross_refs, output) + + lines = output.read_text().splitlines() + assert len(lines) > len(base) diff --git a/tests/integration/test_prune.py b/tests/integration/test_prune.py index 9d7dfcf..a3845c9 100644 --- a/tests/integration/test_prune.py +++ b/tests/integration/test_prune.py @@ -14,8 +14,13 @@ CSV_DIR = FIXTURES_DIR / "csv" FIXTURE_LIBRARY_DB = FIXTURES_DIR / "library.db" -ALL_TABLES = ("cache_metadata", "release_track_artist", - "release_track", "release_artist", "release") +ALL_TABLES = ( + "cache_metadata", + "release_track_artist", + "release_track", + "release_artist", + "release", +) # Load import_csv module _IMPORT_PATH = Path(__file__).parent.parent.parent / "scripts" / "import_csv.py" @@ -162,9 +167,7 @@ def test_prune_deletes_releases(self) -> None: conn = psycopg.connect(self.db_url) with conn.cursor() as cur: id_list = list(self.report.prune_ids) - cur.execute( - "DELETE FROM release WHERE id = ANY(%s::integer[])", (id_list,) - ) + cur.execute("DELETE FROM release WHERE id = ANY(%s::integer[])", (id_list,)) conn.commit() with conn.cursor() as cur: diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py index 7d2b308..9074eaa 100644 --- a/tests/integration/test_schema.py +++ b/tests/integration/test_schema.py @@ -36,8 +36,13 @@ def test_schema_executes_without_error(self) -> None: conn.close() def test_all_tables_exist(self) -> None: - expected = {"release", "release_artist", "release_track", - "release_track_artist", "cache_metadata"} + expected = { + "release", + "release_artist", + "release_track", + "release_track_artist", + "cache_metadata", + } conn = self._connect() with conn.cursor() as cur: cur.execute(""" @@ -57,8 +62,13 @@ def test_all_tables_exist(self) -> None: ("release_track_artist", {"release_id", "track_sequence", "artist_name"}), ("cache_metadata", {"release_id", "cached_at", "source", "last_validated"}), ], - ids=["release", "release_artist", "release_track", - "release_track_artist", "cache_metadata"], + ids=[ + "release", + "release_artist", + "release_track", 
+ "release_track_artist", + "cache_metadata", + ], ) def test_table_columns(self, table: str, expected_columns: set[str]) -> None: conn = self._connect() @@ -97,8 +107,12 @@ def test_fk_constraints_with_cascade(self) -> None: """) fk_tables = {row[0] for row in cur.fetchall()} conn.close() - expected_fk_tables = {"release_artist", "release_track", - "release_track_artist", "cache_metadata"} + expected_fk_tables = { + "release_artist", + "release_track", + "release_track_artist", + "cache_metadata", + } assert expected_fk_tables.issubset(fk_tables) def test_schema_is_idempotent(self) -> None: @@ -121,8 +135,7 @@ def _apply_schema_and_data(self, db_url): cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) # Insert minimal data so indexes have something to work with cur.execute( - "INSERT INTO release (id, title) VALUES (1, 'Test Album') " - "ON CONFLICT DO NOTHING" + "INSERT INTO release (id, title) VALUES (1, 'Test Album') ON CONFLICT DO NOTHING" ) cur.execute( "INSERT INTO release_artist (release_id, artist_name, extra) " diff --git a/tests/unit/test_csv_to_tsv.py b/tests/unit/test_csv_to_tsv.py index f49ab87..1c7c592 100644 --- a/tests/unit/test_csv_to_tsv.py +++ b/tests/unit/test_csv_to_tsv.py @@ -38,7 +38,7 @@ def test_empty_fields_become_null(self, tmp_path: Path) -> None: """Empty CSV fields are converted to \\N (PostgreSQL NULL).""" csv_file = tmp_path / "input.csv" tsv_file = tmp_path / "output.tsv" - csv_file.write_text('a,b,c\n1,,3\n') + csv_file.write_text("a,b,c\n1,,3\n") convert(csv_file, tsv_file) lines = tsv_file.read_text().splitlines() @@ -54,9 +54,7 @@ def test_empty_fields_become_null(self, tmp_path: Path) -> None: ], ids=["backslash", "tab", "newline", "carriage-return"], ) - def test_special_char_escaping( - self, tmp_path: Path, input_val: str, expected_val: str - ) -> None: + def test_special_char_escaping(self, tmp_path: Path, input_val: str, expected_val: str) -> None: """Special characters are escaped for PostgreSQL COPY.""" csv_file = tmp_path / "input.csv" tsv_file = tmp_path / "output.tsv" diff --git a/tests/unit/test_enrich_library_artists.py b/tests/unit/test_enrich_library_artists.py new file mode 100644 index 0000000..b9e9af3 --- /dev/null +++ b/tests/unit/test_enrich_library_artists.py @@ -0,0 +1,210 @@ +"""Unit tests for scripts/enrich_library_artists.py.""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + +import pytest + +# Load enrich_library_artists module from scripts directory +_SCRIPT_PATH = Path(__file__).parent.parent.parent / "scripts" / "enrich_library_artists.py" +_spec = importlib.util.spec_from_file_location("enrich_library_artists", _SCRIPT_PATH) +assert _spec is not None and _spec.loader is not None +_mod = importlib.util.module_from_spec(_spec) +sys.modules["enrich_library_artists"] = _mod +_spec.loader.exec_module(_mod) + +extract_base_artists = _mod.extract_base_artists +merge_and_write = _mod.merge_and_write +parse_args = _mod.parse_args + +FIXTURES_DIR = Path(__file__).parent.parent / "fixtures" + + +# --------------------------------------------------------------------------- +# extract_base_artists +# --------------------------------------------------------------------------- + + +class TestExtractBaseArtists: + """Extracting unique artist names from library.db.""" + + def test_returns_nonempty_set(self) -> None: + artists = extract_base_artists(FIXTURES_DIR / "library.db") + assert isinstance(artists, set) + assert len(artists) > 0 + + def 
test_contains_known_artist(self) -> None: + artists = extract_base_artists(FIXTURES_DIR / "library.db") + # library.db has "Radiohead" as an artist + assert "Radiohead" in artists + + def test_excludes_compilation_artists(self) -> None: + artists = extract_base_artists(FIXTURES_DIR / "library.db") + for name in artists: + name_lower = name.lower() + assert "various" not in name_lower, f"Compilation artist not excluded: {name}" + + def test_no_empty_strings(self) -> None: + artists = extract_base_artists(FIXTURES_DIR / "library.db") + assert "" not in artists + assert all(name.strip() for name in artists) + + def test_preserves_original_case(self) -> None: + artists = extract_base_artists(FIXTURES_DIR / "library.db") + # Should have mixed case, not all lowercase + assert "Radiohead" in artists + assert "radiohead" not in artists + + +# --------------------------------------------------------------------------- +# merge_and_write +# --------------------------------------------------------------------------- + + +class TestMergeAndWrite: + """Merging artist sets and writing output file.""" + + def test_merges_all_sources(self, tmp_path: Path) -> None: + output = tmp_path / "artists.txt" + merge_and_write( + base={"Alpha", "Beta"}, + alternates={"Gamma"}, + cross_refs={"Delta"}, + release_cross_refs={"Epsilon"}, + output=output, + ) + lines = output.read_text().splitlines() + assert set(lines) == {"Alpha", "Beta", "Gamma", "Delta", "Epsilon"} + + def test_no_duplicates(self, tmp_path: Path) -> None: + output = tmp_path / "artists.txt" + merge_and_write( + base={"Alpha", "Beta"}, + alternates={"Beta", "Gamma"}, + cross_refs={"Gamma", "Delta"}, + release_cross_refs={"Alpha"}, + output=output, + ) + lines = output.read_text().splitlines() + assert len(lines) == len(set(lines)) + assert set(lines) == {"Alpha", "Beta", "Gamma", "Delta"} + + def test_sorted_output(self, tmp_path: Path) -> None: + output = tmp_path / "artists.txt" + merge_and_write( + base={"Zebra", "Apple", "Mango"}, + alternates=set(), + cross_refs=set(), + release_cross_refs=set(), + output=output, + ) + lines = output.read_text().splitlines() + assert lines == sorted(lines) + + def test_excludes_empty_strings(self, tmp_path: Path) -> None: + output = tmp_path / "artists.txt" + merge_and_write( + base={"Alpha", ""}, + alternates={" ", "Beta"}, + cross_refs=set(), + release_cross_refs=set(), + output=output, + ) + lines = output.read_text().splitlines() + assert "" not in lines + assert " " not in lines + assert set(lines) == {"Alpha", "Beta"} + + def test_excludes_compilation_artists(self, tmp_path: Path) -> None: + output = tmp_path / "artists.txt" + merge_and_write( + base={"Alpha"}, + alternates={"Various Artists", "Soundtrack Orchestra"}, + cross_refs=set(), + release_cross_refs=set(), + output=output, + ) + lines = output.read_text().splitlines() + assert "Various Artists" not in lines + assert "Soundtrack Orchestra" not in lines + assert "Alpha" in lines + + def test_preserves_original_case(self, tmp_path: Path) -> None: + output = tmp_path / "artists.txt" + merge_and_write( + base={"The Beatles", "RZA", "dj shadow"}, + alternates=set(), + cross_refs=set(), + release_cross_refs=set(), + output=output, + ) + lines = output.read_text().splitlines() + assert "The Beatles" in lines + assert "RZA" in lines + assert "dj shadow" in lines + + def test_trailing_newline(self, tmp_path: Path) -> None: + output = tmp_path / "artists.txt" + merge_and_write( + base={"Alpha"}, + alternates=set(), + cross_refs=set(), + 
release_cross_refs=set(), + output=output, + ) + content = output.read_text() + assert content.endswith("\n") + + def test_empty_sets_produce_empty_file(self, tmp_path: Path) -> None: + output = tmp_path / "artists.txt" + merge_and_write( + base=set(), + alternates=set(), + cross_refs=set(), + release_cross_refs=set(), + output=output, + ) + content = output.read_text() + assert content == "" + + +# --------------------------------------------------------------------------- +# parse_args +# --------------------------------------------------------------------------- + + +class TestParseArgs: + """CLI argument parsing.""" + + def test_required_args(self) -> None: + args = parse_args( + [ + "--library-db", + "library.db", + "--output", + "artists.txt", + ] + ) + assert args.library_db == Path("library.db") + assert args.output == Path("artists.txt") + assert args.wxyc_db_url is None + + def test_with_wxyc_db_url(self) -> None: + args = parse_args( + [ + "--library-db", + "library.db", + "--output", + "artists.txt", + "--wxyc-db-url", + "mysql://wxyc:wxyc@localhost:3307/wxycmusic", + ] + ) + assert args.wxyc_db_url == "mysql://wxyc:wxyc@localhost:3307/wxycmusic" + + def test_missing_required_args_exits(self) -> None: + with pytest.raises(SystemExit): + parse_args(["--library-db", "library.db"]) # missing --output diff --git a/tests/unit/test_filter_csv.py b/tests/unit/test_filter_csv.py index d637e04..175f6bf 100644 --- a/tests/unit/test_filter_csv.py +++ b/tests/unit/test_filter_csv.py @@ -152,9 +152,7 @@ def test_filters_to_matching_ids(self, tmp_path: Path) -> None: input_path = FIXTURES_DIR / "csv" / "release.csv" output_path = tmp_path / "release_filtered.csv" - input_count, output_count = filter_csv_file( - input_path, output_path, matching_ids, "id" - ) + input_count, output_count = filter_csv_file(input_path, output_path, matching_ids, "id") assert input_count > 0 assert output_count == 2 @@ -183,9 +181,7 @@ def test_empty_matching_set(self, tmp_path: Path) -> None: input_path = FIXTURES_DIR / "csv" / "release.csv" output_path = tmp_path / "release_filtered.csv" - input_count, output_count = filter_csv_file( - input_path, output_path, set(), "id" - ) + input_count, output_count = filter_csv_file(input_path, output_path, set(), "id") assert input_count > 0 assert output_count == 0 @@ -194,7 +190,5 @@ def test_filters_child_table(self, tmp_path: Path) -> None: input_path = FIXTURES_DIR / "csv" / "release_track.csv" output_path = tmp_path / "release_track_filtered.csv" - _, output_count = filter_csv_file( - input_path, output_path, matching_ids, "release_id" - ) + _, output_count = filter_csv_file(input_path, output_path, matching_ids, "release_id") assert output_count == 3 # Release 1001 has 3 tracks diff --git a/tests/unit/test_import_csv.py b/tests/unit/test_import_csv.py index 8c2e1be..7b2dc5c 100644 --- a/tests/unit/test_import_csv.py +++ b/tests/unit/test_import_csv.py @@ -43,10 +43,17 @@ class TestExtractYear: ("2023-00-00", "2023"), ], ids=[ - "full-date", "full-date-1997", "full-date-1969", - "year-only", "year-only-1979", - "empty", "none", "unknown-text", "tbd-text", - "zeros", "partial-date", + "full-date", + "full-date-1997", + "full-date-1969", + "year-only", + "year-only-1979", + "empty", + "none", + "unknown-text", + "tbd-text", + "zeros", + "partial-date", ], ) def test_extract_year(self, input_val: str | None, expected: str | None) -> None: diff --git a/tests/unit/test_verify_cache.py b/tests/unit/test_verify_cache.py index 6697371..df0c0a3 100644 --- 
a/tests/unit/test_verify_cache.py +++ b/tests/unit/test_verify_cache.py @@ -2,6 +2,7 @@ import importlib.util import json +import sys from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch @@ -12,6 +13,7 @@ _spec = importlib.util.spec_from_file_location("verify_cache", _SCRIPT_PATH) assert _spec is not None and _spec.loader is not None _vc = importlib.util.module_from_spec(_spec) +sys.modules["verify_cache"] = _vc _spec.loader.exec_module(_vc) # Re-export for cleaner access in tests