From 01ba9932d4e2d42f8f1a408c6702dc4a3f6a93d3 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Wed, 11 Feb 2026 19:42:32 -0800 Subject: [PATCH] =?UTF-8?q?fix:=20strip=20diacritics=20in=20CSV=20filter?= =?UTF-8?q?=20so=20artists=20like=20Bj=C3=B6rk=20aren't=20dropped?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The library stores ASCII names ("Bjork") but Discogs uses diacritics ("Björk"). The step 3 filter compared names with .lower().strip() only, so all releases by artists with diacritics in their names were silently excluded from the cache. --- scripts/filter_csv.py | 10 ++++++++-- tests/unit/test_filter_csv.py | 20 +++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/scripts/filter_csv.py b/scripts/filter_csv.py index 1ba5406..510acfe 100644 --- a/scripts/filter_csv.py +++ b/scripts/filter_csv.py @@ -13,6 +13,7 @@ import csv import logging import sys +import unicodedata from pathlib import Path logging.basicConfig( @@ -34,8 +35,13 @@ def normalize_artist(name: str) -> str: - """Normalize artist name for matching.""" - return name.lower().strip() + """Normalize artist name for matching. + + Strips diacritics so that Discogs "Björk" matches library "Bjork". 
+ """ + nfkd = unicodedata.normalize("NFKD", name) + stripped = "".join(c for c in nfkd if not unicodedata.combining(c)) + return stripped.lower().strip() def load_library_artists(path: Path) -> set[str]: diff --git a/tests/unit/test_filter_csv.py b/tests/unit/test_filter_csv.py index d637e04..fb4f042 100644 --- a/tests/unit/test_filter_csv.py +++ b/tests/unit/test_filter_csv.py @@ -40,8 +40,26 @@ class TestNormalizeArtist: ("RADIOHEAD", "radiohead"), (" Mixed Case ", "mixed case"), ("", ""), + ("Björk", "bjork"), + ("Sigur Rós", "sigur ros"), + ("Motörhead", "motorhead"), + ("Hüsker Dü", "husker du"), + ("Café Tacvba", "cafe tacvba"), + ("Zoé", "zoe"), + ], + ids=[ + "lowercase", + "strip-spaces", + "all-caps", + "mixed-case-strip", + "empty", + "bjork", + "sigur-ros", + "motorhead", + "husker-du", + "cafe-tacvba", + "zoe", ], - ids=["lowercase", "strip-spaces", "all-caps", "mixed-case-strip", "empty"], ) def test_normalize(self, raw: str, expected: str) -> None: assert normalize_artist(raw) == expected