From 01ba9932d4e2d42f8f1a408c6702dc4a3f6a93d3 Mon Sep 17 00:00:00 2001
From: Jake Bromberg <jake@funlandresearch.com>
Date: Wed, 11 Feb 2026 19:42:32 -0800
Subject: [PATCH] =?UTF-8?q?fix:=20strip=20diacritics=20in=20CSV=20filter?=
 =?UTF-8?q?=20so=20artists=20like=20Bj=C3=B6rk=20aren't=20dropped?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

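The library stores ASCII names ("Bjork") but Discogs uses diacritics
("Björk"). The step 3 filter normalized names with .lower().strip()
only, so the two spellings never matched and every release by an artist
whose name contains diacritics was silently excluded from the cache.

normalize_artist() now decomposes the name with NFKD and drops combining
marks before lowercasing and stripping whitespace.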
---
 scripts/filter_csv.py         | 10 ++++++++--
 tests/unit/test_filter_csv.py | 20 +++++++++++++++++++-
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/scripts/filter_csv.py b/scripts/filter_csv.py
index 1ba5406..510acfe 100644
--- a/scripts/filter_csv.py
+++ b/scripts/filter_csv.py
@@ -13,6 +13,7 @@
 import csv
 import logging
 import sys
+import unicodedata
 from pathlib import Path
 
 logging.basicConfig(
@@ -34,8 +35,13 @@
 
 
 def normalize_artist(name: str) -> str:
-    """Normalize artist name for matching."""
-    return name.lower().strip()
+    """Normalize artist name for matching.
+
+    Strips diacritics so that Discogs "Björk" matches library "Bjork".
+    """
+    nfkd = unicodedata.normalize("NFKD", name)
+    stripped = "".join(c for c in nfkd if not unicodedata.combining(c))
+    return stripped.lower().strip()
 
 
 def load_library_artists(path: Path) -> set[str]:
diff --git a/tests/unit/test_filter_csv.py b/tests/unit/test_filter_csv.py
index d637e04..fb4f042 100644
--- a/tests/unit/test_filter_csv.py
+++ b/tests/unit/test_filter_csv.py
@@ -40,8 +40,26 @@ class TestNormalizeArtist:
             ("RADIOHEAD", "radiohead"),
             ("  Mixed Case  ", "mixed case"),
             ("", ""),
+            ("Björk", "bjork"),
+            ("Sigur Rós", "sigur ros"),
+            ("Motörhead", "motorhead"),
+            ("Hüsker Dü", "husker du"),
+            ("Café Tacvba", "cafe tacvba"),
+            ("Zoé", "zoe"),
+        ],
+        ids=[
+            "lowercase",
+            "strip-spaces",
+            "all-caps",
+            "mixed-case-strip",
+            "empty",
+            "bjork",
+            "sigur-ros",
+            "motorhead",
+            "husker-du",
+            "cafe-tacvba",
+            "zoe",
         ],
-        ids=["lowercase", "strip-spaces", "all-caps", "mixed-case-strip", "empty"],
     )
     def test_normalize(self, raw: str, expected: str) -> None:
         assert normalize_artist(raw) == expected