Skip to content

Commit ee2db20

Browse files
authored
Merge pull request #139 from maxmind/ugexe/email-nfc
Normalize email addresses to NFC
2 parents 6364583 + b96f65a commit ee2db20

File tree

3 files changed: 27 additions and 0 deletions.

HISTORY.rst

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,8 @@ History
3333
become ``gmail.com``.
3434
* Additional ``gmail.com`` typos are now normalized when ``hash_email`` is
3535
used. For example, ``gmali.com`` will become ``gmail.com``.
36+
* When ``hash_email`` is used, the local part of an email address is now
37+
normalized to NFC.
3638

3739
2.9.0 (2023-12-05)
3840
++++++++++++++++++

minfraud/request.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
import re
99
import warnings
1010
import hashlib
11+
import unicodedata
1112
from typing import Any, Dict
1213
from voluptuous import MultipleInvalid
1314

@@ -364,6 +365,8 @@ def _clean_email(address):
364365
domain = _clean_domain(address[at_idx + 1 :]) # noqa
365366
local_part = address[:at_idx]
366367

368+
local_part = unicodedata.normalize("NFC", local_part)
369+
367370
# Strip off aliased part of email address.
368371
if domain in _YAHOO_DOMAINS:
369372
divider = "-"

tests/test_request.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -141,6 +141,26 @@ def test_maybe_hash_email(self):
141141
}
142142
},
143143
},
144+
{
145+
"name": "email local part nfc normalization form 1",
146+
"input": {"email": {"address": "bu\u0308cher@example.com"}},
147+
"expected": {
148+
"email": {
149+
"address": "53550c712b146287a2d0dd30e5ed6f4b",
150+
"domain": "example.com",
151+
}
152+
},
153+
},
154+
{
155+
"name": "email local part nfc normalization form 2",
156+
"input": {"email": {"address": "b\u00FCcher@example.com"}},
157+
"expected": {
158+
"email": {
159+
"address": "53550c712b146287a2d0dd30e5ed6f4b",
160+
"domain": "example.com",
161+
}
162+
},
163+
},
144164
]
145165

146166
for test in tests:
@@ -231,6 +251,8 @@ def test_clean_email():
231251
{"input": "foo@example.comcom", "output": "foo@example.com"},
232252
{"input": "foo@example.com.", "output": "foo@example.com"},
233253
{"input": "foo@example.com...", "output": "foo@example.com"},
254+
{"input": "example@bu\u0308cher.com", "output": "example@xn--bcher-kva.com"},
255+
{"input": "example@b\u00FCcher.com", "output": "example@xn--bcher-kva.com"},
234256
]
235257

236258
for test in tests:

0 commit comments

Comments
 (0)