Skip to content

Commit e593ab1

Browse files
authored
Merge pull request #137 from maxmind/horgh/normalize
Replace fewer TLDs when normalizing
2 parents 130251c + af8438b commit e593ab1

File tree

3 files changed

+59
-10
lines changed

HISTORY.rst

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,8 @@ History
2626
* Duplicate ``.com`` s are now removed from email domain names when
2727
``hash_email`` is used. For example, ``example.com.com`` will become
2828
``example.com``.
29-
* Extraneous characters after ``.com`` are now removed from email domain
30-
names when ``hash_email`` is used. For example, ``example.comfoo`` will
31-
become ``example.com``.
32-
* Certain ``.com`` typos are now normalized to ``.com`` when ``hash_email`` is
33-
used. For example, ``example.cam`` will become ``example.com``.
29+
* Certain TLD typos are now normalized when ``hash_email`` is used. For
30+
example, ``example.comcom`` will become ``example.com``.
3431
* Additional ``gmail.com`` domain names with leading digits are now
3532
normalized when ``hash_email`` is used. For example, ``100gmail.com`` will
3633
become ``gmail.com``.

minfraud/request.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,50 @@
3030
"putlook.com": "outlook.com",
3131
}
3232

33+
_TYPO_TLDS = {
34+
"comm": "com",
35+
"commm": "com",
36+
"commmm": "com",
37+
"comn": "com",
38+
"cbm": "com",
39+
"ccm": "com",
40+
"cdm": "com",
41+
"cem": "com",
42+
"cfm": "com",
43+
"cgm": "com",
44+
"chm": "com",
45+
"cim": "com",
46+
"cjm": "com",
47+
"ckm": "com",
48+
"clm": "com",
49+
"cmm": "com",
50+
"cnm": "com",
51+
"cpm": "com",
52+
"cqm": "com",
53+
"crm": "com",
54+
"csm": "com",
55+
"ctm": "com",
56+
"cum": "com",
57+
"cvm": "com",
58+
"cwm": "com",
59+
"cxm": "com",
60+
"cym": "com",
61+
"czm": "com",
62+
"col": "com",
63+
"con": "com",
64+
"dom": "com",
65+
"don": "com",
66+
"som": "com",
67+
"son": "com",
68+
"vom": "com",
69+
"von": "com",
70+
"xom": "com",
71+
"xon": "com",
72+
"clam": "com",
73+
"colm": "com",
74+
"comcom": "com",
75+
}
76+
3377
_EQUIVALENT_DOMAINS = {
3478
"googlemail.com": "gmail.com",
3579
"pm.me": "protonmail.com",
@@ -296,10 +340,14 @@ def _clean_domain(domain):
296340
domain = domain.strip().rstrip(".").encode("idna").decode("ASCII")
297341

298342
domain = re.sub(r"(?:\.com){2,}$", ".com", domain)
299-
domain = re.sub(r"\.com[^.]+$", ".com", domain)
300-
domain = re.sub(r"(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$", ".com", domain)
301343
domain = re.sub(r"^\d+(?:gmail?\.com)$", "gmail.com", domain)
302344

345+
idx = domain.rfind(".")
346+
if idx != -1:
347+
tld = domain[idx + 1 :] # noqa
348+
if tld in _TYPO_TLDS:
349+
domain = domain[:idx] + "." + _TYPO_TLDS.get(tld)
350+
303351
domain = _TYPO_DOMAINS.get(domain, domain)
304352
domain = _EQUIVALENT_DOMAINS.get(domain, domain)
305353

tests/test_request.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,10 @@ def test_clean_email():
210210
{"input": "Test+@maxmind.com", "output": "test@maxmind.com"},
211211
{"input": "+@maxmind.com", "output": "+@maxmind.com"},
212212
{"input": " Test@maxmind.com", "output": "test@maxmind.com"},
213-
{"input": "Test@maxmind.com|abc124472372", "output": "test@maxmind.com"},
213+
{
214+
"input": "Test@maxmind.com|abc124472372",
215+
"output": "test@maxmind.com|abc124472372",
216+
},
214217
{"input": "Test+foo@yahoo.com", "output": "test+foo@yahoo.com"},
215218
{"input": "Test-foo@yahoo.com", "output": "test@yahoo.com"},
216219
{"input": "Test-foo-foo2@yahoo.com", "output": "test@yahoo.com"},
@@ -222,9 +225,10 @@ def test_clean_email():
222225
{"input": "alias@user.fastmail.com", "output": "user@fastmail.com"},
223226
{"input": "foo-bar@ymail.com", "output": "foo@ymail.com"},
224227
{"input": "foo@example.com.com", "output": "foo@example.com"},
225-
{"input": "foo@example.comfoo", "output": "foo@example.com"},
226-
{"input": "foo@example.cam", "output": "foo@example.com"},
228+
{"input": "foo@example.comfoo", "output": "foo@example.comfoo"},
229+
{"input": "foo@example.cam", "output": "foo@example.cam"},
227230
{"input": "foo@10000gmail.com", "output": "foo@gmail.com"},
231+
{"input": "foo@example.comcom", "output": "foo@example.com"},
228232
]
229233

230234
for test in tests:

0 commit comments

Comments
 (0)