diff --git a/HISTORY.rst b/HISTORY.rst index 457babd..47c7c4e 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -33,6 +33,8 @@ History become ``gmail.com``. * Additional ``gmail.com`` typos are now normalized when ``hash_email`` is used. For example, ``gmali.com`` will become ``gmail.com``. +* When ``hash_email`` is used, the local part of an email address is now + normalized to NFC. 2.9.0 (2023-12-05) ++++++++++++++++++ diff --git a/minfraud/request.py b/minfraud/request.py index 767aae1..0d2a468 100644 --- a/minfraud/request.py +++ b/minfraud/request.py @@ -8,6 +8,7 @@ import re import warnings import hashlib +import unicodedata from typing import Any, Dict from voluptuous import MultipleInvalid @@ -364,6 +365,8 @@ def _clean_email(address): domain = _clean_domain(address[at_idx + 1 :]) # noqa local_part = address[:at_idx] + local_part = unicodedata.normalize("NFC", local_part) + # Strip off aliased part of email address. if domain in _YAHOO_DOMAINS: divider = "-" diff --git a/tests/test_request.py b/tests/test_request.py index cebe66b..b4b14cd 100644 --- a/tests/test_request.py +++ b/tests/test_request.py @@ -141,6 +141,26 @@ def test_maybe_hash_email(self): } }, }, + { + "name": "email local part nfc normalization form 1", + "input": {"email": {"address": "bu\u0308cher@example.com"}}, + "expected": { + "email": { + "address": "53550c712b146287a2d0dd30e5ed6f4b", + "domain": "example.com", + } + }, + }, + { + "name": "email local part nfc normalization form 2", + "input": {"email": {"address": "b\u00FCcher@example.com"}}, + "expected": { + "email": { + "address": "53550c712b146287a2d0dd30e5ed6f4b", + "domain": "example.com", + } + }, + }, ] for test in tests: @@ -231,6 +251,8 @@ def test_clean_email(): {"input": "foo@example.comcom", "output": "foo@example.com"}, {"input": "foo@example.com.", "output": "foo@example.com"}, {"input": "foo@example.com...", "output": "foo@example.com"}, + {"input": "example@bu\u0308cher.com", "output": "example@xn--bcher-kva.com"}, + {"input": "example@b\u00FCcher.com", "output": "example@xn--bcher-kva.com"}, ] for test in tests: