Skip to content

Commit 210c661

Browse files
committed
Improve exception messages related to the dot-atom rule and also invalid and unsafe characters
* Check for invalid and (now) unsafe characters in domain names before IDNA parsing, so that we can be more certain we don't generate unsafe strings in error messages. It's also clearer. These characters were probably already invalid under IDNA rules anyway.
* Check for bad characters in local parts before applying the dot-atom regex, because the regex can also fail on invalid dot usage and the error message wouldn't indicate that.
* Check for invalid dot usage in the local part explicitly to improve error messages. Add tests.
* Use a safe and intelligible (no Python escape codes) representation of invalid characters in error messages.
1 parent 18e880c commit 210c661

File tree

3 files changed

+141
-67
lines changed

3 files changed

+141
-67
lines changed

Diff for: email_validator/rfc_constants.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@
2121
# must also satisfy the requirements of RFC 952/RFC 1123 which restrict
2222
# the allowed characters of hostnames further. The hyphen cannot be at
2323
# the beginning or end of a *dot-atom component* of a hostname either.
24-
ATEXT_HOSTNAME = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])'
25-
DOT_ATOM_TEXT_HOSTNAME = re.compile(ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*\Z')
24+
ATEXT_HOSTNAME_INTL = re.compile(r"[a-zA-Z0-9\-\." + "\u0080-\U0010FFFF" + "]")
25+
HOSTNAME_LABEL = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])'
26+
DOT_ATOM_TEXT_HOSTNAME = re.compile(HOSTNAME_LABEL + r'(?:\.' + HOSTNAME_LABEL + r')*\Z')
2627
DOMAIN_NAME_REGEX = re.compile(r"[A-Za-z]\Z") # all TLDs currently end with a letter
2728

2829
# Length constants

Diff for: email_validator/syntax.py

+97-41
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .exceptions_types import EmailSyntaxError
22
from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
3-
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX
3+
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, ATEXT_HOSTNAME_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX
44

55
import re
66
import unicodedata
@@ -16,6 +16,21 @@ def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH):
1616
return reason.format(prefix, diff, suffix)
1717

1818

19+
def safe_character_display(c):
    """Return a representation of the character *c* that is safe to show in an error message.

    Letters, numbers, punctuation, and symbols (Unicode major categories
    L, N, P, and S) are returned quoted, via ``repr()``.  Any other
    character is returned as its Unicode name or, when it has no name,
    as a ``U+XXXX`` hex code point so that the character itself is never
    echoed back.
    """
    # Return safely displayable characters in quotes.
    if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
        return repr(c)

    # Construct a hex string to fall back on in case the Unicode name
    # doesn't exist.  Code points in the Basic Multilingual Plane
    # (up to and including U+FFFF) use four hex digits; all others use eight.
    codepoint = ord(c)
    if codepoint <= 0xFFFF:
        h = "U+{:04X}".format(codepoint)
    else:
        h = "U+{:08X}".format(codepoint)

    # Return the character name or, if it has no name, the hex string.
    return unicodedata.name(c, h)
32+
33+
1934
def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=False):
2035
"""Validates the syntax of the local part of an email address."""
2136

@@ -41,6 +56,19 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
4156
reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
4257
raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason))
4358

59+
# Check for invalid characters.
60+
atext_re = re.compile('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + ']')
61+
bad_chars = set(
62+
safe_character_display(c)
63+
for c in local
64+
if not atext_re.match(c)
65+
)
66+
if bad_chars:
67+
raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")
68+
69+
# Check for dot errors imposed by the dot-atom rule.
70+
check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)
71+
4472
# Check the local part against the regular expression for the older ASCII requirements.
4573
m = DOT_ATOM_TEXT.match(local)
4674
if m:
@@ -53,14 +81,10 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
5381

5482
else:
5583
# The local part failed the ASCII check. Now try the extended internationalized requirements.
84+
# This should already be handled by the bad_chars and check_dot_atom tests above.
5685
m = DOT_ATOM_TEXT_INTL.match(local)
5786
if not m:
58-
# It's not a valid internationalized address either. Report which characters were not valid.
59-
bad_chars = ', '.join(sorted(set(
60-
unicodedata.name(c, repr(c)) for c in local if not re.match(u"[" + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + u"]", c)
61-
)))
62-
raise EmailSyntaxError("The email address contains invalid characters before the @-sign: %s." % bad_chars)
63-
87+
raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")
6488
# It would be valid if internationalized characters were allowed by the caller.
6589
if not allow_smtputf8:
6690
raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")
@@ -74,28 +98,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
7498
# Check for unsafe characters.
7599
# Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
76100
# by DOT_ATOM_TEXT_INTL.
77-
for i, c in enumerate(local):
78-
category = unicodedata.category(c)
79-
if category[0] in ("L", "N", "P", "S"):
80-
# letters, numbers, punctuation, and symbols are permitted
81-
pass
82-
elif category[0] == "M":
83-
# combining character in first position would combine with something
84-
# outside of the email address if concatenated to the right, but are
85-
# otherwise permitted
86-
if i == 0:
87-
raise EmailSyntaxError("The email address contains an initial invalid character (%s)."
88-
% unicodedata.name(c, repr(c)))
89-
elif category[0] in ("Z", "C"):
90-
# spaces and line/paragraph characters (Z) and
91-
# control, format, surrogate, private use, and unassigned code points (C)
92-
raise EmailSyntaxError("The email address contains an invalid character (%s)."
93-
% unicodedata.name(c, repr(c)))
94-
else:
95-
# All categories should be handled above, but in case there is something new
96-
# in the future.
97-
raise EmailSyntaxError("The email address contains a character (%s; category %s) that may not be safe."
98-
% (unicodedata.name(c, repr(c)), category))
101+
check_unsafe_chars(local)
99102

100103
# Try encoding to UTF-8. Failure is possible with some characters like
101104
# surrogate code points, but those are checked above. Still, we don't
@@ -113,13 +116,65 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
113116
}
114117

115118

119+
def check_unsafe_chars(s):
    """Raise EmailSyntaxError if *s* contains characters considered unsafe.

    A character is collected as unsafe when its Unicode major category is
    anything other than letter (L), number (N), punctuation (P), or
    symbol (S), with one exception: combining marks (M) are tolerated
    everywhere except in the first position.  All offending characters
    are reported together, de-duplicated and sorted, in a single error.
    """
    unsafe = set()
    for position, char in enumerate(s):
        major = unicodedata.category(char)[0]

        # Letters, numbers, punctuation, and symbols are permitted.
        if major in ("L", "N", "P", "S"):
            continue

        # A combining character in the first position would combine with
        # something outside of the email address if concatenated to the
        # right; anywhere else it is permitted.
        if major == "M" and position > 0:
            continue

        # What remains is a leading combining mark, a space or line/paragraph
        # separator (Z), a control, format, surrogate, private use, or
        # unassigned code point (C), or some category not anticipated here.
        unsafe.add(char)

    if not unsafe:
        return

    shown = [safe_character_display(char) for char in sorted(unsafe)]
    raise EmailSyntaxError("The email address contains unsafe characters: "
                           + ", ".join(shown) + ".")
143+
144+
145+
def check_dot_atom(label, start_descr, end_descr, is_hostname):
    """Raise EmailSyntaxError if *label* misplaces periods (or, for hostnames, hyphens).

    *start_descr* and *end_descr* are message templates containing one
    ``{}`` placeholder.  It is filled with "period" or "hyphen" when the
    offending character is at the start or end of *label* respectively.
    The hyphen checks are applied only when *is_hostname* is true.
    """
    first, last = label[:1], label[-1:]

    # Period placement: not trailing, not leading, never doubled.
    if last == ".":
        raise EmailSyntaxError(end_descr.format("period"))
    if first == ".":
        raise EmailSyntaxError(start_descr.format("period"))
    if label.find("..") != -1:
        raise EmailSyntaxError("An email address cannot have two periods in a row.")

    if not is_hostname:
        return

    # Hostnames additionally forbid a hyphen at either end of the whole
    # string or directly next to a period.
    if last == "-":
        raise EmailSyntaxError(end_descr.format("hyphen"))
    if first == "-":
        raise EmailSyntaxError(start_descr.format("hyphen"))
    if any(pair in label for pair in (".-", "-.")):
        raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")
159+
160+
116161
def validate_email_domain_part(domain, test_environment=False, globally_deliverable=True):
117162
"""Validates the syntax of the domain part of an email address."""
118163

119164
# Empty?
120165
if len(domain) == 0:
121166
raise EmailSyntaxError("There must be something after the @-sign.")
122167

168+
# Check for invalid characters before normalization.
169+
bad_chars = set(
170+
safe_character_display(c)
171+
for c in domain
172+
if not ATEXT_HOSTNAME_INTL.match(c)
173+
)
174+
if bad_chars:
175+
raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
176+
check_unsafe_chars(domain)
177+
123178
# Perform UTS-46 normalization, which includes casefolding, NFC normalization,
124179
# and converting all label separators (the period/full stop, fullwidth full stop,
125180
# ideographic full stop, and halfwidth ideographic full stop) to basic periods.
@@ -136,23 +191,13 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
136191
# Check that before we do IDNA encoding because the IDNA library gives
137192
# unfriendly errors for these cases, but after UTS-46 normalization because
138193
# it can insert periods and hyphens (from fullwidth characters).
139-
if domain.endswith("."):
140-
raise EmailSyntaxError("An email address cannot end with a period.")
141-
if domain.startswith("."):
142-
raise EmailSyntaxError("An email address cannot have a period immediately after the @-sign.")
143-
if ".." in domain:
144-
raise EmailSyntaxError("An email address cannot have two periods in a row.")
145-
if domain.endswith("-"):
146-
raise EmailSyntaxError("An email address cannot end with a hyphen.")
147-
if domain.startswith("-"):
148-
raise EmailSyntaxError("An email address cannot have a hyphen immediately after the @-sign.")
149-
if ".-" in domain or "-." in domain:
150-
raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")
194+
check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)
151195
for label in domain.split("."):
152196
if re.match(r"(?!xn)..--", label, re.I): # RFC 5890 invalid R-LDH labels
153197
raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")
154198

155199
if DOT_ATOM_TEXT_HOSTNAME.match(domain):
200+
# This is a valid non-internationalized domain.
156201
ascii_domain = domain
157202
else:
158203
# If international characters are present in the domain name, convert
@@ -236,6 +281,17 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
236281
except idna.IDNAError as e:
237282
raise EmailSyntaxError("The part after the @-sign is not valid IDNA ({}).".format(str(e)))
238283

284+
# Check for invalid characters after normalization. These
285+
# should never arise.
286+
bad_chars = set(
287+
safe_character_display(c)
288+
for c in domain
289+
if not ATEXT_HOSTNAME_INTL.match(c)
290+
)
291+
if bad_chars:
292+
raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
293+
check_unsafe_chars(domain)
294+
239295
# Return the IDNA ASCII-encoded form of the domain, which is how it
240296
# would be transmitted on the wire (except when used with SMTPUTF8
241297
# possibly), as well as the canonical Unicode form of the domain,

Diff for: tests/test_syntax.py

+41-24
Original file line numberDiff line numberDiff line change
@@ -226,21 +226,21 @@ def test_email_valid(email_input, output):
226226
('my@baddashfw.-a.com', 'An email address cannot have a period and a hyphen next to each other.'),
227227
('[email protected]-.com', 'An email address cannot have a period and a hyphen next to each other.'),
228228
229-
'The part after the @-sign contains invalid characters (Codepoint U+000A at position 4 of '
230-
'\'com\\n\' not allowed).'),
229+
'The part after the @-sign contains invalid characters: U+000A.'),
231230
('my@example\n.com',
232-
'The part after the @-sign contains invalid characters (Codepoint U+000A at position 8 of '
233-
'\'example\\n\' not allowed).'),
234-
('[email protected]', 'The email address contains invalid characters before the @-sign: FULL STOP.'),
235-
('[email protected]', 'The email address contains invalid characters before the @-sign: FULL STOP.'),
236-
('[email protected]', 'The email address contains invalid characters before the @-sign: FULL STOP.'),
231+
'The part after the @-sign contains invalid characters: U+000A.'),
232+
('me@x!', 'The part after the @-sign contains invalid characters: \'!\'.'),
233+
('me@x ', 'The part after the @-sign contains invalid characters: SPACE.'),
234+
('[email protected]', 'An email address cannot start with a period.'),
235+
('[email protected]', 'An email address cannot have two periods in a row.'),
236+
('[email protected]', 'An email address cannot have a period immediately before the @-sign.'),
237237
('me@⒈wouldbeinvalid.com',
238238
"The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed "
239239
"at position 1 in '⒈wouldbeinvalid.com')."),
240240
('@example.com', 'There must be something before the @-sign.'),
241-
('\n[email protected]', 'The email address contains invalid characters before the @-sign: \'\\n\'.'),
242-
('m\n[email protected]', 'The email address contains invalid characters before the @-sign: \'\\n\'.'),
243-
('my\n@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'),
241+
('\n[email protected]', 'The email address contains invalid characters before the @-sign: U+000A.'),
242+
('m\n[email protected]', 'The email address contains invalid characters before the @-sign: U+000A.'),
243+
('my\n@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'),
244244
('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'),
245245
('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'),
246246
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'),
@@ -253,7 +253,6 @@ def test_email_valid(email_input, output):
253253
('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'),
254254
('me@bad-tld-1', 'The part after the @-sign is not valid. It should have a period.'),
255255
('[email protected]', 'The part after the @-sign is not valid. It is not within a valid top-level domain.'),
256-
('me@x!', 'The part after the @-sign contains invalid characters (Codepoint U+0021 at position 2 of \'x!\' not allowed).'),
257256
('[email protected]', 'The part after the @-sign is not valid IDNA (Invalid A-label).'),
258257
('[email protected]', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'),
259258
('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'),
@@ -289,25 +288,43 @@ def test_email_invalid_reserved_domain(email_input):
289288

290289

291290
@pytest.mark.parametrize(
292-
'email_input',
291+
('s', 'expected_error'),
292+
[
293+
('\u2005', 'FOUR-PER-EM SPACE'), # four-per-em space (Zs)
294+
('\u0300', 'COMBINING GRAVE ACCENT'), # grave accent (M)
295+
('\u009C', 'U+009C'), # string terminator (Cc)
296+
('\u200B', 'ZERO WIDTH SPACE'), # zero-width space (Cf)
297+
('\u202Dforward-\u202Ereversed', 'LEFT-TO-RIGHT OVERRIDE, RIGHT-TO-LEFT OVERRIDE'), # BIDI (Cf)
298+
('\uD800', 'U+D800'), # surrogate (Cs)
299+
('\uE000', 'U+E000'), # private use (Co)
300+
('\U0010FDEF', 'U+0010FDEF'), # private use (Co)
301+
('\uFDEF', 'U+FDEF'), # unassigned (Cn)
302+
],
303+
)
304+
def test_email_unsafe_character(s, expected_error):
305+
# Check for various unsafe characters:
306+
307+
with pytest.raises(EmailSyntaxError) as exc_info:
308+
validate_email(s + "@test", test_environment=True)
309+
assert str(exc_info.value) == f"The email address contains unsafe characters: {expected_error}."
310+
311+
with pytest.raises(EmailSyntaxError) as exc_info:
312+
validate_email("test@" + s, test_environment=True)
313+
assert "The email address contains unsafe characters" in str(exc_info.value)
314+
315+
316+
@pytest.mark.parametrize(
317+
('email_input', 'expected_error'),
293318
[
294-
('white space@test'),
295-
('\n@test'),
296-
('\u2005@test'), # four-per-em space (Zs)
297-
('\u009C@test'), # string terminator (Cc)
298-
('\u200B@test'), # zero-width space (Cf)
299-
('\u202Dforward-\u202Ereversed@test'), # BIDI (Cf)
300-
('\uD800@test'), # surrogate (Cs)
301-
('\uE000@test'), # private use (Co)
302-
('\uFDEF@test'), # unassigned (Cn)
303-
('\u0300@test'), # grave accent (M)
319+
('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'),
320+
('\n@test', 'The email address contains invalid characters before the @-sign: U+000A.'),
304321
],
305322
)
306-
def test_email_unsafe_character(email_input):
323+
def test_email_invalid_character(email_input, expected_error):
307324
# Check for various invalid characters:
308325
with pytest.raises(EmailSyntaxError) as exc_info:
309326
validate_email(email_input, test_environment=True)
310-
assert "invalid character" in str(exc_info.value)
327+
assert str(exc_info.value) == expected_error
311328

312329

313330
def test_email_test_domain_name_in_test_environment():

0 commit comments

Comments
 (0)