Skip to content

Commit d6a5d4b

Browse files
committed
Add more citations throughout the library
1 parent 210c661 commit d6a5d4b

File tree

3 files changed

+72
-31
lines changed

3 files changed

+72
-31
lines changed

Diff for: email_validator/deliverability.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,16 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve
3434

3535
try:
3636
try:
37-
# Try resolving for MX records.
37+
# Try resolving for MX records (RFC 5321 Section 5).
3838
response = dns_resolver.resolve(domain, "MX")
3939

4040
# For reporting, put them in priority order and remove the trailing dot in the qnames.
4141
mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response])
4242

43-
# Remove "null MX" records from the list (their value is (0, ".") but we've stripped
44-
# trailing dots, so the 'exchange' is just ""). If there was only a null MX record,
45-
# email is not deliverable.
43+
# RFC 7505: Null MX (0, ".") records signify the domain does not accept email.
44+
# Remove null MX records from the mtas list (since we've stripped trailing dots,
45+
# a null MX record's 'exchange' is just "") so we can check whether any non-null
46+
# MX records remain.
4647
mtas = [(preference, exchange) for preference, exchange in mtas
4748
if exchange != ""]
4849
if len(mtas) == 0: # null MX only, if there were no MX records originally a NoAnswer exception would have occurred
@@ -52,7 +53,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve
5253
deliverability_info["mx_fallback_type"] = None
5354

5455
except dns.resolver.NoAnswer:
55-
# If there was no MX record, fall back to an A record, as SMTP servers do.
56+
# If there was no MX record, fall back to an A record. (RFC 5321 Section 5)
5657
try:
5758
response = dns_resolver.resolve(domain, "A")
5859
deliverability_info["mx"] = [(0, str(r)) for r in response]
@@ -61,6 +62,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve
6162
except dns.resolver.NoAnswer:
6263

6364
# If there was no A record, fall back to an AAAA record.
65+
# (It's unclear if SMTP servers actually do this.)
6466
try:
6567
response = dns_resolver.resolve(domain, "AAAA")
6668
deliverability_info["mx"] = [(0, str(r)) for r in response]
@@ -73,8 +75,8 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve
7375
# have been raised).
7476
raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n)
7577

76-
# Check for a SPF reject-all record ("v=spf1 -all") which indicates
77-
# no emails are sent from this domain (similar to a NULL MX record
78+
# Check for a SPF (RFC 7208) reject-all record ("v=spf1 -all") which indicates
79+
# no emails are sent from this domain (similar to a Null MX record
7880
# but for sending rather than receiving). In combination with the
7981
# absence of an MX record, this is probably a good sign that the
8082
# domain is not used for email.

Diff for: email_validator/rfc_constants.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@
1818
DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z')
1919

2020
# The domain part of the email address, after IDNA (ASCII) encoding,
21-
# must also satisfy the requirements of RFC 952/RFC 1123 which restrict
22-
# the allowed characters of hostnames further. The hyphen cannot be at
23-
# the beginning or end of a *dot-atom component* of a hostname either.
21+
# must also satisfy the requirements of RFC 952/RFC 1123 Section 2.1 which
22+
# restrict the allowed characters of hostnames further.
2423
ATEXT_HOSTNAME_INTL = re.compile(r"[a-zA-Z0-9\-\." + "\u0080-\U0010FFFF" + "]")
2524
HOSTNAME_LABEL = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])'
2625
DOT_ATOM_TEXT_HOSTNAME = re.compile(HOSTNAME_LABEL + r'(?:\.' + HOSTNAME_LABEL + r')*\Z')
@@ -31,5 +30,5 @@
3130
# explains the maximum length of an email address is 254 octets.
3231
EMAIL_MAX_LENGTH = 254
3332
LOCAL_PART_MAX_LENGTH = 64
34-
DNS_LABEL_LENGTH_LIMIT = 63 # RFC 1035 2.3.1
35-
DOMAIN_MAX_LENGTH = 255 # RFC 1035 2.3.4
33+
DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1
34+
DOMAIN_MAX_LENGTH = 255 # in "octets", RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2

Diff for: email_validator/syntax.py

+59-19
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
4646
"smtputf8": False,
4747
}
4848

49-
# RFC 5321 4.5.3.1.1
49+
# Check the length of the local part by counting characters.
50+
# (RFC 5321 4.5.3.1.1)
5051
# We're checking the number of characters here. If the local part
5152
# is ASCII-only, then that's the same as bytes (octets). If it's
5253
# internationalized, then the UTF-8 encoding may be longer, but
@@ -57,6 +58,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
5758
raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason))
5859

5960
# Check for invalid characters.
61+
# (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3
62+
# if internationalized local parts are allowed)
6063
atext_re = re.compile('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + ']')
6164
bad_chars = set(
6265
safe_character_display(c)
@@ -67,9 +70,11 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
6770
raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")
6871

6972
# Check for dot errors imposed by the dot-atom rule.
73+
# (RFC 2822 3.2.4)
7074
check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)
7175

72-
# Check the local part against the regular expression for the older ASCII requirements.
76+
# Check the local part against the non-internationalized regular expression.
77+
# (RFC 2822 3.2.4)
7378
m = DOT_ATOM_TEXT.match(local)
7479
if m:
7580
# Return the local part unchanged and flag that SMTPUTF8 is not needed.
@@ -82,6 +87,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
8287
else:
8388
# The local part failed the ASCII check. Now try the extended internationalized requirements.
8489
# This should already be handled by the bad_chars and check_dot_atom tests above.
90+
# It's the same pattern but with additional characters permitted.
8591
m = DOT_ATOM_TEXT_INTL.match(local)
8692
if not m:
8793
raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")
@@ -97,7 +103,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
97103

98104
# Check for unsafe characters.
99105
# Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
100-
# by DOT_ATOM_TEXT_INTL.
106+
# by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but
107+
# they may not be valid, safe, or sensible Unicode strings.
101108
check_unsafe_chars(local)
102109

103110
# Try encoding to UTF-8. Failure is possible with some characters like
@@ -117,39 +124,56 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
117124

118125

119126
def check_unsafe_chars(s):
127+
# Check for unsafe characters or characters that would make the string
128+
# invalid or non-sensible Unicode.
120129
bad_chars = set()
121130
for i, c in enumerate(s):
122131
category = unicodedata.category(c)
123132
if category[0] in ("L", "N", "P", "S"):
124-
# letters, numbers, punctuation, and symbols are permitted
133+
# Letters, numbers, punctuation, and symbols are permitted.
125134
pass
126135
elif category[0] == "M":
127-
# combining character in first position would combine with something
128-
# outside of the email address if concatenated to the right, but are
129-
# otherwise permitted
136+
# Combining character in first position would combine with something
137+
# outside of the email address if concatenated, so they are not safe.
138+
# We also check if this occurs after the @-sign, which would not be
139+
# sensible.
130140
if i == 0:
131141
bad_chars.add(c)
132-
elif category[0] in ("Z", "C"):
133-
# spaces and line/paragraph characters (Z) and
134-
# control, format, surrogate, private use, and unassigned code points (C)
142+
elif category[0] == "Z":
143+
# Spaces and line/paragraph characters (Z) outside of the ASCII range
144+
# are not specifically disallowed as far as I can tell, but they
145+
# violate the spirit of the non-internationalized specification that
146+
# email addresses do not contain spaces or line breaks when not quoted.
147+
bad_chars.add(c)
148+
elif category[0] == "C":
149+
# Control, format, surrogate, private use, and unassigned code points (C)
150+
# are all unsafe in various ways. Control and format characters can affect
151+
# text rendering if the email address is concatenated with other text.
152+
# Bidirectional format characters are unsafe, even if used properly, because
153+
# they cause an email address to render as a different email address.
154+
# Private use characters do not make sense for publicly deliverable
155+
# email addresses.
135156
bad_chars.add(c)
136157
else:
137158
# All categories should be handled above, but in case there is something new
138-
# in the future.
159+
# to the Unicode specification in the future, reject all other categories.
139160
bad_chars.add(c)
140161
if bad_chars:
141162
raise EmailSyntaxError("The email address contains unsafe characters: "
142163
+ ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".")
143164

144165

145166
def check_dot_atom(label, start_descr, end_descr, is_hostname):
167+
# RFC 2822 3.2.4
146168
if label.endswith("."):
147169
raise EmailSyntaxError(end_descr.format("period"))
148170
if label.startswith("."):
149171
raise EmailSyntaxError(start_descr.format("period"))
150172
if ".." in label:
151173
raise EmailSyntaxError("An email address cannot have two periods in a row.")
174+
152175
if is_hostname:
176+
# RFC 952
153177
if label.endswith("-"):
154178
raise EmailSyntaxError(end_descr.format("hyphen"))
155179
if label.startswith("-"):
@@ -166,13 +190,19 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
166190
raise EmailSyntaxError("There must be something after the @-sign.")
167191

168192
# Check for invalid characters before normalization.
193+
# (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
169194
bad_chars = set(
170195
safe_character_display(c)
171196
for c in domain
172197
if not ATEXT_HOSTNAME_INTL.match(c)
173198
)
174199
if bad_chars:
175200
raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
201+
202+
# Check for unsafe characters.
203+
# Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
204+
# by ATEXT_HOSTNAME_INTL. Other characters may be permitted by the email specs, but
205+
# they may not be valid, safe, or sensible Unicode strings.
176206
check_unsafe_chars(domain)
177207

178208
# Perform UTS-46 normalization, which includes casefolding, NFC normalization,
@@ -191,9 +221,13 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
191221
# Check that before we do IDNA encoding because the IDNA library gives
192222
# unfriendly errors for these cases, but after UTS-46 normalization because
193223
# it can insert periods and hyphens (from fullwidth characters).
224+
# (RFC 952, RFC 2822 3.2.4)
194225
check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)
226+
227+
# Check for RFC 5890's invalid R-LDH labels, which are labels that start
228+
# with two characters other than "xn" followed by two dashes.
195229
for label in domain.split("."):
196-
if re.match(r"(?!xn)..--", label, re.I): # RFC 5890 invalid R-LDH labels
230+
if re.match(r"(?!xn)..--", label, re.I):
197231
raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")
198232

199233
if DOT_ATOM_TEXT_HOSTNAME.match(domain):
@@ -230,23 +264,29 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
230264
if not m:
231265
raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")
232266

233-
# RFC 5321 4.5.3.1.2
234-
# We're checking the number of bytes (octets) here, which can be much
267+
# Check the length of the domain name in bytes.
268+
# (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
269+
# We're checking the number of bytes ("octets") here, which can be much
235270
# higher than the number of characters in internationalized domains,
236271
# on the assumption that the domain may be transmitted without SMTPUTF8
237272
# as IDNA ASCII. (This is also checked by idna.encode, so this exception
238273
# is never reached for internationalized domains.)
239274
if len(ascii_domain) > DOMAIN_MAX_LENGTH:
240275
reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
241276
raise EmailSyntaxError("The email address is too long after the @-sign {}.".format(reason))
277+
278+
# Also check the label length limit.
279+
# (RFC 1035 2.3.1)
242280
for label in ascii_domain.split("."):
243281
if len(label) > DNS_LABEL_LENGTH_LIMIT:
244282
reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT)
245-
raise EmailSyntaxError("On either side of the @-sign, periods cannot be separated by so many characters {}.".format(reason))
283+
raise EmailSyntaxError("After the @-sign, periods cannot be separated by so many characters {}.".format(reason))
246284

247285
if globally_deliverable:
248286
# All publicly deliverable addresses have domain names with at least
249-
# one period, and we'll consider the lack of a period a syntax error
287+
# one period, at least for gTLDs created since 2013 (per the ICANN Board
288+
# New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
289+
# We'll consider the lack of a period a syntax error
250290
# since that will match people's sense of what an email address looks
251291
# like. We'll skip this in test environments to allow '@test' email
252292
# addresses.
@@ -260,6 +300,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
260300
# Check special-use and reserved domain names.
261301
# Some might fail DNS-based deliverability checks, but that
262302
# can be turned off, so we should fail them all sooner.
303+
# See the references in __init__.py.
263304
from . import SPECIAL_USE_DOMAIN_NAMES
264305
for d in SPECIAL_USE_DOMAIN_NAMES:
265306
# See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
@@ -274,15 +315,14 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
274315
# but not be actual IDNA. For ASCII-only domains, the conversion out
275316
# of IDNA just gives the same thing back.
276317
#
277-
# This gives us the canonical internationalized form of the domain,
278-
# which we should use in all error messages.
318+
# This gives us the canonical internationalized form of the domain.
279319
try:
280320
domain_i18n = idna.decode(ascii_domain.encode('ascii'))
281321
except idna.IDNAError as e:
282322
raise EmailSyntaxError("The part after the @-sign is not valid IDNA ({}).".format(str(e)))
283323

284324
# Check for invalid characters after normalization. These
285-
# should never arise.
325+
# should never arise. See the similar checks above.
286326
bad_chars = set(
287327
safe_character_display(c)
288328
for c in domain

0 commit comments

Comments
 (0)