@@ -46,7 +46,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
46
46
"smtputf8" : False ,
47
47
}
48
48
49
- # RFC 5321 4.5.3.1.1
49
+ # Check the length of the local part by counting characters.
50
+ # (RFC 5321 4.5.3.1.1)
50
51
# We're checking the number of characters here. If the local part
51
52
# is ASCII-only, then that's the same as bytes (octets). If it's
52
53
# internationalized, then the UTF-8 encoding may be longer, but
@@ -57,6 +58,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
57
58
raise EmailSyntaxError ("The email address is too long before the @-sign {}." .format (reason ))
58
59
59
60
# Check for invalid characters.
61
+ # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3
62
+ # if internationalized local parts are allowed)
60
63
atext_re = re .compile ('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL ) + ']' )
61
64
bad_chars = set (
62
65
safe_character_display (c )
@@ -67,9 +70,11 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
67
70
raise EmailSyntaxError ("The email address contains invalid characters before the @-sign: " + ", " .join (sorted (bad_chars )) + "." )
68
71
69
72
# Check for dot errors imposed by the dot-atom rule.
73
+ # (RFC 2822 3.2.4)
70
74
check_dot_atom (local , 'An email address cannot start with a {}.' , 'An email address cannot have a {} immediately before the @-sign.' , is_hostname = False )
71
75
72
- # Check the local part against the regular expression for the older ASCII requirements.
76
+ # Check the local part against the non-internationalized regular expression.
77
+ # (RFC 2822 3.2.4)
73
78
m = DOT_ATOM_TEXT .match (local )
74
79
if m :
75
80
# Return the local part unchanged and flag that SMTPUTF8 is not needed.
@@ -82,6 +87,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
82
87
else :
83
88
# The local part failed the ASCII check. Now try the extended internationalized requirements.
84
89
# This should already be handled by the bad_chars and check_dot_atom tests above.
90
+ # It's the same pattern but with additional characters permitted.
85
91
m = DOT_ATOM_TEXT_INTL .match (local )
86
92
if not m :
87
93
raise EmailSyntaxError ("The email address contains invalid characters before the @-sign." )
@@ -97,7 +103,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
97
103
98
104
# Check for unsafe characters.
99
105
# Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
100
- # by DOT_ATOM_TEXT_INTL.
106
+ # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but
107
+ # they may not be valid, safe, or sensible Unicode strings.
101
108
check_unsafe_chars (local )
102
109
103
110
# Try encoding to UTF-8. Failure is possible with some characters like
@@ -117,39 +124,56 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
117
124
118
125
119
126
def check_unsafe_chars(s):
    """Raise EmailSyntaxError if *s* contains unsafe or non-sensible characters.

    Each character is classified by the first letter of its Unicode general
    category. Every offending character is collected first so that a single
    error can report all of them at once.
    """
    unsafe = set()
    for position, char in enumerate(s):
        major_class = unicodedata.category(char)[0]

        # Letters (L), numbers (N), punctuation (P), and symbols (S) are
        # always permitted.
        if major_class in ("L", "N", "P", "S"):
            continue

        # Combining marks (M) are acceptable anywhere except the very first
        # position, where they would combine with whatever text precedes the
        # string if it were concatenated with something else.
        if major_class == "M" and position != 0:
            continue

        # Everything that reaches this point is rejected:
        #  * a combining mark in first position;
        #  * separators (Z): spaces and line/paragraph characters, which go
        #    against the spirit of the rule that unquoted email addresses
        #    contain no spaces or line breaks;
        #  * "other" (C): control, format, surrogate, private use, and
        #    unassigned code points, which can alter how surrounding text
        #    renders (bidirectional format characters in particular) or make
        #    no sense in a publicly deliverable address;
        #  * any category letter not listed here, in case a future Unicode
        #    version introduces one.
        unsafe.add(char)

    if not unsafe:
        return

    shown = [safe_character_display(char) for char in sorted(unsafe)]
    raise EmailSyntaxError("The email address contains unsafe characters: "
                           + ", ".join(shown) + ".")
143
164
144
165
145
166
def check_dot_atom (label , start_descr , end_descr , is_hostname ):
167
+ # RFC 2822 3.2.4
146
168
if label .endswith ("." ):
147
169
raise EmailSyntaxError (end_descr .format ("period" ))
148
170
if label .startswith ("." ):
149
171
raise EmailSyntaxError (start_descr .format ("period" ))
150
172
if ".." in label :
151
173
raise EmailSyntaxError ("An email address cannot have two periods in a row." )
174
+
152
175
if is_hostname :
176
+ # RFC 952
153
177
if label .endswith ("-" ):
154
178
raise EmailSyntaxError (end_descr .format ("hyphen" ))
155
179
if label .startswith ("-" ):
@@ -166,13 +190,19 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
166
190
raise EmailSyntaxError ("There must be something after the @-sign." )
167
191
168
192
# Check for invalid characters before normalization.
193
+ # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
169
194
bad_chars = set (
170
195
safe_character_display (c )
171
196
for c in domain
172
197
if not ATEXT_HOSTNAME_INTL .match (c )
173
198
)
174
199
if bad_chars :
175
200
raise EmailSyntaxError ("The part after the @-sign contains invalid characters: " + ", " .join (sorted (bad_chars )) + "." )
201
+
202
+ # Check for unsafe characters.
203
+ # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
204
+ # by ATEXT_HOSTNAME_INTL. Other characters may be permitted by the email specs, but
205
+ # they may not be valid, safe, or sensible Unicode strings.
176
206
check_unsafe_chars (domain )
177
207
178
208
# Perform UTS-46 normalization, which includes casefolding, NFC normalization,
@@ -191,9 +221,13 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
191
221
# Check that before we do IDNA encoding because the IDNA library gives
192
222
# unfriendly errors for these cases, but after UTS-46 normalization because
193
223
# it can insert periods and hyphens (from fullwidth characters).
224
+ # (RFC 952, RFC 2822 3.2.4)
194
225
check_dot_atom (domain , 'An email address cannot have a {} immediately after the @-sign.' , 'An email address cannot end with a {}.' , is_hostname = True )
226
+
227
+ # Check for RFC 5890's invalid R-LDH labels, which are labels that start
228
+ # with two characters other than "xn" followed by two dashes.
195
229
for label in domain .split ("." ):
196
- if re .match (r"(?!xn)..--" , label , re .I ): # RFC 5890 invalid R-LDH labels
230
+ if re .match (r"(?!xn)..--" , label , re .I ):
197
231
raise EmailSyntaxError ("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode." )
198
232
199
233
if DOT_ATOM_TEXT_HOSTNAME .match (domain ):
@@ -230,23 +264,29 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
230
264
if not m :
231
265
raise EmailSyntaxError ("The email address contains invalid characters after the @-sign after IDNA encoding." )
232
266
233
- # RFC 5321 4.5.3.1.2
234
- # We're checking the number of bytes (octets) here, which can be much
267
+ # Check the length of the domain name in bytes.
268
+ # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
269
+ # We're checking the number of bytes ("octets") here, which can be much
235
270
# higher than the number of characters in internationalized domains,
236
271
# on the assumption that the domain may be transmitted without SMTPUTF8
237
272
# as IDNA ASCII. (This is also checked by idna.encode, so this exception
238
273
# is never reached for internationalized domains.)
239
274
if len (ascii_domain ) > DOMAIN_MAX_LENGTH :
240
275
reason = get_length_reason (ascii_domain , limit = DOMAIN_MAX_LENGTH )
241
276
raise EmailSyntaxError ("The email address is too long after the @-sign {}." .format (reason ))
277
+
278
+ # Also check the label length limit.
279
+ # (RFC 1035 2.3.1)
242
280
for label in ascii_domain .split ("." ):
243
281
if len (label ) > DNS_LABEL_LENGTH_LIMIT :
244
282
reason = get_length_reason (label , limit = DNS_LABEL_LENGTH_LIMIT )
245
- raise EmailSyntaxError ("On either side of the @-sign, periods cannot be separated by so many characters {}." .format (reason ))
283
+ raise EmailSyntaxError ("After the @-sign, periods cannot be separated by so many characters {}." .format (reason ))
246
284
247
285
if globally_deliverable :
248
286
# All publicly deliverable addresses have domain names with at least
249
- # one period, and we'll consider the lack of a period a syntax error
287
+ # one period, at least for gTLDs created since 2013 (per the ICANN Board
288
+ # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
289
+ # We'll consider the lack of a period a syntax error
250
290
# since that will match people's sense of what an email address looks
251
291
# like. We'll skip this in test environments to allow '@test' email
252
292
# addresses.
@@ -260,6 +300,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
260
300
# Check special-use and reserved domain names.
261
301
# Some might fail DNS-based deliverability checks, but that
262
302
# can be turned off, so we should fail them all sooner.
303
+ # See the references in __init__.py.
263
304
from . import SPECIAL_USE_DOMAIN_NAMES
264
305
for d in SPECIAL_USE_DOMAIN_NAMES :
265
306
# See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
@@ -274,15 +315,14 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
274
315
# but not be actual IDNA. For ASCII-only domains, the conversion out
275
316
# of IDNA just gives the same thing back.
276
317
#
277
- # This gives us the canonical internationalized form of the domain,
278
- # which we should use in all error messages.
318
+ # This gives us the canonical internationalized form of the domain.
279
319
try :
280
320
domain_i18n = idna .decode (ascii_domain .encode ('ascii' ))
281
321
except idna .IDNAError as e :
282
322
raise EmailSyntaxError ("The part after the @-sign is not valid IDNA ({})." .format (str (e )))
283
323
284
324
# Check for invalid characters after normalization. These
285
- # should never arise.
325
+ # should never arise. See the similar checks above.
286
326
bad_chars = set (
287
327
safe_character_display (c )
288
328
for c in domain
0 commit comments