Skip to content

Commit 8e588b9

Browse files
vstinner authored and tdwyer committed
00415: [CVE-2023-27043] pythongh-102988: Reject malformed addresses in email.parseaddr() (python#111116)
Detect email address parsing errors and return empty tuple to indicate the parsing error (old API). Add an optional 'strict' parameter to getaddresses() and parseaddr() functions. Patch by Thomas Dwyer. Co-Authored-By: Thomas Dwyer <[email protected]>
1 parent 786b8f1 commit 8e588b9

File tree

4 files changed

+361
-21
lines changed

4 files changed

+361
-21
lines changed

Doc/library/email.utils.rst

+15-4
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,18 @@ of the new API.
6060
begins with angle brackets, they are stripped off.
6161

6262

63-
.. function:: parseaddr(address)
63+
.. function:: parseaddr(address, *, strict=True)
6464

6565
Parse address -- which should be the value of some address-containing field such
6666
as :mailheader:`To` or :mailheader:`Cc` -- into its constituent *realname* and
6767
*email address* parts. Returns a tuple of that information, unless the parse
6868
fails, in which case a 2-tuple of ``('', '')`` is returned.
6969

70+
If *strict* is true, use a strict parser which rejects malformed inputs.
71+
72+
.. versionchanged:: 3.13
73+
Added the optional *strict* parameter; malformed inputs are now rejected by default.
74+
7075

7176
.. function:: formataddr(pair, charset='utf-8')
7277

@@ -84,12 +89,15 @@ of the new API.
8489
Added the *charset* option.
8590

8691

87-
.. function:: getaddresses(fieldvalues)
92+
.. function:: getaddresses(fieldvalues, *, strict=True)
8893

8994
This method returns a list of 2-tuples of the form returned by ``parseaddr()``.
9095
*fieldvalues* is a sequence of header field values as might be returned by
91-
:meth:`Message.get_all <email.message.Message.get_all>`. Here's a simple
92-
example that gets all the recipients of a message::
96+
:meth:`Message.get_all <email.message.Message.get_all>`.
97+
98+
If *strict* is true, use a strict parser which rejects malformed inputs.
99+
100+
Here's a simple example that gets all the recipients of a message::
93101

94102
from email.utils import getaddresses
95103

@@ -99,6 +107,9 @@ of the new API.
99107
resent_ccs = msg.get_all('resent-cc', [])
100108
all_recipients = getaddresses(tos + ccs + resent_tos + resent_ccs)
101109

110+
.. versionchanged:: 3.13
111+
Added the optional *strict* parameter; malformed inputs are now rejected by default.
112+
102113

103114
.. function:: parsedate(date)
104115

Lib/email/utils.py

+142-9
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
specialsre = re.compile(r'[][\\()<>@,:;".]')
4949
escapesre = re.compile(r'[\\"]')
5050

51+
5152
def _has_surrogates(s):
5253
"""Return True if s contains surrogate-escaped binary data."""
5354
# This check is based on the fact that unless there are surrogates, utf8
@@ -106,12 +107,127 @@ def formataddr(pair, charset='utf-8'):
106107
return address
107108

108109

110+
def _iter_escaped_chars(addr):
    """Yield ``(position, token)`` pairs for *addr*, honouring backslashes.

    An ordinary character is yielded as-is with its own index.  A backslash
    followed by another character is yielded as a single two-character token
    (backslash + character) positioned at the index of the escaped character.
    A trailing backslash with nothing after it is yielded alone, at its own
    index.
    """
    indexed = enumerate(addr)
    for index, char in indexed:
        if char != '\\':
            yield (index, char)
            continue

        # Consume the character following the backslash, if there is one.
        following = next(indexed, None)
        if following is None:
            # Dangling backslash at the very end of the input.
            yield (index, '\\')
        else:
            escaped_index, escaped_char = following
            yield (escaped_index, '\\' + escaped_char)
123+
124+
125+
def _strip_quoted_realnames(addr):
    """Return *addr* with every double-quoted section removed.

    Quotes preceded by a backslash do not open or close a quoted section.
    An opening quote that is never closed is left in place together with the
    text that follows it.
    """
    if '"' not in addr:
        # Nothing to strip; avoid the character-by-character scan.
        return addr

    kept = []
    resume_at = 0        # index where the next kept slice begins
    quote_start = None   # index of the currently open quote, if any
    for index, token in _iter_escaped_chars(addr):
        if token != '"':
            continue
        if quote_start is None:
            quote_start = index
            continue
        # Closing quote: keep whatever preceded the quoted section.
        if resume_at != quote_start:
            kept.append(addr[resume_at:quote_start])
        resume_at = index + 1
        quote_start = None

    if resume_at < len(addr):
        kept.append(addr[resume_at:])

    return ''.join(kept)
109148

110-
def getaddresses(fieldvalues):
111-
"""Return a list of (REALNAME, EMAIL) for each fieldvalue."""
112-
all = COMMASPACE.join(fieldvalues)
113-
a = _AddressList(all)
114-
return a.addresslist
149+
150+
# Module-level flag introduced together with the strict= keyword of
# getaddresses() and parseaddr(); presumably lets callers feature-detect
# strict parsing support -- TODO confirm intended use.
supports_strict_parsing = True
151+
152+
def getaddresses(fieldvalues, *, strict=True):
    """Return a list of (REALNAME, EMAIL) or ('','') for each fieldvalue.

    When parsing fails for a fieldvalue, a 2-tuple of ('', '') is returned in
    its place.

    If strict is true, use a strict parser which rejects malformed inputs.
    In that mode, if the number of parsed addresses differs from the number
    expected from the input, the whole result is replaced by [('', '')].
    """
    # Why the count check exists (strict mode only):
    #
    # Malformed input: getaddresses(['alice@example.org <bob@example.com>'])
    # Invalid output:  [('', 'alice@example.org'), ('', 'bob@example.com')]
    # Safe output:     [('', '')]

    if not strict:
        joined = COMMASPACE.join(str(value) for value in fieldvalues)
        return _AddressList(joined).addresslist

    values = _pre_parse_validation([str(value) for value in fieldvalues])
    parsed = _post_parse_validation(
        _AddressList(COMMASPACE.join(values)).addresslist)

    # Each field value is expected to contribute 1 + (number of commas)
    # addresses.  A comma inside a quoted real name is not a delimiter, so
    # quoted sections are stripped before counting.
    expected = sum(
        1 + _strip_quoted_realnames(value).count(',') for value in values)

    if len(parsed) == expected:
        return parsed
    # Treat the output as invalid when the counts disagree.
    return [('', '')]
194+
195+
196+
def _check_parenthesis(addr):
    """Return True if the parentheses in *addr* are balanced.

    Parentheses inside double-quoted sections and backslash-escaped
    parentheses are ignored.  A closing parenthesis that appears before its
    matching opening one makes the check fail immediately.
    """
    # Parentheses in quoted real names do not count.
    unquoted = _strip_quoted_realnames(addr)

    depth = 0
    for _, token in _iter_escaped_chars(unquoted):
        if token == ')':
            depth -= 1
            if depth < 0:
                return False
        elif token == '(':
            depth += 1
    return depth == 0
209+
210+
211+
def _pre_parse_validation(email_header_fields):
    """Return a new list of header values screened before parsing.

    Values that pass _check_parenthesis() are kept unchanged; any other
    value is replaced by the literal string "('', '')".
    """
    return [
        value if _check_parenthesis(value) else "('', '')"
        for value in email_header_fields
    ]
219+
220+
221+
def _post_parse_validation(parsed_email_header_tuples):
    """Return a new list of parsed entries screened after parsing.

    Each entry is indexed at position 1 for its address part.  The parser
    would have consumed a correctly formatted domain-literal, so a '['
    remaining in the address indicates a parsing failure; such entries are
    replaced by ('', '').  All other entries are kept unchanged.
    """
    return [
        ('', '') if '[' in entry[1] else entry
        for entry in parsed_email_header_tuples
    ]
115231

116232

117233

@@ -214,16 +330,33 @@ def parsedate_to_datetime(data):
214330
tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
215331

216332

217-
def parseaddr(addr):
333+
def parseaddr(addr, *, strict=True):
    """
    Parse addr into its constituent realname and email address parts.

    Return a tuple of realname and email address, unless the parse fails, in
    which case return a 2-tuple of ('', '').

    If strict is True, use a strict parser which rejects malformed inputs.
    In strict mode a list argument is reduced to its first element, and an
    empty list or any non-string value yields ('', '').
    """
    if not strict:
        addrs = _AddressList(addr).addresslist
        if not addrs:
            return ('', '')
        return addrs[0]

    if isinstance(addr, list):
        if not addr:
            # An empty list has no first element; report it as a parse
            # failure instead of letting addr[0] raise IndexError.
            return ('', '')
        addr = addr[0]

    if not isinstance(addr, str):
        return ('', '')

    addr = _pre_parse_validation([addr])[0]
    addrs = _post_parse_validation(_AddressList(addr).addresslist)

    # Exactly one address is expected; none or several means the input was
    # malformed.
    if not addrs or len(addrs) > 1:
        return ('', '')

    return addrs[0]
228361

229362

0 commit comments

Comments
 (0)