Skip to content

Commit 80f5cfa

Browse files
committed
Fix for email.generator.Generator with whitespace between encoded words.
email.generator.Generator currently does not handle whitespace between encoded words correctly when the encoded words span multiple lines. The current generator will create an encoded word for each line. If the end of the line happens to correspond with the end of a real word in the plaintext, the generator will place an unencoded space at the start of the subsequent line to represent the whitespace between the plaintext words. A compliant (RFC 2047) decoder will strip all the whitespace from between two encoded words, which leads to missing spaces in the round-tripped output. The fix for this is to make sure that whitespace between two encoded words ends up inside one or the other of the encoded words. This fix places the space inside the second encoded word. Test case from python#92081.
1 parent feca9bb commit 80f5cfa

File tree

3 files changed

+64
-8
lines changed

3 files changed

+64
-8
lines changed

Lib/email/_header_value_parser.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2766,10 +2766,13 @@ def _refold_parse_tree(parse_tree, *, policy):
27662766
# max_line_length 0/None means no limit, ie: infinitely long.
27672767
maxlen = policy.max_line_length or sys.maxsize
27682768
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
2769-
lines = ['']
2770-
last_ew = None
2769+
lines = [''] # Folded lines to be output
2770+
prepend_whitespace = '' # When we have whitespace between two encoded
2771+
# words, we may need to encode the whitespace
2772+
last_ew = None # Points to the last encoded character if there's an ew on
2773+
# the line
27712774
wrap_as_ew_blocked = 0
2772-
want_encoding = False
2775+
want_encoding = False # True if we need to encode this part
27732776
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
27742777
parts = list(parse_tree)
27752778
while parts:
@@ -2793,10 +2796,12 @@ def _refold_parse_tree(parse_tree, *, policy):
27932796
# 'charset' property on the policy.
27942797
charset = 'utf-8'
27952798
want_encoding = True
2799+
27962800
if part.token_type == 'mime-parameters':
27972801
# Mime parameter folding (using RFC2231) is extra special.
27982802
_fold_mime_parameters(part, lines, maxlen, encoding)
27992803
continue
2804+
28002805
if want_encoding and not wrap_as_ew_blocked:
28012806
if not part.as_ew_allowed:
28022807
want_encoding = False
@@ -2823,20 +2828,24 @@ def _refold_parse_tree(parse_tree, *, policy):
28232828
# It's a terminal, wrap it as an encoded word, possibly
28242829
# combining it with previously encoded words if allowed.
28252830
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
2826-
part.ew_combine_allowed, charset)
2831+
part.ew_combine_allowed, charset, prepend_whitespace)
2832+
prepend_whitespace = ''
28272833
want_encoding = False
28282834
continue
2835+
28292836
if len(tstr) <= maxlen - len(lines[-1]):
28302837
lines[-1] += tstr
28312838
continue
28322839
# This part is too long to fit. The RFC wants us to break at
28332840
# "major syntactic breaks", so unless we don't consider this
28342841
# to be one, check if it will fit on the next line by itself.
2842+
prepend_whitespace = ''
28352843
if (part.syntactic_break and
28362844
len(tstr) + 1 <= maxlen):
28372845
newline = _steal_trailing_WSP_if_exists(lines)
28382846
if newline or part.startswith_fws():
28392847
lines.append(newline + tstr)
2848+
prepend_whitespace = ' ' # part.value
28402849
last_ew = None
28412850
continue
28422851
if not hasattr(part, 'encode'):
@@ -2860,9 +2869,10 @@ def _refold_parse_tree(parse_tree, *, policy):
28602869
else:
28612870
# We can't fold it onto the next line either...
28622871
lines[-1] += tstr
2872+
28632873
return policy.linesep.join(lines) + policy.linesep
28642874

2865-
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
2875+
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, prepend_whitespace):
28662876
"""Fold string to_encode into lines as encoded word, combining if allowed.
28672877
Return the new value for last_ew, or None if ew_combine_allowed is False.
28682878
@@ -2877,14 +2887,15 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
28772887
to_encode = str(
28782888
get_unstructured(lines[-1][last_ew:] + to_encode))
28792889
lines[-1] = lines[-1][:last_ew]
2880-
if to_encode[0] in WSP:
2890+
elif to_encode[0] in WSP:
28812891
# We're joining this to non-encoded text, so don't encode
28822892
# the leading blank.
28832893
leading_wsp = to_encode[0]
28842894
to_encode = to_encode[1:]
28852895
if (len(lines[-1]) == maxlen):
28862896
lines.append(_steal_trailing_WSP_if_exists(lines))
28872897
lines[-1] += leading_wsp
2898+
28882899
trailing_wsp = ''
28892900
if to_encode[-1] in WSP:
28902901
# Likewise for the trailing space.
@@ -2904,11 +2915,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
29042915

29052916
while to_encode:
29062917
remaining_space = maxlen - len(lines[-1])
2907-
text_space = remaining_space - chrome_len
2918+
text_space = remaining_space - chrome_len - len(prepend_whitespace)
29082919
if text_space <= 0:
29092920
lines.append(' ')
29102921
continue
29112922

2923+
# If we are at the start of a continuation line, prepend whitespace
2924+
# (we only want to do this when the line starts with an encoded word
2925+
# but if we're folding in this helper function, then we know that we
2926+
# are going to be writing out an encoded word.)
2927+
if len(lines) > 1 and len(lines[-1]) == 1 and prepend_whitespace:
2928+
encoded_word = _ew.encode(prepend_whitespace, charset=encode_as)
2929+
lines[-1] += encoded_word
2930+
prepend_whitespace = ''
2931+
29122932
to_encode_word = to_encode[:text_space]
29132933
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
29142934
excess = len(encoded_word) - remaining_space

Lib/test/test_email/test_generator.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,41 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
232232
ioclass = io.BytesIO
233233
typ = lambda self, x: x.encode('ascii')
234234

235+
def test_defaults_handle_spaces_between_encoded_words_when_folded(self):
236+
source = ("Уведомление о принятии в работу обращения для"
237+
" подключения услуги")
238+
expected = ('Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtSDQviDQv9GA0LjQvdGP0YLQuNC4?=\n'
239+
' =?utf-8?b?INCyINGA0LDQsdC+0YLRgyDQvtCx0YDQsNGJ0LXQvdC40Y8g0LTQu9GPINC/0L4=?=\n'
240+
' =?utf-8?b?0LTQutC70Y7Rh9C10L3QuNGPINGD0YHQu9GD0LPQuA==?=\n\n').encode('ascii')
241+
msg = EmailMessage()
242+
msg['Subject'] = source
243+
s = io.BytesIO()
244+
g = BytesGenerator(s)
245+
g.flatten(msg)
246+
self.assertEqual(s.getvalue(), expected)
247+
248+
def test_defaults_handle_spaces_at_start_of_subject(self):
249+
source = " Уведомление"
250+
expected = b"Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtQ==?=\n\n"
251+
msg = EmailMessage()
252+
msg['Subject'] = source
253+
s = io.BytesIO()
254+
g = BytesGenerator(s)
255+
g.flatten(msg)
256+
self.assertEqual(s.getvalue(), expected)
257+
258+
def test_defaults_handle_spaces_at_start_of_continuation_line(self):
259+
source = " ф ффффффффффффффффффф ф ф"
260+
expected = (b"Subject: "
261+
b"=?utf-8?b?0YQg0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YQ=?=\n"
262+
b" =?utf-8?b?INGEINGE?=\n\n")
263+
msg = EmailMessage()
264+
msg['Subject'] = source
265+
s = io.BytesIO()
266+
g = BytesGenerator(s)
267+
g.flatten(msg)
268+
self.assertEqual(s.getvalue(), expected)
269+
235270
def test_cte_type_7bit_handles_unknown_8bit(self):
236271
source = ("Subject: Maintenant je vous présente mon "
237272
"collègue\n\n").encode('utf-8')

Lib/test/test_email/test_headerregistry.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from test.test_email import TestEmailBase, parameterize
88
from email import headerregistry
99
from email.headerregistry import Address, Group
10+
from email.header import decode_header
1011
from test.support import ALWAYS_EQ
1112

1213

@@ -1628,7 +1629,7 @@ def test_address_display_names(self):
16281629
'Lôrem ipsum dôlôr sit amet, cônsectetuer adipiscing. '
16291630
'Suspendisse pôtenti. Aliquam nibh. Suspendisse pôtenti.',
16301631
'=?utf-8?q?L=C3=B4rem_ipsum_d=C3=B4l=C3=B4r_sit_amet=2C_c'
1631-
'=C3=B4nsectetuer?=\n =?utf-8?q?adipiscing=2E_Suspendisse'
1632+
'=C3=B4nsectetuer?=\n =?utf-8?q?_adipiscing=2E_Suspendisse'
16321633
'_p=C3=B4tenti=2E_Aliquam_nibh=2E?=\n Suspendisse =?utf-8'
16331634
'?q?p=C3=B4tenti=2E?=',
16341635
),

0 commit comments

Comments
 (0)