Skip to content

Commit c02c357

Browse files
committed
gh-88500: Reduce memory use of urllib.parse.unquote
Depending on the input, `urllib.parse.unquote_to_bytes` and `urllib.parse.unquote` could each create `O(len(string))` intermediate `bytes` or `str` objects while computing the final unquoted result. Because Python objects are relatively large, this could consume a lot of RAM. This change switches the implementation to an expanding `bytearray` and an internal generator instead of precomputed `split()`-style operations.
1 parent 53a54b7 commit c02c357

File tree

2 files changed

+25
-16
lines changed

2 files changed

+25
-16
lines changed

Lib/test/test_urllib.py

+2
Original file line numberDiff line numberDiff line change
@@ -1104,6 +1104,8 @@ def test_unquoting(self):
11041104
self.assertEqual(result.count('%'), 1,
11051105
"using unquote(): not all characters escaped: "
11061106
"%s" % result)
1107+
1108+
def test_unquote_rejects_none_and_tuple(self):
11071109
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, None)
11081110
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, ())
11091111

Lib/urllib/parse.py

+23-16
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,9 @@ def urldefrag(url):
600600

601601
def unquote_to_bytes(string):
602602
"""unquote_to_bytes('abc%20def') -> b'abc def'."""
603+
return bytes(_unquote_to_bytearray(string))
604+
605+
def _unquote_to_bytearray(string):
603606
# Note: strings are encoded as UTF-8. This is only an issue if it contains
604607
# unescaped non-ASCII characters, which URIs should not.
605608
if not string:
@@ -611,8 +614,8 @@ def unquote_to_bytes(string):
611614
bits = string.split(b'%')
612615
if len(bits) == 1:
613616
return string
614-
res = [bits[0]]
615-
append = res.append
617+
res = bytearray(bits[0])
618+
add_data = res.extend
616619
# Delay the initialization of the table to not waste memory
617620
# if the function is never called
618621
global _hextobyte
@@ -621,15 +624,25 @@ def unquote_to_bytes(string):
621624
for a in _hexdig for b in _hexdig}
622625
for item in bits[1:]:
623626
try:
624-
append(_hextobyte[item[:2]])
625-
append(item[2:])
627+
add_data(_hextobyte[item[:2]])
628+
add_data(item[2:])
626629
except KeyError:
627-
append(b'%')
628-
append(item)
629-
return b''.join(res)
630+
add_data(b'%')
631+
add_data(item)
632+
return res
630633

631634
_asciire = re.compile('([\x00-\x7f]+)')
632635

636+
def _generate_unquoted_parts(string, encoding, errors):
637+
previous_match_end = 0
638+
for ascii_match in _asciire.finditer(string):
639+
start, end = ascii_match.span()
640+
yield string[previous_match_end:start] # Non-ASCII
641+
# The ascii_match[1] group == string[start:end].
642+
yield _unquote_to_bytearray(ascii_match[1]).decode(encoding, errors)
643+
previous_match_end = end
644+
yield string[previous_match_end:] # Non-ASCII tail
645+
633646
def unquote(string, encoding='utf-8', errors='replace'):
634647
"""Replace %xx escapes by their single-character equivalent. The optional
635648
encoding and errors parameters specify how to decode percent-encoded
@@ -641,22 +654,16 @@ def unquote(string, encoding='utf-8', errors='replace'):
641654
unquote('abc%20def') -> 'abc def'.
642655
"""
643656
if isinstance(string, bytes):
644-
return unquote_to_bytes(string).decode(encoding, errors)
657+
return _unquote_to_bytearray(string).decode(encoding, errors)
645658
if '%' not in string:
659+
# Is it a string-like object?
646660
string.split
647661
return string
648662
if encoding is None:
649663
encoding = 'utf-8'
650664
if errors is None:
651665
errors = 'replace'
652-
bits = _asciire.split(string)
653-
res = [bits[0]]
654-
append = res.append
655-
for i in range(1, len(bits), 2):
656-
append(unquote_to_bytes(bits[i]).decode(encoding, errors))
657-
append(bits[i + 1])
658-
return ''.join(res)
659-
666+
return ''.join(_generate_unquoted_parts(string, encoding, errors))
660667

661668
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
662669
encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):

0 commit comments

Comments
 (0)