Skip to content

gh-85287: Change codecs to raise precise UnicodeEncodeError and UnicodeDecodeError #113674

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into main from unicode-errors-fix-85287
Mar 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7339989
fix issue gh-85287
jjsloboda Jan 3, 2024
e92d414
add news blurb
jjsloboda Jan 3, 2024
10e7cd0
add more lenient unicode error handling within the except blocks
jjsloboda Jan 3, 2024
0122f90
fix IDNA-specific length issue
jjsloboda Jan 3, 2024
a310dd2
Merge branch 'main' into unicode-errors-fix-85287
jjsloboda Jan 3, 2024
63948d2
fix two issues
jjsloboda Jan 3, 2024
4479ab2
Merge branch 'main' into unicode-errors-fix-85287
jjsloboda Jan 3, 2024
81310e3
use plain UnicodeError for problems outside the en/decoded string
jjsloboda Jan 6, 2024
367de4e
split label empty vs too long
jjsloboda Jan 6, 2024
9f57515
use labels input for finding error offset, not output result
jjsloboda Jan 6, 2024
389122d
update test for undefined encoding
jjsloboda Jan 6, 2024
fe47caa
fixed linebreaks on some of the longer exceptions
jjsloboda Jan 6, 2024
a4098fa
Merge branch 'main' into unicode-errors-fix-85287
jjsloboda Jan 6, 2024
95cb5bb
add tests for unicode error offsets, and tighten up the logic for cal…
jjsloboda Jan 7, 2024
10d092f
Merge branch 'main' into unicode-errors-fix-85287
jjsloboda Jan 7, 2024
9ac979f
Merge branch 'main' into unicode-errors-fix-85287
jjsloboda Feb 16, 2024
aefd7c2
reduce scope of exception object, and fail gracefully if it cannot be…
jjsloboda Feb 16, 2024
f73ccfe
use object formatting on inbuf directly in exc
jjsloboda Feb 16, 2024
e0747b4
reduce scope of exception object, and fail gracefully if it cannot be…
jjsloboda Feb 16, 2024
93e99ae
update MultibyteIncrementalEncoder.getstate()
methane Feb 21, 2024
87e1f99
fixup
methane Feb 21, 2024
0728a43
change buffer size issue error back to UnicodeError
jjsloboda Feb 22, 2024
0f80786
Merge branch 'main' into unicode-errors-fix-85287
jjsloboda Feb 22, 2024
5c8c59e
Merge branch 'main' into unicode-errors-fix-85287
jjsloboda Feb 22, 2024
1cc911d
update test to match changed exception
jjsloboda Feb 22, 2024
9594bae
Update Modules/cjkcodecs/multibytecodec.c
methane Feb 23, 2024
ea3ff8a
improve idna codec errors
methane Feb 23, 2024
8a2bc50
improve punycode.decode()
methane Feb 23, 2024
a63e17a
improve punycode_decode again
methane Feb 23, 2024
4c329e4
Merge branch 'main' into unicode-errors-fix-85287
methane Mar 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 117 additions & 47 deletions Lib/encodings/idna.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
sace_prefix = "xn--"

# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
def nameprep(label): # type: (str) -> str
# Map
newlabel = []
for c in label:
Expand All @@ -25,7 +25,7 @@ def nameprep(label):
label = unicodedata.normalize("NFKC", label)

# Prohibit
for c in label:
for i, c in enumerate(label):
if stringprep.in_table_c12(c) or \
stringprep.in_table_c22(c) or \
stringprep.in_table_c3(c) or \
Expand All @@ -35,7 +35,7 @@ def nameprep(label):
stringprep.in_table_c7(c) or \
stringprep.in_table_c8(c) or \
stringprep.in_table_c9(c):
raise UnicodeError("Invalid character %r" % c)
raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}")

# Check bidi
RandAL = [stringprep.in_table_d1(x) for x in label]
Expand All @@ -46,59 +46,73 @@ def nameprep(label):
# This is table C.8, which was already checked
# 2) If a string contains any RandALCat character, the string
# MUST NOT contain any LCat character.
if any(stringprep.in_table_d2(x) for x in label):
raise UnicodeError("Violation of BIDI requirement 2")
for i, x in enumerate(label):
if stringprep.in_table_d2(x):
raise UnicodeEncodeError("idna", label, i, i+1,
"Violation of BIDI requirement 2")
# 3) If a string contains any RandALCat character, a
# RandALCat character MUST be the first character of the
# string, and a RandALCat character MUST be the last
# character of the string.
if not RandAL[0] or not RandAL[-1]:
raise UnicodeError("Violation of BIDI requirement 3")
if not RandAL[0]:
raise UnicodeEncodeError("idna", label, 0, 1,
"Violation of BIDI requirement 3")
if not RandAL[-1]:
raise UnicodeEncodeError("idna", label, len(label)-1, len(label),
"Violation of BIDI requirement 3")

return label

def ToASCII(label):
def ToASCII(label): # type: (str) -> bytes
try:
# Step 1: try ASCII
label = label.encode("ascii")
except UnicodeError:
label_ascii = label.encode("ascii")
except UnicodeEncodeError:
pass
else:
# Skip to step 3: UseSTD3ASCIIRules is false, so
# Skip to step 8.
if 0 < len(label) < 64:
return label
raise UnicodeError("label empty or too long")
if 0 < len(label_ascii) < 64:
return label_ascii
if len(label) == 0:
raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
else:
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

# Step 2: nameprep
label = nameprep(label)

# Step 3: UseSTD3ASCIIRules is false
# Step 4: try ASCII
try:
label = label.encode("ascii")
except UnicodeError:
label_ascii = label.encode("ascii")
except UnicodeEncodeError:
pass
else:
# Skip to step 8.
if 0 < len(label) < 64:
return label
raise UnicodeError("label empty or too long")
return label_ascii
if len(label) == 0:
raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
else:
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

# Step 5: Check ACE prefix
if label[:4].lower() == sace_prefix:
raise UnicodeError("Label starts with ACE prefix")
if label.lower().startswith(sace_prefix):
raise UnicodeEncodeError(
"idna", label, 0, len(sace_prefix), "Label starts with ACE prefix")

# Step 6: Encode with PUNYCODE
label = label.encode("punycode")
label_ascii = label.encode("punycode")

# Step 7: Prepend ACE prefix
label = ace_prefix + label
label_ascii = ace_prefix + label_ascii

# Step 8: Check size
if 0 < len(label) < 64:
return label
raise UnicodeError("label empty or too long")
# do not check for empty as we prepend ace_prefix.
if len(label_ascii) < 64:
return label_ascii
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

def ToUnicode(label):
if len(label) > 1024:
Expand All @@ -110,41 +124,51 @@ def ToUnicode(label):
# per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
# preventing us from wasting time decoding a big thing that'll just
# hit the actual <= 63 length limit in Step 6.
raise UnicodeError("label way too long")
if isinstance(label, str):
label = label.encode("utf-8", errors="backslashreplace")
raise UnicodeDecodeError("idna", label, 0, len(label), "label way too long")
# Step 1: Check for ASCII
if isinstance(label, bytes):
pure_ascii = True
else:
try:
label = label.encode("ascii")
pure_ascii = True
except UnicodeError:
except UnicodeEncodeError:
pure_ascii = False
if not pure_ascii:
assert isinstance(label, str)
# Step 2: Perform nameprep
label = nameprep(label)
# It doesn't say this, but apparently, it should be ASCII now
try:
label = label.encode("ascii")
except UnicodeError:
raise UnicodeError("Invalid character in IDN label")
except UnicodeEncodeError as exc:
raise UnicodeEncodeError("idna", label, exc.start, exc.end,
"Invalid character in IDN label")
# Step 3: Check for ACE prefix
if not label[:4].lower() == ace_prefix:
assert isinstance(label, bytes)
if not label.lower().startswith(ace_prefix):
return str(label, "ascii")

# Step 4: Remove ACE prefix
label1 = label[len(ace_prefix):]

# Step 5: Decode using PUNYCODE
result = label1.decode("punycode")
try:
result = label1.decode("punycode")
except UnicodeDecodeError as exc:
offset = len(ace_prefix)
raise UnicodeDecodeError("idna", label, offset+exc.start, offset+exc.end, exc.reason)

# Step 6: Apply ToASCII
label2 = ToASCII(result)

# Step 7: Compare the result of step 6 with the one of step 3
# label2 will already be in lower case.
if str(label, "ascii").lower() != str(label2, "ascii"):
raise UnicodeError("IDNA does not round-trip", label, label2)
raise UnicodeDecodeError("idna", label, 0, len(label),
f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")

# Step 8: return the result of step 5
return result
Expand All @@ -156,7 +180,7 @@ def encode(self, input, errors='strict'):

if errors != 'strict':
# IDNA is quite clear that implementations must be strict
raise UnicodeError("unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return b'', 0
Expand All @@ -168,11 +192,16 @@ def encode(self, input, errors='strict'):
else:
# ASCII name: fast path
labels = result.split(b'.')
for label in labels[:-1]:
if not (0 < len(label) < 64):
raise UnicodeError("label empty or too long")
if len(labels[-1]) >= 64:
raise UnicodeError("label too long")
for i, label in enumerate(labels[:-1]):
if len(label) == 0:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError("idna", input, offset, offset+1,
"label empty")
for i, label in enumerate(labels):
if len(label) >= 64:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError("idna", input, offset, offset+len(label),
"label too long")
return result, len(input)

result = bytearray()
Expand All @@ -182,17 +211,27 @@ def encode(self, input, errors='strict'):
del labels[-1]
else:
trailing_dot = b''
for label in labels:
for i, label in enumerate(labels):
if result:
# Join with U+002E
result.extend(b'.')
result.extend(ToASCII(label))
try:
result.extend(ToASCII(label))
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError(
"idna",
input,
offset + exc.start,
offset + exc.end,
exc.reason,
)
return bytes(result+trailing_dot), len(input)

def decode(self, input, errors='strict'):

if errors != 'strict':
raise UnicodeError("Unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return "", 0
Expand All @@ -218,16 +257,23 @@ def decode(self, input, errors='strict'):
trailing_dot = ''

result = []
for label in labels:
result.append(ToUnicode(label))
for i, label in enumerate(labels):
try:
u_label = ToUnicode(label)
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
offset = sum(len(x) for x in labels[:i]) + len(labels[:i])
raise UnicodeDecodeError(
"idna", input, offset+exc.start, offset+exc.end, exc.reason)
else:
result.append(u_label)

return ".".join(result)+trailing_dot, len(input)

class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
def _buffer_encode(self, input, errors, final):
if errors != 'strict':
# IDNA is quite clear that implementations must be strict
raise UnicodeError("unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return (b'', 0)
Expand All @@ -251,7 +297,16 @@ def _buffer_encode(self, input, errors, final):
# Join with U+002E
result.extend(b'.')
size += 1
result.extend(ToASCII(label))
try:
result.extend(ToASCII(label))
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeEncodeError(
"idna",
input,
size + exc.start,
size + exc.end,
exc.reason,
)
size += len(label)

result += trailing_dot
Expand All @@ -261,7 +316,7 @@ def _buffer_encode(self, input, errors, final):
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def _buffer_decode(self, input, errors, final):
if errors != 'strict':
raise UnicodeError("Unsupported error handling "+errors)
# The f-prefix is required so the rejected handler name is interpolated into
# the message (matches the encode/decode/_buffer_encode checks in this diff).
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return ("", 0)
Expand All @@ -271,7 +326,11 @@ def _buffer_decode(self, input, errors, final):
labels = dots.split(input)
else:
# Must be ASCII string
input = str(input, "ascii")
try:
input = str(input, "ascii")
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeDecodeError("idna", input,
exc.start, exc.end, exc.reason)
labels = input.split(".")

trailing_dot = ''
Expand All @@ -288,7 +347,18 @@ def _buffer_decode(self, input, errors, final):
result = []
size = 0
for label in labels:
result.append(ToUnicode(label))
try:
u_label = ToUnicode(label)
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeDecodeError(
"idna",
input.encode("ascii", errors="backslashreplace"),
size + exc.start,
size + exc.end,
exc.reason,
)
else:
result.append(u_label)
if size:
size += 1
size += len(label)
Expand Down
Loading