-
-
Notifications
You must be signed in to change notification settings - Fork 31.9k
gh-85287: Change codecs to raise precise UnicodeEncodeError and UnicodeDecodeError #113674
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
7339989
e92d414
10e7cd0
0122f90
a310dd2
63948d2
4479ab2
81310e3
367de4e
9f57515
389122d
fe47caa
a4098fa
95cb5bb
10d092f
9ac979f
aefd7c2
f73ccfe
e0747b4
93e99ae
87e1f99
0728a43
0f80786
5c8c59e
1cc911d
9594bae
ea3ff8a
8a2bc50
a63e17a
4c329e4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -25,7 +25,7 @@ def nameprep(label): | |||||||||
label = unicodedata.normalize("NFKC", label) | ||||||||||
|
||||||||||
# Prohibit | ||||||||||
for c in label: | ||||||||||
for i, c in enumerate(label): | ||||||||||
if stringprep.in_table_c12(c) or \ | ||||||||||
stringprep.in_table_c22(c) or \ | ||||||||||
stringprep.in_table_c3(c) or \ | ||||||||||
|
@@ -35,7 +35,7 @@ def nameprep(label): | |||||||||
stringprep.in_table_c7(c) or \ | ||||||||||
stringprep.in_table_c8(c) or \ | ||||||||||
stringprep.in_table_c9(c): | ||||||||||
raise UnicodeError("Invalid character %r" % c) | ||||||||||
raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}") | ||||||||||
|
||||||||||
# Check bidi | ||||||||||
RandAL = [stringprep.in_table_d1(x) for x in label] | ||||||||||
|
@@ -46,29 +46,35 @@ def nameprep(label): | |||||||||
# This is table C.8, which was already checked | ||||||||||
# 2) If a string contains any RandALCat character, the string | ||||||||||
# MUST NOT contain any LCat character. | ||||||||||
if any(stringprep.in_table_d2(x) for x in label): | ||||||||||
raise UnicodeError("Violation of BIDI requirement 2") | ||||||||||
for i, x in enumerate(label): | ||||||||||
if stringprep.in_table_d2(x): | ||||||||||
raise UnicodeEncodeError("idna", label, i, i+1, "Violation of BIDI requirement 2") | ||||||||||
# 3) If a string contains any RandALCat character, a | ||||||||||
# RandALCat character MUST be the first character of the | ||||||||||
# string, and a RandALCat character MUST be the last | ||||||||||
# character of the string. | ||||||||||
if not RandAL[0] or not RandAL[-1]: | ||||||||||
raise UnicodeError("Violation of BIDI requirement 3") | ||||||||||
if not RandAL[0]: | ||||||||||
raise UnicodeEncodeError( | ||||||||||
"idna", label, | ||||||||||
0, 1, "Violation of BIDI requirement 3") | ||||||||||
if not RandAL[-1]: | ||||||||||
raise UnicodeEncodeError("idna", label, len(label)-1, len(label), "Violation of BIDI requirement 3") | ||||||||||
|
||||||||||
return label | ||||||||||
|
||||||||||
def ToASCII(label): | ||||||||||
try: | ||||||||||
# Step 1: try ASCII | ||||||||||
label = label.encode("ascii") | ||||||||||
except UnicodeError: | ||||||||||
except UnicodeEncodeError: | ||||||||||
pass | ||||||||||
else: | ||||||||||
# Skip to step 3: UseSTD3ASCIIRules is false, so | ||||||||||
# Skip to step 8. | ||||||||||
if 0 < len(label) < 64: | ||||||||||
return label | ||||||||||
raise UnicodeError("label empty or too long") | ||||||||||
label = label.decode("ascii", errors="backslashreplace") | ||||||||||
raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") | ||||||||||
|
||||||||||
# Step 2: nameprep | ||||||||||
label = nameprep(label) | ||||||||||
|
@@ -77,17 +83,20 @@ def ToASCII(label): | |||||||||
# Step 4: try ASCII | ||||||||||
try: | ||||||||||
label = label.encode("ascii") | ||||||||||
except UnicodeError: | ||||||||||
except UnicodeEncodeError: | ||||||||||
pass | ||||||||||
else: | ||||||||||
# Skip to step 8. | ||||||||||
if 0 < len(label) < 64: | ||||||||||
return label | ||||||||||
raise UnicodeError("label empty or too long") | ||||||||||
label = label.decode("ascii", errors="backslashreplace") | ||||||||||
raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") | ||||||||||
|
||||||||||
# Step 5: Check ACE prefix | ||||||||||
if label.startswith(sace_prefix): | ||||||||||
raise UnicodeError("Label starts with ACE prefix") | ||||||||||
raise UnicodeEncodeError( | ||||||||||
"idna", label.decode("ascii", errors="backslashreplace"), | ||||||||||
0, len(sace_prefix), "Label starts with ACE prefix") | ||||||||||
|
||||||||||
# Step 6: Encode with PUNYCODE | ||||||||||
label = label.encode("punycode") | ||||||||||
|
@@ -98,7 +107,8 @@ def ToASCII(label): | |||||||||
# Step 8: Check size | ||||||||||
if 0 < len(label) < 64: | ||||||||||
return label | ||||||||||
raise UnicodeError("label empty or too long") | ||||||||||
label = label[len(ace_prefix):].decode("punycode", errors="replace") | ||||||||||
raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") | ||||||||||
|
||||||||||
def ToUnicode(label): | ||||||||||
if len(label) > 1024: | ||||||||||
|
@@ -110,24 +120,28 @@ def ToUnicode(label): | |||||||||
# per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still | ||||||||||
# preventing us from wasting time decoding a big thing that'll just | ||||||||||
# hit the actual <= 63 length limit in Step 6. | ||||||||||
raise UnicodeError("label way too long") | ||||||||||
if isinstance(label, bytes): | ||||||||||
label = label.decode("utf-8", errors="backslashreplace") | ||||||||||
raise UnicodeEncodeError("idna", label, 0, len(label), "label way too long") | ||||||||||
# Step 1: Check for ASCII | ||||||||||
if isinstance(label, bytes): | ||||||||||
pure_ascii = True | ||||||||||
else: | ||||||||||
try: | ||||||||||
label = label.encode("ascii") | ||||||||||
pure_ascii = True | ||||||||||
except UnicodeError: | ||||||||||
except UnicodeEncodeError: | ||||||||||
pure_ascii = False | ||||||||||
if not pure_ascii: | ||||||||||
# Step 2: Perform nameprep | ||||||||||
label = nameprep(label) | ||||||||||
# It doesn't say this, but apparently, it should be ASCII now | ||||||||||
try: | ||||||||||
label = label.encode("ascii") | ||||||||||
except UnicodeError: | ||||||||||
raise UnicodeError("Invalid character in IDN label") | ||||||||||
except UnicodeEncodeError as exc: | ||||||||||
if isinstance(label, bytes): | ||||||||||
label = label.decode("utf-8", errors="backslashreplace") | ||||||||||
raise UnicodeEncodeError("idna", label, exc.start, exc.end, "Invalid character in IDN label") | ||||||||||
# Step 3: Check for ACE prefix | ||||||||||
if not label.startswith(ace_prefix): | ||||||||||
return str(label, "ascii") | ||||||||||
|
@@ -144,7 +158,7 @@ def ToUnicode(label): | |||||||||
# Step 7: Compare the result of step 6 with the one of step 3 | ||||||||||
# label2 will already be in lower case. | ||||||||||
if str(label, "ascii").lower() != str(label2, "ascii"): | ||||||||||
raise UnicodeError("IDNA does not round-trip", label, label2) | ||||||||||
raise UnicodeEncodeError("idna", label, 0, len(label), f"IDNA does not round-trip, '{label!r}' != '{label2!r}'") | ||||||||||
|
||||||||||
# Step 8: return the result of step 5 | ||||||||||
return result | ||||||||||
|
@@ -156,7 +170,7 @@ def encode(self, input, errors='strict'): | |||||||||
|
||||||||||
if errors != 'strict': | ||||||||||
# IDNA is quite clear that implementations must be strict | ||||||||||
raise UnicodeError("unsupported error handling "+errors) | ||||||||||
raise UnicodeEncodeError("idna", input, 0, 1, f"unsupported error handling {errors}") | ||||||||||
|
||||||||||
if not input: | ||||||||||
return b'', 0 | ||||||||||
|
@@ -168,11 +182,13 @@ def encode(self, input, errors='strict'): | |||||||||
else: | ||||||||||
# ASCII name: fast path | ||||||||||
labels = result.split(b'.') | ||||||||||
index = 0 | ||||||||||
for label in labels[:-1]: | ||||||||||
if not (0 < len(label) < 64): | ||||||||||
raise UnicodeError("label empty or too long") | ||||||||||
raise UnicodeEncodeError("idna", input, index, index+len(label), "label empty or too long") | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I agree with using an absolute offset, but it still reports a weird range for an empty label. I think that it is better to include an opening or closing dot. Also, I think that it is better to use different messages for empty and too-long labels. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Split out the empty vs. too-long cases. Note that |
||||||||||
index += len(label) + 1 | ||||||||||
if len(labels[-1]) >= 64: | ||||||||||
raise UnicodeError("label too long") | ||||||||||
raise UnicodeEncodeError("idna", input, index, len(input), "label too long") | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed the line breaks of some of the longer exceptions to look like this |
||||||||||
return result, len(input) | ||||||||||
|
||||||||||
result = bytearray() | ||||||||||
|
@@ -186,13 +202,22 @@ def encode(self, input, errors='strict'): | |||||||||
if result: | ||||||||||
# Join with U+002E | ||||||||||
result.extend(b'.') | ||||||||||
result.extend(ToASCII(label)) | ||||||||||
try: | ||||||||||
result.extend(ToASCII(label)) | ||||||||||
except UnicodeEncodeError as exc: | ||||||||||
raise UnicodeEncodeError( | ||||||||||
"idna", | ||||||||||
input, | ||||||||||
len(result) + exc.start, | ||||||||||
len(result) + exc.end, | ||||||||||
exc.reason, | ||||||||||
) | ||||||||||
methane marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
return bytes(result+trailing_dot), len(input) | ||||||||||
|
||||||||||
def decode(self, input, errors='strict'): | ||||||||||
|
||||||||||
if errors != 'strict': | ||||||||||
raise UnicodeError("Unsupported error handling "+errors) | ||||||||||
raise UnicodeDecodeError("idna", input, 0, 1, f"Unsupported error handling {errors}") | ||||||||||
|
||||||||||
if not input: | ||||||||||
return "", 0 | ||||||||||
|
@@ -219,15 +244,27 @@ def decode(self, input, errors='strict'): | |||||||||
|
||||||||||
result = [] | ||||||||||
for label in labels: | ||||||||||
result.append(ToUnicode(label)) | ||||||||||
try: | ||||||||||
u_label = ToUnicode(label) | ||||||||||
except UnicodeEncodeError as exc: | ||||||||||
len_result = sum(len(x) for x in result) + len(result) | ||||||||||
raise UnicodeDecodeError( | ||||||||||
"idna", | ||||||||||
input, | ||||||||||
len_result + exc.start, | ||||||||||
len_result + exc.end, | ||||||||||
exc.reason, | ||||||||||
) | ||||||||||
else: | ||||||||||
result.append(u_label) | ||||||||||
|
||||||||||
return ".".join(result)+trailing_dot, len(input) | ||||||||||
|
||||||||||
class IncrementalEncoder(codecs.BufferedIncrementalEncoder): | ||||||||||
def _buffer_encode(self, input, errors, final): | ||||||||||
if errors != 'strict': | ||||||||||
# IDNA is quite clear that implementations must be strict | ||||||||||
raise UnicodeError("unsupported error handling "+errors) | ||||||||||
raise UnicodeEncodeError("idna", input, 0, 1, f"Unsupported error handling {errors}") | ||||||||||
|
||||||||||
if not input: | ||||||||||
return (b'', 0) | ||||||||||
|
@@ -251,7 +288,16 @@ def _buffer_encode(self, input, errors, final): | |||||||||
# Join with U+002E | ||||||||||
result.extend(b'.') | ||||||||||
size += 1 | ||||||||||
result.extend(ToASCII(label)) | ||||||||||
try: | ||||||||||
result.extend(ToASCII(label)) | ||||||||||
except UnicodeEncodeError as exc: | ||||||||||
raise UnicodeEncodeError( | ||||||||||
"idna", | ||||||||||
input, | ||||||||||
size + exc.start, | ||||||||||
size + exc.end, | ||||||||||
exc.reason, | ||||||||||
) | ||||||||||
size += len(label) | ||||||||||
|
||||||||||
result += trailing_dot | ||||||||||
|
@@ -261,7 +307,7 @@ def _buffer_encode(self, input, errors, final): | |||||||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder): | ||||||||||
def _buffer_decode(self, input, errors, final): | ||||||||||
if errors != 'strict': | ||||||||||
raise UnicodeError("Unsupported error handling "+errors) | ||||||||||
raise UnicodeDecodeError("idna", input, 0, 1, "Unsupported error handling {errors}") | ||||||||||
|
||||||||||
if not input: | ||||||||||
return ("", 0) | ||||||||||
|
@@ -288,7 +334,18 @@ def _buffer_decode(self, input, errors, final): | |||||||||
result = [] | ||||||||||
size = 0 | ||||||||||
for label in labels: | ||||||||||
result.append(ToUnicode(label)) | ||||||||||
try: | ||||||||||
u_label = ToUnicode(label) | ||||||||||
except UnicodeEncodeError as exc: | ||||||||||
raise UnicodeDecodeError( | ||||||||||
"idna", | ||||||||||
input, | ||||||||||
size + exc.start, | ||||||||||
size + exc.end, | ||||||||||
exc.reason, | ||||||||||
) | ||||||||||
else: | ||||||||||
result.append(u_label) | ||||||||||
if size: | ||||||||||
size += 1 | ||||||||||
size += len(label) | ||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
UnicodeEncodeError implies that there is an error in the encoded string.
Other codecs usually raise LookupError for unknown error handlers, but here an exception is raised even for known error handlers.
Perhaps ValueError better suits here. UnicodeError is a subclass of ValueError, so we can keep it for now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed the cases where there was not an issue with the input string back to `UnicodeError`.