python · methane · Mar 17, 2024 · Jan 3, 2024 · Jan 3, 2024 · Jan 3, 2024
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
@@ -25,7 +25,7 @@ def nameprep(label):
     label = unicodedata.normalize("NFKC", label)
 
     # Prohibit
-    for c in label:
+    for i, c in enumerate(label):
         if stringprep.in_table_c12(c) or \
            stringprep.in_table_c22(c) or \
            stringprep.in_table_c3(c) or \
@@ -35,7 +35,7 @@ def nameprep(label):
            stringprep.in_table_c7(c) or \
            stringprep.in_table_c8(c) or \
            stringprep.in_table_c9(c):
-            raise UnicodeError("Invalid character %r" % c)
+            raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}")
 
     # Check bidi
     RandAL = [stringprep.in_table_d1(x) for x in label]
@@ -46,29 +46,35 @@ def nameprep(label):
         # This is table C.8, which was already checked
         # 2) If a string contains any RandALCat character, the string
         # MUST NOT contain any LCat character.
-        if any(stringprep.in_table_d2(x) for x in label):
-            raise UnicodeError("Violation of BIDI requirement 2")
+        for i, x in enumerate(label):
+            if stringprep.in_table_d2(x):
+                raise UnicodeEncodeError("idna", label, i, i+1, "Violation of BIDI requirement 2")
         # 3) If a string contains any RandALCat character, a
         # RandALCat character MUST be the first character of the
         # string, and a RandALCat character MUST be the last
         # character of the string.
-        if not RandAL[0] or not RandAL[-1]:
-            raise UnicodeError("Violation of BIDI requirement 3")
+        if not RandAL[0]:
+            raise UnicodeEncodeError(
+                "idna", label,
+                0, 1, "Violation of BIDI requirement 3")
+        if not RandAL[-1]:
+            raise UnicodeEncodeError("idna", label, len(label)-1, len(label), "Violation of BIDI requirement 3")
 
     return label
 
 def ToASCII(label):
     try:
         # Step 1: try ASCII
         label = label.encode("ascii")
-    except UnicodeError:
+    except UnicodeEncodeError:
         pass
     else:
         # Skip to step 3: UseSTD3ASCIIRules is false, so
         # Skip to step 8.
         if 0 < len(label) < 64:
             return label
-        raise UnicodeError("label empty or too long")
+        label = label.decode("ascii", errors="backslashreplace")
+        raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long")
 
     # Step 2: nameprep
     label = nameprep(label)
@@ -77,17 +83,20 @@ def ToASCII(label):
     # Step 4: try ASCII
     try:
         label = label.encode("ascii")
-    except UnicodeError:
+    except UnicodeEncodeError:
         pass
     else:
         # Skip to step 8.
         if 0 < len(label) < 64:
             return label
-        raise UnicodeError("label empty or too long")
+        label = label.decode("ascii", errors="backslashreplace")
+        raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long")
 
     # Step 5: Check ACE prefix
     if label.startswith(sace_prefix):
-        raise UnicodeError("Label starts with ACE prefix")
+        raise UnicodeEncodeError(
+            "idna", label,  # still a str here: the Step 4 ASCII encode failed
+            0, len(sace_prefix), "Label starts with ACE prefix")
 
     # Step 6: Encode with PUNYCODE
     label = label.encode("punycode")
@@ -98,7 +107,8 @@ def ToASCII(label):
     # Step 8: Check size
     if 0 < len(label) < 64:
         return label
-    raise UnicodeError("label empty or too long")
+    label = label[len(ace_prefix):].decode("punycode", errors="replace")
+    raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long")
 
 def ToUnicode(label):
     if len(label) > 1024:
@@ -110,24 +120,28 @@ def ToUnicode(label):
         # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
         # preventing us from wasting time decoding a big thing that'll just
         # hit the actual <= 63 length limit in Step 6.
-        raise UnicodeError("label way too long")
+        if isinstance(label, bytes):
+            label = label.decode("utf-8", errors="backslashreplace")
+        raise UnicodeEncodeError("idna", label, 0, len(label), "label way too long")
     # Step 1: Check for ASCII
     if isinstance(label, bytes):
         pure_ascii = True
     else:
         try:
             label = label.encode("ascii")
             pure_ascii = True
-        except UnicodeError:
+        except UnicodeEncodeError:
             pure_ascii = False
     if not pure_ascii:
         # Step 2: Perform nameprep
         label = nameprep(label)
         # It doesn't say this, but apparently, it should be ASCII now
         try:
             label = label.encode("ascii")
-        except UnicodeError:
-            raise UnicodeError("Invalid character in IDN label")
+        except UnicodeEncodeError as exc:
+            if isinstance(label, bytes):
+                label = label.decode("utf-8", errors="backslashreplace")
+            raise UnicodeEncodeError("idna", label, exc.start, exc.end, "Invalid character in IDN label")
     # Step 3: Check for ACE prefix
     if not label.startswith(ace_prefix):
         return str(label, "ascii")
@@ -144,7 +158,7 @@ def ToUnicode(label):
     # Step 7: Compare the result of step 6 with the one of step 3
     # label2 will already be in lower case.
     if str(label, "ascii").lower() != str(label2, "ascii"):
-        raise UnicodeError("IDNA does not round-trip", label, label2)
+        raise UnicodeEncodeError("idna", str(label, "ascii"), 0, len(label), f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")
 
     # Step 8: return the result of step 5
     return result
@@ -156,7 +170,7 @@ def encode(self, input, errors='strict'):
 
         if errors != 'strict':
             # IDNA is quite clear that implementations must be strict
-            raise UnicodeError("unsupported error handling "+errors)
+            raise UnicodeEncodeError("idna", input, 0, 1, f"unsupported error handling {errors}")
 
         if not input:
             return b'', 0
@@ -168,11 +182,13 @@ def encode(self, input, errors='strict'):
         else:
             # ASCII name: fast path
             labels = result.split(b'.')
+            index = 0
             for label in labels[:-1]:
                 if not (0 < len(label) < 64):
-                    raise UnicodeError("label empty or too long")
+                    raise UnicodeEncodeError("idna", input, index, index+len(label), "label empty or too long")
+                index += len(label) + 1
             if len(labels[-1]) >= 64:
-                raise UnicodeError("label too long")
+                raise UnicodeEncodeError("idna", input, index, len(input), "label too long")
             return result, len(input)
 
         result = bytearray()
@@ -186,13 +202,22 @@ def encode(self, input, errors='strict'):
             if result:
                 # Join with U+002E
                 result.extend(b'.')
-            result.extend(ToASCII(label))
+            try:
+                result.extend(ToASCII(label))
+            except UnicodeEncodeError as exc:
+                raise UnicodeEncodeError(
+                    "idna",
+                    input,
+                    len(result) + exc.start,
+                    len(result) + exc.end,
+                    exc.reason,
+                )
         return bytes(result+trailing_dot), len(input)
 
     def decode(self, input, errors='strict'):
 
         if errors != 'strict':
-            raise UnicodeError("Unsupported error handling "+errors)
+            raise UnicodeDecodeError("idna", input, 0, 1, f"Unsupported error handling {errors}")
 
         if not input:
             return "", 0
@@ -219,15 +244,27 @@ def decode(self, input, errors='strict'):
 
         result = []
         for label in labels:
-            result.append(ToUnicode(label))
+            try:
+                u_label = ToUnicode(label)
+            except UnicodeEncodeError as exc:
+                len_result = sum(len(x) + 1 for x in labels[:len(result)])  # offset into input, not decoded text
+                raise UnicodeDecodeError(
+                    "idna",
+                    input,
+                    len_result + exc.start,
+                    len_result + exc.end,
+                    exc.reason,
+                )
+            else:
+                result.append(u_label)
 
         return ".".join(result)+trailing_dot, len(input)
 
 class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
     def _buffer_encode(self, input, errors, final):
         if errors != 'strict':
             # IDNA is quite clear that implementations must be strict
-            raise UnicodeError("unsupported error handling "+errors)
+            raise UnicodeEncodeError("idna", input, 0, 1, f"Unsupported error handling {errors}")
 
         if not input:
             return (b'', 0)
@@ -251,7 +288,16 @@ def _buffer_encode(self, input, errors, final):
                 # Join with U+002E
                 result.extend(b'.')
                 size += 1
-            result.extend(ToASCII(label))
+            try:
+                result.extend(ToASCII(label))
+            except UnicodeEncodeError as exc:
+                raise UnicodeEncodeError(
+                    "idna",
+                    input,
+                    size + exc.start,
+                    size + exc.end,
+                    exc.reason,
+                )
             size += len(label)
 
         result += trailing_dot
@@ -261,7 +307,7 @@ def _buffer_encode(self, input, errors, final):
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
     def _buffer_decode(self, input, errors, final):
         if errors != 'strict':
-            raise UnicodeError("Unsupported error handling "+errors)
+            raise UnicodeDecodeError("idna", input, 0, 1, f"Unsupported error handling {errors}")
 
         if not input:
             return ("", 0)
@@ -288,7 +334,18 @@ def _buffer_decode(self, input, errors, final):
         result = []
         size = 0
         for label in labels:
-            result.append(ToUnicode(label))
+            try:
+                u_label = ToUnicode(label)
+            except UnicodeEncodeError as exc:
+                raise UnicodeDecodeError(
+                    "idna",
+                    input,
+                    size + exc.start,
+                    size + exc.end,
+                    exc.reason,
+                )
+            else:
+                result.append(u_label)
             if size:
                 size += 1
             size += len(label)

diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py
@@ -1,4 +1,4 @@
-""" Codec for the Punicode encoding, as specified in RFC 3492
+""" Codec for the Punycode encoding, as specified in RFC 3492
 
 Written by Martin v. Löwis.
 """
@@ -134,16 +134,17 @@ def decode_generalized_number(extended, extpos, bias, errors):
             char = ord(extended[extpos])
         except IndexError:
             if errors == "strict":
-                raise UnicodeError("incomplete punicode string")
+                b_extended = extended.encode("utf-8", errors="backslashreplace")
+                raise UnicodeDecodeError("punycode", b_extended, extpos, extpos+1, "incomplete punycode string")
             return extpos + 1, None
         extpos += 1
         if 0x41 <= char <= 0x5A: # A-Z
             digit = char - 0x41
         elif 0x30 <= char <= 0x39:
             digit = char - 22 # 0x30-26
         elif errors == "strict":
-            raise UnicodeError("Invalid extended code point '%s'"
-                               % extended[extpos-1])
+            b_extended = extended.encode("utf-8", errors="backslashreplace")
+            raise UnicodeDecodeError("punycode", b_extended, extpos-1, extpos, f"Invalid extended code point '{extended[extpos-1]}'")
         else:
             return extpos, None
         t = T(j, bias)
@@ -161,8 +162,17 @@ def insertion_sort(base, extended, errors):
     bias = 72
     extpos = 0
     while extpos < len(extended):
-        newpos, delta = decode_generalized_number(extended, extpos,
-                                                  bias, errors)
+        try:
+            newpos, delta = decode_generalized_number(extended, extpos,
+                                                      bias, errors)
+        except UnicodeDecodeError as exc:
+            # exc positions are relative to `extended`; shift past "base-".
+            prefix = base.encode("utf-8", errors="backslashreplace") + b"-"
+            raise UnicodeDecodeError(
+                "punycode",
+                prefix + extended.encode("utf-8", errors="backslashreplace"),
+                len(prefix) + exc.start, len(prefix) + exc.end, exc.reason)
+
         if delta is None:
             # There was an error in decoding. We can't continue because
             # synchronization is lost.
@@ -171,7 +181,12 @@ def insertion_sort(base, extended, errors):
         char += pos // (len(base) + 1)
         if char > 0x10FFFF:
             if errors == "strict":
-                raise UnicodeError("Invalid character U+%x" % char)
+                raise UnicodeDecodeError(
+                    "punycode",
+                    base.encode("utf-8", errors="backslashreplace")
+                        + b"-"
+                        + extended.encode("utf-8", errors="backslashreplace"),
+                    pos, pos+1, f"Invalid character U+{char:x}")
             char = ord('?')
         pos = pos % (len(base) + 1)
         base = base[:pos] + chr(char) + base[pos:]
@@ -203,7 +218,7 @@ def encode(self, input, errors='strict'):
 
     def decode(self, input, errors='strict'):
         if errors not in ('strict', 'replace', 'ignore'):
-            raise UnicodeError("Unsupported error handling "+errors)
+            raise UnicodeDecodeError("punycode", input, 0, 1, f"Unsupported error handling {errors}")
         res = punycode_decode(input, errors)
         return res, len(input)
 
@@ -214,7 +229,7 @@ def encode(self, input, final=False):
 class IncrementalDecoder(codecs.IncrementalDecoder):
     def decode(self, input, final=False):
         if self.errors not in ('strict', 'replace', 'ignore'):
-            raise UnicodeError("Unsupported error handling "+self.errors)
+            raise UnicodeDecodeError("punycode", input, 0, 1, f"Unsupported error handling {self.errors}")
         return punycode_decode(input, self.errors)
 
 class StreamWriter(Codec,codecs.StreamWriter):

diff --git a/Lib/encodings/undefined.py b/Lib/encodings/undefined.py
@@ -1,6 +1,6 @@
 """ Python 'undefined' Codec
 
-    This codec will always raise a ValueError exception when being
+    This codec will always raise a UnicodeError exception when being
     used. It is intended for use by the site.py file to switch off
     automatic string to Unicode coercion.
 
@@ -16,18 +16,18 @@
 class Codec(codecs.Codec):
 
     def encode(self,input,errors='strict'):
-        raise UnicodeError("undefined encoding")
+        raise UnicodeEncodeError("undefined", input, 0, len(input), "undefined encoding")
 
     def decode(self,input,errors='strict'):
-        raise UnicodeError("undefined encoding")
+        raise UnicodeDecodeError("undefined", input, 0, len(input), "undefined encoding")
 
 class IncrementalEncoder(codecs.IncrementalEncoder):
     def encode(self, input, final=False):
-        raise UnicodeError("undefined encoding")
+        raise UnicodeEncodeError("undefined", input, 0, len(input), "undefined encoding")
 
 class IncrementalDecoder(codecs.IncrementalDecoder):
     def decode(self, input, final=False):
-        raise UnicodeError("undefined encoding")
+        raise UnicodeDecodeError("undefined", input, 0, len(input), "undefined encoding")
 
 class StreamWriter(Codec,codecs.StreamWriter):
     pass

diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py
@@ -64,7 +64,7 @@ def _buffer_decode(self, input, errors, final):
             elif byteorder == 1:
                 self.decoder = codecs.utf_16_be_decode
             elif consumed >= 2:
-                raise UnicodeError("UTF-16 stream does not start with BOM")
+                raise UnicodeDecodeError("utf-16", input, 0, 2, "Stream does not start with BOM")
             return (output, consumed)
         return self.decoder(input, self.errors, final)
 
@@ -138,7 +138,7 @@ def decode(self, input, errors='strict'):
         elif byteorder == 1:
             self.decode = codecs.utf_16_be_decode
         elif consumed>=2:
-            raise UnicodeError("UTF-16 stream does not start with BOM")
+            raise UnicodeDecodeError("utf-16", input, 0, 2, "Stream does not start with BOM")
         return (object, consumed)
 
 ### encodings module API