python · vstinner · Oct 8, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024
@@ -8,6 +8,7 @@
 import weakref
 import errno
 from codecs import BOM_UTF8
+from itertools import product
 from textwrap import dedent
 
 from test.support import (captured_stderr, check_impl_detail,
@@ -1336,6 +1337,26 @@ def test_unicode_errors_no_object(self):
         for klass in klasses:
             self.assertEqual(str(klass.__new__(klass)), "")
 
+    def test_unicode_error_str_gh_123378(self):
+        for formatter, start, end, obj in product(
+            (str, repr),
+            range(-5, 5),
+            range(-5, 5),
+            ('', 'a', '123', '1234', '12345', 'abc123'),
-            ('', 'a', '123', '1234', '12345', 'abc123'),
+            ('', 'a', 'abc', 'abcde', 'abcdef'),
-            ('', 'a', '123', '1234', '12345', 'abc123'),
+            ('', 'a', 'abc', 'abcde', 'abcdef'),
+        ):
+            with self.subTest(formatter, obj=obj, start=start, end=end):
+                exc = UnicodeEncodeError('utf-8', obj, start, end, '')
+                self.assertIsInstance(formatter(exc), str)
+
+            with self.subTest(formatter, obj=obj, start=start, end=end):
+                exc = UnicodeTranslateError(obj, start, end, '')
+                self.assertIsInstance(formatter(exc), str)
+
+            encoded = obj.encode()
+            with self.subTest(formatter, obj=encoded, start=start, end=end):
+                exc = UnicodeDecodeError('utf-8', encoded, start, end, '')
+                self.assertIsInstance(formatter(exc), str)
+
     @no_tracing
     def test_badisinstance(self):
         # Bug #2542: if issubclass(e, MyException) raises an exception,

diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-10-03-14-39-41.gh-issue-123378.dCxANf.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-10-03-14-39-41.gh-issue-123378.dCxANf.rst
@@ -0,0 +1,3 @@
+Fix a crash in the :meth:`~object.__str__` method of :exc:`UnicodeError`
+objects when the :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
+values are invalid or out-of-range. Patch by Bénédikt Tran.
@@ -2994,26 +2994,29 @@ UnicodeEncodeError_init(PyObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 UnicodeEncodeError_str(PyObject *self)
 {
-    PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
+    PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
     PyObject *result = NULL;
     PyObject *reason_str = NULL;
     PyObject *encoding_str = NULL;
 
-    if (!uself->object)
+    if (exc->object == NULL)
         /* Not properly initialized. */
         return PyUnicode_FromString("");
 
     /* Get reason and encoding as strings, which they might not be if
        they've been modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
+    reason_str = PyObject_Str(exc->reason);
     if (reason_str == NULL)
         goto done;
-    encoding_str = PyObject_Str(uself->encoding);
+    encoding_str = PyObject_Str(exc->encoding);
     if (encoding_str == NULL)
         goto done;
 
-    if (uself->start < PyUnicode_GET_LENGTH(uself->object) && uself->end == uself->start+1) {
-        Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
+    Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);
+    Py_ssize_t start = exc->start, end = exc->end;
+
+    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
+        Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
         const char *fmt;
         if (badchar <= 0xff)
             fmt = "'%U' codec can't encode character '\\x%02x' in position %zd: %U";
@@ -3025,15 +3028,15 @@ UnicodeEncodeError_str(PyObject *self)
             fmt,
             encoding_str,
             (int)badchar,
-            uself->start,
+            start,
             reason_str);
     }
     else {
         result = PyUnicode_FromFormat(
             "'%U' codec can't encode characters in position %zd-%zd: %U",
             encoding_str,
-            uself->start,
-            uself->end-1,
+            start,
+            end - 1,
             reason_str);
     }
 done:
@@ -3107,41 +3110,43 @@ UnicodeDecodeError_init(PyObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 UnicodeDecodeError_str(PyObject *self)
 {
-    PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
+    PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
     PyObject *result = NULL;
     PyObject *reason_str = NULL;
     PyObject *encoding_str = NULL;
 
-    if (!uself->object)
+    if (exc->object == NULL)
         /* Not properly initialized. */
         return PyUnicode_FromString("");
 
     /* Get reason and encoding as strings, which they might not be if
        they've been modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
+    reason_str = PyObject_Str(exc->reason);
     if (reason_str == NULL)
         goto done;
-    encoding_str = PyObject_Str(uself->encoding);
+    encoding_str = PyObject_Str(exc->encoding);
     if (encoding_str == NULL)
         goto done;
 
-    if (uself->start < PyBytes_GET_SIZE(uself->object) && uself->end == uself->start+1) {
-        int byte = (int)(PyBytes_AS_STRING(((PyUnicodeErrorObject *)self)->object)[uself->start]&0xff);
+    Py_ssize_t len = PyBytes_GET_SIZE(exc->object);
+    Py_ssize_t start = exc->start, end = exc->end;
+
+    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
+        int badbyte = (int)(PyBytes_AS_STRING(exc->object)[start] & 0xff);
         result = PyUnicode_FromFormat(
             "'%U' codec can't decode byte 0x%02x in position %zd: %U",
             encoding_str,
-            byte,
-            uself->start,
+            badbyte,
+            start,
             reason_str);
     }
     else {
         result = PyUnicode_FromFormat(
             "'%U' codec can't decode bytes in position %zd-%zd: %U",
             encoding_str,
-            uself->start,
-            uself->end-1,
-            reason_str
-            );
+            start,
+            end - 1,
+            reason_str);
     }
 done:
     Py_XDECREF(reason_str);
@@ -3204,22 +3209,25 @@ UnicodeTranslateError_init(PyUnicodeErrorObject *self, PyObject *args,
 static PyObject *
 UnicodeTranslateError_str(PyObject *self)
 {
-    PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
+    PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
     PyObject *result = NULL;
     PyObject *reason_str = NULL;
 
-    if (!uself->object)
+    if (exc->object == NULL)
         /* Not properly initialized. */
         return PyUnicode_FromString("");
 
     /* Get reason as a string, which it might not be if it's been
        modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
+    reason_str = PyObject_Str(exc->reason);
     if (reason_str == NULL)
         goto done;
 
-    if (uself->start < PyUnicode_GET_LENGTH(uself->object) && uself->end == uself->start+1) {
-        Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
+    Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);
+    Py_ssize_t start = exc->start, end = exc->end;
+
+    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
+        Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
         const char *fmt;
         if (badchar <= 0xff)
             fmt = "can't translate character '\\x%02x' in position %zd: %U";
@@ -3230,16 +3238,14 @@ UnicodeTranslateError_str(PyObject *self)
         result = PyUnicode_FromFormat(
             fmt,
             (int)badchar,
-            uself->start,
-            reason_str
-        );
+            start,
+            reason_str);
     } else {
         result = PyUnicode_FromFormat(
             "can't translate characters in position %zd-%zd: %U",
-            uself->start,
-            uself->end-1,
-            reason_str
-            );
+            start,
+            end - 1,
+            reason_str);
     }
 done:
     Py_XDECREF(reason_str);