From e9692697d6778a0b5b4744ea2f2cc2c68810832c Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Tue, 11 Jun 2024 21:38:36 +0300
Subject: [PATCH 1/3] gh-70278: Fix PyUnicode_FromFormat() with precision for
 %s and %V

PyUnicode_FromFormat() no longer produces a trailing \ufffd character
when the precision given with %s or %V cuts a C string in the middle of
a multibyte UTF-8 sequence.  It now truncates the string before the
start of the incomplete sequence.
---
 Lib/test/test_capi/test_unicode.py            | 42 ++++++++++++++++++-
 ...4-06-11-21-38-32.gh-issue-70278.WDE4zM.rst |  4 ++
 Objects/unicodeobject.c                       |  8 +++-
 3 files changed, 50 insertions(+), 4 deletions(-)
 create mode 100644 Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
index a69f817c515ba7..d5a8ada8381e32 100644
--- a/Lib/test/test_capi/test_unicode.py
+++ b/Lib/test/test_capi/test_unicode.py
@@ -415,8 +415,27 @@ def check_format(expected, format, *args):
         # truncated string
         check_format('abc',
                      b'%.3s', b'abcdef')
+        check_format('abc[',
+                     b'%.6s', 'abc[\u20ac]'.encode('utf8'))
+        check_format('abc[\u20ac',
+                     b'%.7s', 'abc[\u20ac]'.encode('utf8'))
         check_format('abc[\ufffd',
-                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
+                     b'%.5s', b'abc[\xff]')
+        check_format('abc[',
+                     b'%.6s', b'abc[\xe2\x82]')
+        check_format('abc[\ufffd]',
+                     b'%.7s', b'abc[\xe2\x82]')
+        check_format('      abc[',
+                     b'%10.6s', 'abc[\u20ac]'.encode('utf8'))
+        check_format('     abc[\u20ac',
+                     b'%10.7s', 'abc[\u20ac]'.encode('utf8'))
+        check_format('     abc[\ufffd',
+                     b'%10.5s', b'abc[\xff]')
+        check_format('      abc[',
+                     b'%10.6s', b'abc[\xe2\x82]')
+        check_format('    abc[\ufffd]',
+                     b'%10.7s', b'abc[\xe2\x82]')
+
         check_format("'\\u20acABC'",
                      b'%A', '\u20acABC')
         check_format("'\\u20",
@@ -429,10 +448,29 @@ def check_format(expected, format, *args):
                      b'%.3S', '\u20acABCDEF')
         check_format('\u20acAB',
                      b'%.3U', '\u20acABCDEF')
+
         check_format('\u20acAB',
                      b'%.3V', '\u20acABCDEF', None)
+        check_format('abc[',
+                     b'%.6V', None, 'abc[\u20ac]'.encode('utf8'))
+        check_format('abc[\u20ac',
+                     b'%.7V', None, 'abc[\u20ac]'.encode('utf8'))
         check_format('abc[\ufffd',
-                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
+                     b'%.5V', None, b'abc[\xff]')
+        check_format('abc[',
+                     b'%.6V', None, b'abc[\xe2\x82]')
+        check_format('abc[\ufffd]',
+                     b'%.7V', None, b'abc[\xe2\x82]')
+        check_format('      abc[',
+                     b'%10.6V', None, 'abc[\u20ac]'.encode('utf8'))
+        check_format('     abc[\u20ac',
+                     b'%10.7V', None, 'abc[\u20ac]'.encode('utf8'))
+        check_format('     abc[\ufffd',
+                     b'%10.5V', None, b'abc[\xff]')
+        check_format('      abc[',
+                     b'%10.6V', None, b'abc[\xe2\x82]')
+        check_format('    abc[\ufffd]',
+                     b'%10.7V', None, b'abc[\xe2\x82]')
 
         # following tests comes from #7330
         # test width modifier and precision modifier with %S
diff --git a/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst
new file mode 100644
index 00000000000000..1eca36a86bc97e
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst	
@@ -0,0 +1,4 @@
+:c:func:`PyUnicode_FromFormat` no longer produces a trailing ``\ufffd``
+character when the precision given with ``%s`` or ``%V`` cuts a C string
+in the middle of a multibyte UTF-8 sequence.  It now truncates the string
+before the start of the incomplete sequence.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3b0b4173408724..c3182490289886 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2390,23 +2390,27 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
 {
     /* UTF-8 */
     Py_ssize_t length;
+    Py_ssize_t consumed;
+    Py_ssize_t *pconsumed;
     if (precision == -1) {
         length = strlen(str);
+        pconsumed = NULL;
     }
     else {
         length = 0;
         while (length < precision && str[length]) {
             length++;
         }
+        pconsumed = &consumed;
     }
 
     if (width < 0) {
         return unicode_decode_utf8_writer(writer, str, length,
-                                          _Py_ERROR_REPLACE, "replace", NULL);
+                                          _Py_ERROR_REPLACE, "replace", pconsumed);
     }
 
     PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
-                                                     "replace", NULL);
+                                                     "replace", pconsumed);
     if (unicode == NULL)
         return -1;
 

From 6f80f9c98766a76b3e121784f17d6c9621bda860 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Tue, 11 Jun 2024 23:31:01 +0300
Subject: [PATCH 2/3] Fix a case when the precision is larger than strlen().

---
 Lib/test/test_capi/test_unicode.py | 4 ++++
 Objects/unicodeobject.c            | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
index d5a8ada8381e32..d964fb01baa198 100644
--- a/Lib/test/test_capi/test_unicode.py
+++ b/Lib/test/test_capi/test_unicode.py
@@ -425,6 +425,8 @@ def check_format(expected, format, *args):
                      b'%.6s', b'abc[\xe2\x82]')
         check_format('abc[\ufffd]',
                      b'%.7s', b'abc[\xe2\x82]')
+        check_format('abc[\ufffd',
+                     b'%.7s', b'abc[\xe2\x82\0')
         check_format('      abc[',
                      b'%10.6s', 'abc[\u20ac]'.encode('utf8'))
         check_format('     abc[\u20ac',
@@ -471,6 +473,8 @@ def check_format(expected, format, *args):
                      b'%10.6V', None, b'abc[\xe2\x82]')
         check_format('    abc[\ufffd]',
                      b'%10.7V', None, b'abc[\xe2\x82]')
+        check_format('     abc[\ufffd',
+                     b'%10.7V', None, b'abc[\xe2\x82\0')
 
         # following tests comes from #7330
         # test width modifier and precision modifier with %S
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c3182490289886..9194d2182c70d4 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2401,7 +2401,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
         while (length < precision && str[length]) {
             length++;
         }
-        pconsumed = &consumed;
+        pconsumed = (length < precision) ? NULL : &consumed;
     }
 
     if (width < 0) {

From 4000cdfd192fd9a32e29a120581bfbc79da03019 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Wed, 12 Jun 2024 13:32:41 +0300
Subject: [PATCH 3/3] Add a comment and refactor.

---
 Objects/unicodeobject.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 9194d2182c70d4..f3f47c4c62630c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2389,19 +2389,24 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
                               Py_ssize_t width, Py_ssize_t precision, int flags)
 {
     /* UTF-8 */
+    Py_ssize_t *pconsumed = NULL;
     Py_ssize_t length;
-    Py_ssize_t consumed;
-    Py_ssize_t *pconsumed;
     if (precision == -1) {
         length = strlen(str);
-        pconsumed = NULL;
     }
     else {
         length = 0;
         while (length < precision && str[length]) {
             length++;
         }
-        pconsumed = (length < precision) ? NULL : &consumed;
+        if (length == precision) {
+            /* The input string is not NUL-terminated.  If it ends with an
+             * incomplete UTF-8 sequence, truncate the string just before it.
+             * Incomplete sequences in the middle and sequences which cannot
+             * be valid prefixes are still treated as errors and replaced
+             * with U+FFFD. */
+            pconsumed = &length;
+        }
     }
 
     if (width < 0) {