From e9692697d6778a0b5b4744ea2f2cc2c68810832c Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 11 Jun 2024 21:38:36 +0300 Subject: [PATCH 1/3] gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V PyUnicode_FromFormat() no longer produces the ending \ufffd character for a truncated C string when a precision is used with %s and %V. It now truncates the string before the start of a truncated multibyte sequence. --- Lib/test/test_capi/test_unicode.py | 42 ++++++++++++++++++- ...4-06-11-21-38-32.gh-issue-70278.WDE4zM.rst | 4 ++ Objects/unicodeobject.c | 8 +++- 3 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a69f817c515ba7..d5a8ada8381e32 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -415,8 +415,27 @@ def check_format(expected, format, *args): # truncated string check_format('abc', b'%.3s', b'abcdef') + check_format('abc[', + b'%.6s', 'abc[\u20ac]'.encode('utf8')) + check_format('abc[\u20ac', + b'%.7s', 'abc[\u20ac]'.encode('utf8')) check_format('abc[\ufffd', - b'%.5s', 'abc[\u20ac]'.encode('utf8')) + b'%.5s', b'abc[\xff]') + check_format('abc[', + b'%.6s', b'abc[\xe2\x82]') + check_format('abc[\ufffd]', + b'%.7s', b'abc[\xe2\x82]') + check_format(' abc[', + b'%10.6s', 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\u20ac', + b'%10.7s', 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\ufffd', + b'%10.5s', b'abc[\xff]') + check_format(' abc[', + b'%10.6s', b'abc[\xe2\x82]') + check_format(' abc[\ufffd]', + b'%10.7s', b'abc[\xe2\x82]') + check_format("'\\u20acABC'", b'%A', '\u20acABC') check_format("'\\u20", @@ -429,10 +448,29 @@ def check_format(expected, format, *args): b'%.3S', '\u20acABCDEF') check_format('\u20acAB', b'%.3U', '\u20acABCDEF') + check_format('\u20acAB', b'%.3V', '\u20acABCDEF', None) + check_format('abc[', + 
b'%.6V', None, 'abc[\u20ac]'.encode('utf8')) + check_format('abc[\u20ac', + b'%.7V', None, 'abc[\u20ac]'.encode('utf8')) check_format('abc[\ufffd', - b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) + b'%.5V', None, b'abc[\xff]') + check_format('abc[', + b'%.6V', None, b'abc[\xe2\x82]') + check_format('abc[\ufffd]', + b'%.7V', None, b'abc[\xe2\x82]') + check_format(' abc[', + b'%10.6V', None, 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\u20ac', + b'%10.7V', None, 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\ufffd', + b'%10.5V', None, b'abc[\xff]') + check_format(' abc[', + b'%10.6V', None, b'abc[\xe2\x82]') + check_format(' abc[\ufffd]', + b'%10.7V', None, b'abc[\xe2\x82]') # following tests comes from #7330 # test width modifier and precision modifier with %S diff --git a/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst new file mode 100644 index 00000000000000..1eca36a86bc97e --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst @@ -0,0 +1,4 @@ +:c:func:`PyUnicode_FromFormat` no longer produces the ending ``\ufffd`` +character for a truncated C string when a precision is used with ``%s`` +and ``%V``. It now truncates the string before the start of a truncated +multibyte sequence. 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3b0b4173408724..c3182490289886 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2390,23 +2390,27 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, { /* UTF-8 */ Py_ssize_t length; + Py_ssize_t consumed; + Py_ssize_t *pconsumed; if (precision == -1) { length = strlen(str); + pconsumed = NULL; } else { length = 0; while (length < precision && str[length]) { length++; } + pconsumed = &consumed; } if (width < 0) { return unicode_decode_utf8_writer(writer, str, length, - _Py_ERROR_REPLACE, "replace", NULL); + _Py_ERROR_REPLACE, "replace", pconsumed); } PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length, - "replace", NULL); + "replace", pconsumed); if (unicode == NULL) return -1; From 6f80f9c98766a76b3e121784f17d6c9621bda860 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 11 Jun 2024 23:31:01 +0300 Subject: [PATCH 2/3] Fix a case when the precision is larger than strlen(). 
--- Lib/test/test_capi/test_unicode.py | 4 ++++ Objects/unicodeobject.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index d5a8ada8381e32..d964fb01baa198 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -425,6 +425,8 @@ def check_format(expected, format, *args): b'%.6s', b'abc[\xe2\x82]') check_format('abc[\ufffd]', b'%.7s', b'abc[\xe2\x82]') + check_format('abc[\ufffd', + b'%.7s', b'abc[\xe2\x82\0') check_format(' abc[', b'%10.6s', 'abc[\u20ac]'.encode('utf8')) check_format(' abc[\u20ac', @@ -471,6 +473,8 @@ def check_format(expected, format, *args): b'%10.6V', None, b'abc[\xe2\x82]') check_format(' abc[\ufffd]', b'%10.7V', None, b'abc[\xe2\x82]') + check_format(' abc[\ufffd', + b'%10.7V', None, b'abc[\xe2\x82\0') # following tests comes from #7330 # test width modifier and precision modifier with %S diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c3182490289886..9194d2182c70d4 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2401,7 +2401,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, while (length < precision && str[length]) { length++; } - pconsumed = &consumed; + pconsumed = (length < precision) ? NULL : &consumed; } if (width < 0) { From 4000cdfd192fd9a32e29a120581bfbc79da03019 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 12 Jun 2024 13:32:41 +0300 Subject: [PATCH 3/3] Add a comment and refactor. 
--- Objects/unicodeobject.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9194d2182c70d4..f3f47c4c62630c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2389,19 +2389,24 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, Py_ssize_t width, Py_ssize_t precision, int flags) { /* UTF-8 */ + Py_ssize_t *pconsumed = NULL; Py_ssize_t length; - Py_ssize_t consumed; - Py_ssize_t *pconsumed; if (precision == -1) { length = strlen(str); - pconsumed = NULL; } else { length = 0; while (length < precision && str[length]) { length++; } - pconsumed = (length < precision) ? NULL : &consumed; + if (length == precision) { + /* No NUL byte was found within the precision, so the input may + * end with an incomplete UTF-8 sequence; truncate the string just + * before it. Incomplete sequences in the middle and sequences + * which cannot be valid prefixes are still treated as errors and + * replaced with \ufffd. */ + pconsumed = &length; + } } if (width < 0) {