Skip to content

Commit 6eb23b1

Browse files
gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V (GH-120365)
PyUnicode_FromFormat() no longer produces a trailing \ufffd character for a truncated C string when a precision is used with %s or %V. It now truncates the string before the start of a truncated multibyte sequence.
1 parent 22b8a35 commit 6eb23b1

File tree

3 files changed

+59
-4
lines changed

3 files changed

+59
-4
lines changed

Lib/test/test_capi/test_unicode.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -419,8 +419,29 @@ def check_format(expected, format, *args):
419419
# truncated string
420420
check_format('abc',
421421
b'%.3s', b'abcdef')
422+
check_format('abc[',
423+
b'%.6s', 'abc[\u20ac]'.encode('utf8'))
424+
check_format('abc[\u20ac',
425+
b'%.7s', 'abc[\u20ac]'.encode('utf8'))
422426
check_format('abc[\ufffd',
423-
b'%.5s', 'abc[\u20ac]'.encode('utf8'))
427+
b'%.5s', b'abc[\xff]')
428+
check_format('abc[',
429+
b'%.6s', b'abc[\xe2\x82]')
430+
check_format('abc[\ufffd]',
431+
b'%.7s', b'abc[\xe2\x82]')
432+
check_format('abc[\ufffd',
433+
b'%.7s', b'abc[\xe2\x82\0')
434+
check_format(' abc[',
435+
b'%10.6s', 'abc[\u20ac]'.encode('utf8'))
436+
check_format(' abc[\u20ac',
437+
b'%10.7s', 'abc[\u20ac]'.encode('utf8'))
438+
check_format(' abc[\ufffd',
439+
b'%10.5s', b'abc[\xff]')
440+
check_format(' abc[',
441+
b'%10.6s', b'abc[\xe2\x82]')
442+
check_format(' abc[\ufffd]',
443+
b'%10.7s', b'abc[\xe2\x82]')
444+
424445
check_format("'\\u20acABC'",
425446
b'%A', '\u20acABC')
426447
check_format("'\\u20",
@@ -433,10 +454,31 @@ def check_format(expected, format, *args):
433454
b'%.3S', '\u20acABCDEF')
434455
check_format('\u20acAB',
435456
b'%.3U', '\u20acABCDEF')
457+
436458
check_format('\u20acAB',
437459
b'%.3V', '\u20acABCDEF', None)
460+
check_format('abc[',
461+
b'%.6V', None, 'abc[\u20ac]'.encode('utf8'))
462+
check_format('abc[\u20ac',
463+
b'%.7V', None, 'abc[\u20ac]'.encode('utf8'))
438464
check_format('abc[\ufffd',
439-
b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
465+
b'%.5V', None, b'abc[\xff]')
466+
check_format('abc[',
467+
b'%.6V', None, b'abc[\xe2\x82]')
468+
check_format('abc[\ufffd]',
469+
b'%.7V', None, b'abc[\xe2\x82]')
470+
check_format(' abc[',
471+
b'%10.6V', None, 'abc[\u20ac]'.encode('utf8'))
472+
check_format(' abc[\u20ac',
473+
b'%10.7V', None, 'abc[\u20ac]'.encode('utf8'))
474+
check_format(' abc[\ufffd',
475+
b'%10.5V', None, b'abc[\xff]')
476+
check_format(' abc[',
477+
b'%10.6V', None, b'abc[\xe2\x82]')
478+
check_format(' abc[\ufffd]',
479+
b'%10.7V', None, b'abc[\xe2\x82]')
480+
check_format(' abc[\ufffd',
481+
b'%10.7V', None, b'abc[\xe2\x82\0')
440482

441483
# the following tests come from #7330
442484
# test width modifier and precision modifier with %S
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
:c:func:`PyUnicode_FromFormat` no longer produces the ending ``\ufffd``
2+
character for a truncated C string when precision is used with ``%s`` and ``%V``.
3+
It now truncates the string before the start of truncated multibyte
4+
sequences.

Objects/unicodeobject.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2581,6 +2581,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
25812581
Py_ssize_t width, Py_ssize_t precision, int flags)
25822582
{
25832583
/* UTF-8 */
2584+
Py_ssize_t *pconsumed = NULL;
25842585
Py_ssize_t length;
25852586
if (precision == -1) {
25862587
length = strlen(str);
@@ -2590,15 +2591,23 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
25902591
while (length < precision && str[length]) {
25912592
length++;
25922593
}
2594+
if (length == precision) {
2595+
/* The input string is not NUL-terminated. If it ends with an
2596+
* incomplete UTF-8 sequence, truncate the string just before it.
2597+
* Incomplete sequences in the middle and sequences which cannot
2598+
* be valid prefixes are still treated as errors and replaced
2599+
* with U+FFFD (the replacement character). */
2600+
pconsumed = &length;
2601+
}
25932602
}
25942603

25952604
if (width < 0) {
25962605
return unicode_decode_utf8_writer(writer, str, length,
2597-
_Py_ERROR_REPLACE, "replace", NULL);
2606+
_Py_ERROR_REPLACE, "replace", pconsumed);
25982607
}
25992608

26002609
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2601-
"replace", NULL);
2610+
"replace", pconsumed);
26022611
if (unicode == NULL)
26032612
return -1;
26042613

0 commit comments

Comments
 (0)