Skip to content

Commit 4123226

Browse files
gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful() (#120639)
Add PyUnicodeWriter_WriteWideChar() and PyUnicodeWriter_DecodeUTF8Stateful() functions. Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent aed31be commit 4123226

File tree

5 files changed

+332
-68
lines changed

5 files changed

+332
-68
lines changed

Doc/c-api/unicode.rst

+32-3
Original file line numberDiff line numberDiff line change
@@ -1551,9 +1551,17 @@ object.
15511551
On success, return ``0``.
15521552
On error, set an exception, leave the writer unchanged, and return ``-1``.
15531553
1554-
To use a different error handler than ``strict``,
1555-
:c:func:`PyUnicode_DecodeUTF8` can be used with
1556-
:c:func:`PyUnicodeWriter_WriteStr`.
1554+
See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
1555+
1556+
.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size)
1557+
1558+
Write the wide string *str* into *writer*.
1559+
1560+
*size* is the number of wide characters. If *size* is equal to ``-1``, call
1561+
``wcslen(str)`` to get the string length.
1562+
1563+
On success, return ``0``.
1564+
On error, set an exception, leave the writer unchanged, and return ``-1``.
15571565
15581566
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
15591567
@@ -1586,3 +1594,24 @@ object.
15861594
15871595
On success, return ``0``.
15881596
On error, set an exception, leave the writer unchanged, and return ``-1``.
1597+
1598+
.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
1599+
1600+
Decode the string *string* from UTF-8 with *errors* error handler and write the
1601+
output into *writer*.
1602+
1603+
*length* is the string length in bytes. If *length* is equal to ``-1``, call
1604+
``strlen(string)`` to get the string length.
1605+
1606+
*errors* is an error handler name, such as ``"replace"``. If *errors* is
1607+
``NULL``, use the strict error handler.
1608+
1609+
If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
1610+
bytes on success.
1611+
If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences
1612+
as an error.
1613+
1614+
On success, return ``0``.
1615+
On error, set an exception, leave the writer unchanged, and return ``-1``.
1616+
1617+
See also :c:func:`PyUnicodeWriter_WriteUTF8`.

Doc/whatsnew/3.14.rst

+2
Original file line numberDiff line numberDiff line change
@@ -291,10 +291,12 @@ New Features
291291
* :c:func:`PyUnicodeWriter_Finish`.
292292
* :c:func:`PyUnicodeWriter_WriteChar`.
293293
* :c:func:`PyUnicodeWriter_WriteUTF8`.
294+
* :c:func:`PyUnicodeWriter_WriteWideChar`.
294295
* :c:func:`PyUnicodeWriter_WriteStr`.
295296
* :c:func:`PyUnicodeWriter_WriteRepr`.
296297
* :c:func:`PyUnicodeWriter_WriteSubstring`.
297298
* :c:func:`PyUnicodeWriter_Format`.
299+
* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
298300

299301
(Contributed by Victor Stinner in :gh:`119182`.)
300302

Include/cpython/unicodeobject.h

+10
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
459459
PyUnicodeWriter *writer,
460460
const char *str,
461461
Py_ssize_t size);
462+
/* Write the wide string `str` into `writer`.
   `size` is the number of wide characters; if it is -1, the length is
   obtained with wcslen(str).
   Return 0 on success. On error, set an exception, leave the writer
   unchanged, and return -1. */
PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
463+
PyUnicodeWriter *writer,
464+
const wchar_t *str,
465+
Py_ssize_t size);
462466

463467
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
464468
PyUnicodeWriter *writer,
@@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format(
475479
PyUnicodeWriter *writer,
476480
const char *format,
477481
...);
482+
/* Decode `string` from UTF-8 with the `errors` error handler and write the
   output into `writer`.
   `length` is the size of `string` in bytes; if it is -1, strlen(string) is
   used. If `errors` is NULL, the strict error handler is used.
   If `consumed` is not NULL, *consumed is set to the number of decoded bytes
   on success. If `consumed` is NULL, a trailing incomplete UTF-8 byte
   sequence is treated as an error.
   Return 0 on success. On error, set an exception, leave the writer
   unchanged, and return -1. */
PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
483+
PyUnicodeWriter *writer,
484+
const char *string, /* UTF-8 encoded string */
485+
Py_ssize_t length, /* size of string */
486+
const char *errors, /* error handling */
487+
Py_ssize_t *consumed); /* bytes consumed */
478488

479489

480490
/* --- Private _PyUnicodeWriter API --------------------------------------- */

Modules/_testcapi/unicode.c

+152
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,119 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args))
374374
}
375375

376376

377+
static PyObject *
test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
{
    // Exercise PyUnicodeWriter_DecodeUTF8Stateful() with consumed=NULL and
    // the "ignore" / "replace" error handlers.
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (!writer) {
        return NULL;
    }

    // The writes are chained with ||: evaluation stops at the first call
    // that fails, so later writes are never attempted after an error.
    int failed =
        // "ignore": the invalid byte 0xFF is dropped from the output
        PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1, "ignore", NULL) < 0
        || PyUnicodeWriter_WriteChar(writer, '-') < 0
        // "replace": the invalid byte 0xFF becomes U+FFFD
        || PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0
        || PyUnicodeWriter_WriteChar(writer, '-') < 0
        // incomplete trailing UTF-8 sequence: with consumed=NULL it goes
        // through the error handler too, and is replaced by U+FFFD
        || PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "replace", NULL) < 0;
    if (failed) {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *decoded = PyUnicodeWriter_Finish(writer);
    if (!decoded) {
        return NULL;
    }
    // "\xef\xbf\xbd" is U+FFFD (REPLACEMENT CHARACTER) encoded as UTF-8.
    assert(PyUnicode_EqualToUTF8(decoded,
                                 "ignore-replace\xef\xbf\xbd"
                                 "-incomplete\xef\xbf\xbd"));
    Py_DECREF(decoded);

    Py_RETURN_NONE;
}
418+
419+
420+
static PyObject *
test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
{
    // Test PyUnicodeWriter_DecodeUTF8Stateful() with a non-NULL `consumed`
    // argument. Before every call `consumed` is reset to a sentinel (12345)
    // so that the following assert proves the function really wrote to it.
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }
    Py_ssize_t consumed;
    int rc;

    // valid string: every byte is consumed
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 4);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // non-ASCII: `consumed` counts bytes (6), not decoded characters
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 6);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // consumed is 0 if write fails
    // The call is made outside assert(): an expression inside assert() is
    // not evaluated at all when NDEBUG is defined, which would silently
    // skip the call under test.
    consumed = 12345;
    rc = PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed);
    assert(rc < 0);
    (void)rc;  // avoid an unused-variable warning when assert() is a no-op
    PyErr_Clear();
    assert(consumed == 0);

    // ignore error handler: the invalid byte is skipped but still counted
    // as consumed (4 valid bytes + 1 ignored byte)
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    assert(consumed == 5);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // incomplete trailing UTF-8 sequence: with a non-NULL `consumed` the
    // trailing byte 0xC3 is left unconsumed instead of being an error
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    assert(consumed == 10);

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (result == NULL) {
        return NULL;
    }
    // The failed "invalid\xFF" write must have left the writer unchanged.
    assert(PyUnicode_EqualToUTF8(result,
                                 "text-\xC3\xA9-\xE2\x82\xAC-"
                                 "more-incomplete"));
    Py_DECREF(result);

    Py_RETURN_NONE;

error:
    PyUnicodeWriter_Discard(writer);
    return NULL;
}
488+
489+
377490
static PyObject *
378491
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
379492
{
@@ -436,6 +549,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args
436549
}
437550

438551

552+
static PyObject *
test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
{
    // Exercise PyUnicodeWriter_WriteWideChar() with explicit sizes and with
    // size=-1.
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (!writer) {
        return NULL;
    }

    // The writes are chained with ||: evaluation stops at the first call
    // that fails, so later writes are never attempted after an error.
    int failed =
        // explicit size 8: only "latin1=\xE9" is written, " IGNORED" is not
        PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0
        // size -1: the whole wide string is written
        || PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0
        || PyUnicodeWriter_WriteChar(writer, '.') < 0;
    if (failed) {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *text = PyUnicodeWriter_Finish(writer);
    if (!text) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(text,
                                 "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
    Py_DECREF(text);

    Py_RETURN_NONE;
}
586+
587+
439588
static PyMethodDef TestMethods[] = {
440589
{"unicode_new", unicode_new, METH_VARARGS},
441590
{"unicode_fill", unicode_fill, METH_VARARGS},
@@ -448,8 +597,11 @@ static PyMethodDef TestMethods[] = {
448597
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
449598
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
450599
{"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
600+
{"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
601+
{"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
451602
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
452603
{"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
604+
{"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
453605
{NULL},
454606
};
455607

0 commit comments

Comments
 (0)