Skip to content

Commit eb50cd3

Browse files
gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() functions (GH-110297)
1 parent d1f7fae commit eb50cd3

File tree

11 files changed

+280
-0
lines changed

11 files changed

+280
-0
lines changed

Doc/c-api/unicode.rst

+22
Original file line numberDiff line numberDiff line change
@@ -1396,6 +1396,28 @@ They all return ``NULL`` or ``-1`` if an exception occurs.
13961396
:c:func:`PyErr_Occurred` to check for errors.
13971397
13981398
1399+
.. c:function:: int PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *string, Py_ssize_t size)
1400+
1401+
Compare a Unicode object with a char buffer which is interpreted as
1402+
being UTF-8 or ASCII encoded and return true (``1``) if they are equal,
1403+
or false (``0``) otherwise.
1404+
If the Unicode object contains surrogate characters or
1405+
the C string is not valid UTF-8, false (``0``) is returned.
1406+
1407+
This function does not raise exceptions.
1408+
1409+
.. versionadded:: 3.13
1410+
1411+
1412+
.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string)
1413+
1414+
Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute *string*
1415+
length using :c:func:`!strlen`.
1416+
If the Unicode object contains null characters, false (``0``) is returned,
because *string* is treated as ending at its first null byte.
1417+
1418+
.. versionadded:: 3.13
1419+
1420+
13991421
.. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string)
14001422
14011423
Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less

Doc/data/stable_abi.dat

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Doc/whatsnew/3.13.rst

+6
Original file line numberDiff line numberDiff line change
@@ -1024,6 +1024,12 @@ New Features
10241024
functions on Python 3.11 and 3.12.
10251025
(Contributed by Victor Stinner in :gh:`107073`.)
10261026

1027+
* Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8`
1028+
functions: compare a Unicode object with a :c:expr:`const char*` UTF-8 encoded
1029+
string and return true (``1``) if they are equal, or false (``0``) otherwise.
1030+
These functions do not raise exceptions.
1031+
(Contributed by Serhiy Storchaka in :gh:`110289`.)
1032+
10271033
* Add :c:func:`PyThreadState_GetUnchecked()` function: similar to
10281034
:c:func:`PyThreadState_Get()`, but don't kill the process with a fatal error
10291035
if it is NULL. The caller is responsible to check if the result is NULL.

Include/unicodeobject.h

+9
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,15 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
957957
const char *right /* ASCII-encoded string */
958958
);
959959

960+
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000
961+
/* Compare a Unicode object with a UTF-8 encoded C string.
962+
Return 1 if they are equal, or 0 otherwise.
963+
These functions do not raise exceptions.
PyUnicode_EqualToUTF8() takes a null-terminated string, while
PyUnicode_EqualToUTF8AndSize() takes the string length in bytes. */
964+
965+
PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *);
966+
PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t);
967+
#endif
968+
960969
/* Rich compare two strings and return one of the following:
961970
962971
- NULL in case an exception was raised

Lib/test/test_capi/test_unicode.py

+112
Original file line numberDiff line numberDiff line change
@@ -1297,6 +1297,118 @@ def test_comparewithasciistring(self):
12971297
# CRASHES comparewithasciistring([], b'abc')
12981298
# CRASHES comparewithasciistring(NULL, b'abc')
12991299

1300+
@support.cpython_only
1301+
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
1302+
def test_equaltoutf8(self):
1303+
# Test PyUnicode_EqualToUTF8()
# The bytes argument is handed to the C function as a null-terminated
# C string, so everything after the first null byte is ignored.
1304+
from _testcapi import unicode_equaltoutf8 as equaltoutf8
1305+
from _testcapi import unicode_asutf8andsize as asutf8andsize
1306+
1307+
# Samples whose characters need 1, 2, 3 and 4 bytes in UTF-8,
# up to the highest code point U+10FFFF.
strings = [
1308+
'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
1309+
'\U0001f600\U0001f601\U0001f602',
1310+
'\U0010ffff',
1311+
]
1312+
for s in strings:
1313+
# Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
1314+
# encoded string cached in the Unicode object.
1315+
asutf8andsize(s, 0)
1316+
b = s.encode()
1317+
self.assertEqual(equaltoutf8(s, b), 1) # Use the UTF-8 cache.
1318+
s2 = b.decode() # New Unicode object without the UTF-8 cache.
1319+
self.assertEqual(equaltoutf8(s2, b), 1)
1320+
self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1)
1321+
self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0)
1322+
# A trailing null byte in the buffer is not part of the C string.
self.assertEqual(equaltoutf8(s, b + b'\0'), 1)
1323+
self.assertEqual(equaltoutf8(s2, b + b'\0'), 1)
1324+
# A null character in the Unicode object can never be matched.
self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0)
1325+
self.assertEqual(equaltoutf8(s + '\0', b), 0)
1326+
# Buffer longer than, shorter than, or differing in its last byte
# from the UTF-8 encoding of the string.
self.assertEqual(equaltoutf8(s2, b + b'x'), 0)
1327+
self.assertEqual(equaltoutf8(s2, b[:-1]), 0)
1328+
self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0)
1329+
1330+
# Empty string: b'\0' is an empty C string as well.
self.assertEqual(equaltoutf8('', b''), 1)
1331+
self.assertEqual(equaltoutf8('', b'\0'), 1)
1332+
1333+
# embedded null chars/bytes
1334+
self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1)
1335+
self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0)
1336+
self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0)
1337+
1338+
# Surrogate characters are always treated as not equal
1339+
self.assertEqual(equaltoutf8('\udcfe',
1340+
'\udcfe'.encode("utf8", "surrogateescape")), 0)
1341+
self.assertEqual(equaltoutf8('\udcfe',
1342+
'\udcfe'.encode("utf8", "surrogatepass")), 0)
1343+
self.assertEqual(equaltoutf8('\ud801',
1344+
'\ud801'.encode("utf8", "surrogatepass")), 0)
1345+
1346+
@support.cpython_only
1347+
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
1348+
def test_equaltoutf8andsize(self):
1349+
# Test PyUnicode_EqualToUTF8AndSize()
# When the optional third argument (size) is omitted, the test wrapper
# passes the full length of the bytes object, so null bytes count.
1350+
from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize
1351+
from _testcapi import unicode_asutf8andsize as asutf8andsize
1352+
1353+
# Samples whose characters need 1, 2, 3 and 4 bytes in UTF-8,
# up to the highest code point U+10FFFF.
strings = [
1354+
'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
1355+
'\U0001f600\U0001f601\U0001f602',
1356+
'\U0010ffff',
1357+
]
1358+
for s in strings:
1359+
# Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
1360+
# encoded string cached in the Unicode object.
1361+
asutf8andsize(s, 0)
1362+
b = s.encode()
1363+
self.assertEqual(equaltoutf8andsize(s, b), 1) # Use the UTF-8 cache.
1364+
s2 = b.decode() # New Unicode object without the UTF-8 cache.
1365+
self.assertEqual(equaltoutf8andsize(s2, b), 1)
1366+
self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1)
1367+
self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0)
1368+
# With an explicit size a trailing null byte is significant.
self.assertEqual(equaltoutf8andsize(s, b + b'\0'), 0)
1369+
self.assertEqual(equaltoutf8andsize(s2, b + b'\0'), 0)
1370+
self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1)
1371+
self.assertEqual(equaltoutf8andsize(s + '\0', b), 0)
1372+
# Buffer longer than, shorter than, or differing in its last byte
# from the UTF-8 encoding of the string.
self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0)
1373+
self.assertEqual(equaltoutf8andsize(s2, b[:-1]), 0)
1374+
self.assertEqual(equaltoutf8andsize(s2, b[:-1] + b'x'), 0)
1375+
# Not null-terminated: only the first *size* bytes are compared.
1376+
self.assertEqual(equaltoutf8andsize(s, b + b'x', len(b)), 1)
1377+
self.assertEqual(equaltoutf8andsize(s2, b + b'x', len(b)), 1)
1378+
self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1)
1379+
self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0)
1380+
1381+
# Empty string: equal only to a buffer of size 0.
self.assertEqual(equaltoutf8andsize('', b''), 1)
1382+
self.assertEqual(equaltoutf8andsize('', b'\0'), 0)
1383+
self.assertEqual(equaltoutf8andsize('', b'x', 0), 1)
1384+
1385+
# embedded null chars/bytes
1386+
self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1)
1387+
self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1)
1388+
1389+
# Surrogate characters are always treated as not equal
1390+
self.assertEqual(equaltoutf8andsize('\udcfe',
1391+
'\udcfe'.encode("utf8", "surrogateescape")), 0)
1392+
self.assertEqual(equaltoutf8andsize('\udcfe',
1393+
'\udcfe'.encode("utf8", "surrogatepass")), 0)
1394+
self.assertEqual(equaltoutf8andsize('\ud801',
1395+
'\ud801'.encode("utf8", "surrogatepass")), 0)
1396+
1397+
# Helper: *text* encoded with *encoding* must compare unequal, and the
# second assertion guards that those bytes really differ from UTF-8.
def check_not_equal_encoding(text, encoding):
1398+
self.assertEqual(equaltoutf8andsize(text, text.encode(encoding)), 0)
1399+
self.assertNotEqual(text.encode(encoding), text.encode("utf8"))
1400+
1401+
# A string encoded with a different encoding is not equal to the
# string itself, because the bytes differ from its UTF-8 encoding.
1402+
check_not_equal_encoding('Stéphane', 'latin1')
1403+
check_not_equal_encoding('Stéphane', 'utf-16-le') # embedded null characters
1404+
check_not_equal_encoding('北京市', 'gbk')
1405+
1406+
# CRASHES equaltoutf8andsize('abc', b'abc', -1)
1407+
# CRASHES equaltoutf8andsize(b'abc', b'abc')
1408+
# CRASHES equaltoutf8andsize([], b'abc')
1409+
# CRASHES equaltoutf8andsize(NULL, b'abc')
1410+
# CRASHES equaltoutf8andsize('abc', NULL)
1411+
13001412
@support.cpython_only
13011413
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
13021414
def test_richcompare(self):

Lib/test/test_stable_abi_ctypes.py

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` functions.

Misc/stable_abi.toml

+4
Original file line numberDiff line numberDiff line change
@@ -2462,3 +2462,7 @@
24622462
added = '3.13'
24632463
[function.Py_IsFinalizing]
24642464
added = '3.13'
2465+
[function.PyUnicode_EqualToUTF8]
2466+
added = '3.13'
2467+
[function.PyUnicode_EqualToUTF8AndSize]
2468+
added = '3.13'

Modules/_testcapi/unicode.c

+44
Original file line numberDiff line numberDiff line change
@@ -1429,6 +1429,48 @@ unicode_comparewithasciistring(PyObject *self, PyObject *args)
14291429
return PyLong_FromLong(result);
14301430
}
14311431

1432+
/* Test PyUnicode_EqualToUTF8().
   Python-level arguments: (left, right).  Returns the C result as an int. */
1433+
static PyObject *
1434+
unicode_equaltoutf8(PyObject *self, PyObject *args)
1435+
{
1436+
PyObject *left;
1437+
const char *right = NULL;
1438+
/* Filled in by the "z#" format but not forwarded: the function under
   test takes only the pointer and finds the end of the string itself. */
Py_ssize_t right_len;
1439+
int result;
1440+
1441+
if (!PyArg_ParseTuple(args, "Oz#", &left, &right, &right_len)) {
1442+
return NULL;
1443+
}
1444+
1445+
/* NULLABLE() is defined elsewhere; presumably it maps a sentinel argument
   to a NULL pointer -- TODO confirm against its definition. */
NULLABLE(left);
1446+
result = PyUnicode_EqualToUTF8(left, right);
1447+
/* The function under test is documented as never raising. */
assert(!PyErr_Occurred());
1448+
return PyLong_FromLong(result);
1449+
}
1450+
1451+
/* Test PyUnicode_EqualToUTF8AndSize().
   Python-level arguments: (left, right[, size]).  When size is omitted,
   the length of right reported by the "z#" format is used instead.
   Returns the C result as an int. */
1452+
static PyObject *
1453+
unicode_equaltoutf8andsize(PyObject *self, PyObject *args)
1454+
{
1455+
PyObject *left;
1456+
const char *right = NULL;
1457+
Py_ssize_t right_len;
1458+
/* -100 is a sentinel meaning "size argument not supplied". */
Py_ssize_t size = -100;
1459+
int result;
1460+
1461+
if (!PyArg_ParseTuple(args, "Oz#|n", &left, &right, &right_len, &size)) {
1462+
return NULL;
1463+
}
1464+
1465+
/* NULLABLE() is defined elsewhere; presumably it maps a sentinel argument
   to a NULL pointer -- TODO confirm against its definition. */
NULLABLE(left);
1466+
/* Default the size to the whole buffer when the caller left it out. */
if (size == -100) {
1467+
size = right_len;
1468+
}
1469+
result = PyUnicode_EqualToUTF8AndSize(left, right, size);
1470+
/* The function under test is documented as never raising. */
assert(!PyErr_Occurred());
1471+
return PyLong_FromLong(result);
1472+
}
1473+
14321474
/* Test PyUnicode_RichCompare() */
14331475
static PyObject *
14341476
unicode_richcompare(PyObject *self, PyObject *args)
@@ -2044,6 +2086,8 @@ static PyMethodDef TestMethods[] = {
20442086
{"unicode_replace", unicode_replace, METH_VARARGS},
20452087
{"unicode_compare", unicode_compare, METH_VARARGS},
20462088
{"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS},
2089+
{"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS},
2090+
{"unicode_equaltoutf8andsize",unicode_equaltoutf8andsize, METH_VARARGS},
20472091
{"unicode_richcompare", unicode_richcompare, METH_VARARGS},
20482092
{"unicode_format", unicode_format, METH_VARARGS},
20492093
{"unicode_contains", unicode_contains, METH_VARARGS},

Objects/unicodeobject.c

+76
Original file line numberDiff line numberDiff line change
@@ -10673,6 +10673,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
1067310673
}
1067410674
}
1067510675

10676+
/* Compare a Unicode object with a null-terminated UTF-8 encoded C string.
   The string length is taken with strlen(), so str must not be NULL.
   Return 1 if they are equal, or 0 otherwise. */
int
10677+
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
10678+
{
10679+
return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
10680+
}
10681+
10682+
/* Compare a Unicode object with the first size bytes of str, interpreted
   as UTF-8.  Return 1 if the bytes are exactly the UTF-8 encoding of the
   Unicode object, or 0 otherwise.  A Unicode object containing a surrogate
   code point never compares equal.  No temporary encoded copy is made. */
int
10683+
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
10684+
{
10685+
assert(_PyUnicode_CHECK(unicode));
10686+
assert(str);
10687+
10688+
/* Fast path: for an ASCII object the 1-byte data is compared with the
   buffer byte for byte. */
if (PyUnicode_IS_ASCII(unicode)) {
10689+
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
10690+
return size == len &&
10691+
memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
10692+
}
10693+
/* Fast path: a UTF-8 representation is already stored in the object,
   so compare against it directly. */
if (PyUnicode_UTF8(unicode) != NULL) {
10694+
Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
10695+
return size == len &&
10696+
memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
10697+
}
10698+
10699+
/* Cheap rejection on length alone.  Each code point consumes 1 to 4 bytes
   in the loop below, so size can be at most 4 * len.  The ASCII case was
   handled above, so a match here presumably needs at least one multi-byte
   sequence, i.e. size > len. */
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
10700+
if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
10701+
return 0;
10702+
}
10703+
/* s walks the buffer; ends is one past its last byte. */
const unsigned char *s = (const unsigned char *)str;
10704+
const unsigned char *ends = s + (size_t)size;
10705+
int kind = PyUnicode_KIND(unicode);
10706+
const void *data = PyUnicode_DATA(unicode);
10707+
/* Compare Unicode string and UTF-8 string: for each code point, check
   that the next bytes of the buffer are exactly its UTF-8 sequence. */
10708+
for (Py_ssize_t i = 0; i < len; i++) {
10709+
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10710+
/* 1-byte sequence */
if (ch < 0x80) {
10711+
if (ends == s || s[0] != ch) {
10712+
return 0;
10713+
}
10714+
s += 1;
10715+
}
10716+
/* 2-byte sequence */
else if (ch < 0x800) {
10717+
if ((ends - s) < 2 ||
10718+
s[0] != (0xc0 | (ch >> 6)) ||
10719+
s[1] != (0x80 | (ch & 0x3f)))
10720+
{
10721+
return 0;
10722+
}
10723+
s += 2;
10724+
}
10725+
/* 3-byte sequence; surrogate code points are rejected outright */
else if (ch < 0x10000) {
10726+
if (Py_UNICODE_IS_SURROGATE(ch) ||
10727+
(ends - s) < 3 ||
10728+
s[0] != (0xe0 | (ch >> 12)) ||
10729+
s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
10730+
s[2] != (0x80 | (ch & 0x3f)))
10731+
{
10732+
return 0;
10733+
}
10734+
s += 3;
10735+
}
10736+
/* 4-byte sequence */
else {
10737+
assert(ch <= MAX_UNICODE);
10738+
if ((ends - s) < 4 ||
10739+
s[0] != (0xf0 | (ch >> 18)) ||
10740+
s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
10741+
s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
10742+
s[3] != (0x80 | (ch & 0x3f)))
10743+
{
10744+
return 0;
10745+
}
10746+
s += 4;
10747+
}
10748+
}
10749+
/* Equal only if the whole buffer was consumed as well. */
return s == ends;
10750+
}
10751+
1067610752
int
1067710753
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
1067810754
{

PC/python3dll.c

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)