Skip to content

Commit 5650e76

Browse files
bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows. (GH-20053)
1 parent 7c6e970 commit 5650e76

File tree

3 files changed

+31
-4
lines changed

3 files changed

+31
-4
lines changed

Lib/test/test_unicode.py

+7
Original file line number | Diff line number | Diff line change
@@ -720,6 +720,13 @@ def test_isidentifier(self):
720720
self.assertFalse("©".isidentifier())
721721
self.assertFalse("0".isidentifier())
722722

723+
@support.cpython_only
724+
def test_isidentifier_legacy(self):
725+
import _testcapi
726+
u = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
727+
self.assertTrue(u.isidentifier())
728+
self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
729+
723730
def test_isprintable(self):
724731
self.assertTrue("".isprintable())
725732
self.assertTrue(" ".isprintable())
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Fixed :meth:`str.isidentifier` for non-canonicalized strings containing
2+
non-BMP characters on Windows.

Objects/unicodeobject.c

+22 -4
Original file line number | Diff line number | Diff line change
@@ -12356,20 +12356,38 @@ PyUnicode_IsIdentifier(PyObject *self)
1235612356
return len && i == len;
1235712357
}
1235812358
else {
12359-
Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
12359+
Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
1236012360
if (len == 0) {
1236112361
/* an empty string is not a valid identifier */
1236212362
return 0;
1236312363
}
1236412364

1236512365
const wchar_t *wstr = _PyUnicode_WSTR(self);
12366-
Py_UCS4 ch = wstr[0];
12366+
Py_UCS4 ch = wstr[i++];
12367+
#if SIZEOF_WCHAR_T == 2
12368+
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12369+
&& i < len
12370+
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12371+
{
12372+
ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12373+
i++;
12374+
}
12375+
#endif
1236712376
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
1236812377
return 0;
1236912378
}
1237012379

12371-
for (i = 1; i < len; i++) {
12372-
ch = wstr[i];
12380+
while (i < len) {
12381+
ch = wstr[i++];
12382+
#if SIZEOF_WCHAR_T == 2
12383+
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12384+
&& i < len
12385+
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12386+
{
12387+
ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12388+
i++;
12389+
}
12390+
#endif
1237312391
if (!_PyUnicode_IsXidContinue(ch)) {
1237412392
return 0;
1237512393
}

0 commit comments

Comments
 (0)