Skip to content

Commit ccab67b

Browse files
authored
gh-97982: Factorize PyUnicode_Count() and unicode_count() code (python/cpython#98025)
Add unicode_count_impl() to hold the code shared by PyUnicode_Count() and unicode_count().
1 parent e9569ec commit ccab67b

File tree

2 files changed

+36
-60
lines changed

2 files changed

+36
-60
lines changed

Lib/test/test_unicode.py

+10
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,10 @@ def test_count(self):
241241
self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
242242
self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
243243
self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
244+
# test subclass
245+
class MyStr(str):
246+
pass
247+
self.checkequal(3, MyStr('aaa'), 'count', 'a')
244248

245249
def test_find(self):
246250
string_tests.CommonTest.test_find(self)
@@ -3002,6 +3006,12 @@ def test_count(self):
30023006
self.assertEqual(unicode_count(uni, ch, 0, len(uni)), 1)
30033007
self.assertEqual(unicode_count(st, ch, 0, len(st)), 0)
30043008

3009+
# subclasses should still work
3010+
class MyStr(str):
3011+
pass
3012+
3013+
self.assertEqual(unicode_count(MyStr('aab'), 'a', 0, 3), 2)
3014+
30053015
# Test PyUnicode_FindChar()
30063016
@support.cpython_only
30073017
@unittest.skipIf(_testcapi is None, 'need _testcapi module')

Objects/unicodeobject.c

+26-60
Original file line numberDiff line numberDiff line change
@@ -8964,21 +8964,20 @@ _PyUnicode_InsertThousandsGrouping(
89648964
return count;
89658965
}
89668966

8967-
8968-
Py_ssize_t
8969-
PyUnicode_Count(PyObject *str,
8970-
PyObject *substr,
8971-
Py_ssize_t start,
8972-
Py_ssize_t end)
8967+
static Py_ssize_t
8968+
unicode_count_impl(PyObject *str,
8969+
PyObject *substr,
8970+
Py_ssize_t start,
8971+
Py_ssize_t end)
89738972
{
8973+
assert(PyUnicode_Check(str));
8974+
assert(PyUnicode_Check(substr));
8975+
89748976
Py_ssize_t result;
89758977
int kind1, kind2;
89768978
const void *buf1 = NULL, *buf2 = NULL;
89778979
Py_ssize_t len1, len2;
89788980

8979-
if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
8980-
return -1;
8981-
89828981
kind1 = PyUnicode_KIND(str);
89838982
kind2 = PyUnicode_KIND(substr);
89848983
if (kind1 < kind2)
@@ -8998,6 +8997,7 @@ PyUnicode_Count(PyObject *str,
89988997
goto onError;
89998998
}
90008999

9000+
// We don't reuse `anylib_count` here because of the explicit casts.
90019001
switch (kind1) {
90029002
case PyUnicode_1BYTE_KIND:
90039003
result = ucs1lib_count(
@@ -9033,6 +9033,18 @@ PyUnicode_Count(PyObject *str,
90339033
return -1;
90349034
}
90359035

9036+
Py_ssize_t
9037+
PyUnicode_Count(PyObject *str,
9038+
PyObject *substr,
9039+
Py_ssize_t start,
9040+
Py_ssize_t end)
9041+
{
9042+
if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9043+
return -1;
9044+
9045+
return unicode_count_impl(str, substr, start, end);
9046+
}
9047+
90369048
Py_ssize_t
90379049
PyUnicode_Find(PyObject *str,
90389050
PyObject *substr,
@@ -10848,62 +10860,16 @@ unicode_count(PyObject *self, PyObject *args)
1084810860
PyObject *substring = NULL; /* initialize to fix a compiler warning */
1084910861
Py_ssize_t start = 0;
1085010862
Py_ssize_t end = PY_SSIZE_T_MAX;
10851-
PyObject *result;
10852-
int kind1, kind2;
10853-
const void *buf1, *buf2;
10854-
Py_ssize_t len1, len2, iresult;
10863+
Py_ssize_t result;
1085510864

1085610865
if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
1085710866
return NULL;
1085810867

10859-
kind1 = PyUnicode_KIND(self);
10860-
kind2 = PyUnicode_KIND(substring);
10861-
if (kind1 < kind2)
10862-
return PyLong_FromLong(0);
10863-
10864-
len1 = PyUnicode_GET_LENGTH(self);
10865-
len2 = PyUnicode_GET_LENGTH(substring);
10866-
ADJUST_INDICES(start, end, len1);
10867-
if (end - start < len2)
10868-
return PyLong_FromLong(0);
10869-
10870-
buf1 = PyUnicode_DATA(self);
10871-
buf2 = PyUnicode_DATA(substring);
10872-
if (kind2 != kind1) {
10873-
buf2 = unicode_askind(kind2, buf2, len2, kind1);
10874-
if (!buf2)
10875-
return NULL;
10876-
}
10877-
switch (kind1) {
10878-
case PyUnicode_1BYTE_KIND:
10879-
iresult = ucs1lib_count(
10880-
((const Py_UCS1*)buf1) + start, end - start,
10881-
buf2, len2, PY_SSIZE_T_MAX
10882-
);
10883-
break;
10884-
case PyUnicode_2BYTE_KIND:
10885-
iresult = ucs2lib_count(
10886-
((const Py_UCS2*)buf1) + start, end - start,
10887-
buf2, len2, PY_SSIZE_T_MAX
10888-
);
10889-
break;
10890-
case PyUnicode_4BYTE_KIND:
10891-
iresult = ucs4lib_count(
10892-
((const Py_UCS4*)buf1) + start, end - start,
10893-
buf2, len2, PY_SSIZE_T_MAX
10894-
);
10895-
break;
10896-
default:
10897-
Py_UNREACHABLE();
10898-
}
10899-
10900-
result = PyLong_FromSsize_t(iresult);
10901-
10902-
assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
10903-
if (kind2 != kind1)
10904-
PyMem_Free((void *)buf2);
10868+
result = unicode_count_impl(self, substring, start, end);
10869+
if (result == -1)
10870+
return NULL;
1090510871

10906-
return result;
10872+
return PyLong_FromSsize_t(result);
1090710873
}
1090810874

1090910875
/*[clinic input]

0 commit comments

Comments
 (0)