Skip to content

Commit f693f84

Browse files
picnixz and encukou authored
pythongh-129173: simplify PyCodec_XMLCharRefReplaceErrors logic (python#129894)
Writing the decimal representation of a Unicode codepoint only requires knowing the number of digits. --------- Co-authored-by: Petr Viktorin <[email protected]>
1 parent efbc592 commit f693f84

File tree

1 file changed

+39
-60
lines changed

1 file changed

+39
-60
lines changed

Python/codecs.c

Lines changed: 39 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -730,6 +730,25 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
730730
}
731731

732732

733+
/*
734+
* Determine the number of digits for a decimal representation of Unicode
735+
* codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
736+
*/
737+
static inline int
738+
n_decimal_digits_for_codepoint(Py_UCS4 ch)
739+
{
740+
if (ch < 10) return 1;
741+
if (ch < 100) return 2;
742+
if (ch < 1000) return 3;
743+
if (ch < 10000) return 4;
744+
if (ch < 100000) return 5;
745+
if (ch < 1000000) return 6;
746+
if (ch < 10000000) return 7;
747+
// Unicode codepoints are limited to 1114111 (7 decimal digits)
748+
Py_UNREACHABLE();
749+
}
750+
751+
733752
/*
734753
* Create a Unicode string containing 'count' copies of the official
735754
* Unicode REPLACEMENT CHARACTER (0xFFFD).
@@ -867,9 +886,12 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
867886
}
868887
}
869888

889+
890+
// --- handler: 'xmlcharrefreplace' -------------------------------------------
891+
870892
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
871893
{
872-
if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
894+
if (!_PyIsUnicodeEncodeError(exc)) {
873895
wrong_exception_type(exc);
874896
return NULL;
875897
}
@@ -896,30 +918,11 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
896918

897919
Py_ssize_t ressize = 0;
898920
for (Py_ssize_t i = start; i < end; ++i) {
899-
/* object is guaranteed to be "ready" */
900921
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
901-
if (ch < 10) {
902-
ressize += 2 + 1 + 1;
903-
}
904-
else if (ch < 100) {
905-
ressize += 2 + 2 + 1;
906-
}
907-
else if (ch < 1000) {
908-
ressize += 2 + 3 + 1;
909-
}
910-
else if (ch < 10000) {
911-
ressize += 2 + 4 + 1;
912-
}
913-
else if (ch < 100000) {
914-
ressize += 2 + 5 + 1;
915-
}
916-
else if (ch < 1000000) {
917-
ressize += 2 + 6 + 1;
918-
}
919-
else {
920-
assert(ch < 10000000);
921-
ressize += 2 + 7 + 1;
922-
}
922+
int k = n_decimal_digits_for_codepoint(ch);
923+
assert(k != 0);
924+
assert(k <= 7);
925+
ressize += 2 + k + 1;
923926
}
924927

925928
/* allocate replacement */
@@ -931,45 +934,20 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
931934
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
932935
/* generate replacement */
933936
for (Py_ssize_t i = start; i < end; ++i) {
934-
int digits, base;
935937
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
936-
if (ch < 10) {
937-
digits = 1;
938-
base = 1;
939-
}
940-
else if (ch < 100) {
941-
digits = 2;
942-
base = 10;
943-
}
944-
else if (ch < 1000) {
945-
digits = 3;
946-
base = 100;
947-
}
948-
else if (ch < 10000) {
949-
digits = 4;
950-
base = 1000;
951-
}
952-
else if (ch < 100000) {
953-
digits = 5;
954-
base = 10000;
955-
}
956-
else if (ch < 1000000) {
957-
digits = 6;
958-
base = 100000;
959-
}
960-
else {
961-
assert(ch < 10000000);
962-
digits = 7;
963-
base = 1000000;
964-
}
938+
/*
939+
* Write the decimal representation of 'ch' to the buffer pointed to by 'outp'
940+
* using at most 7 characters prefixed by '&#' and suffixed by ';'.
941+
*/
965942
*outp++ = '&';
966943
*outp++ = '#';
967-
while (digits-- > 0) {
968-
assert(base >= 1);
969-
*outp++ = '0' + ch / base;
970-
ch %= base;
971-
base /= 10;
944+
Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
945+
for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
946+
*p_digit = '0' + (ch % 10);
947+
ch /= 10;
972948
}
949+
assert(ch == 0);
950+
outp = digit_end;
973951
*outp++ = ';';
974952
}
975953
assert(_PyUnicode_CheckConsistency(res, 1));
@@ -1517,7 +1495,8 @@ replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
15171495
}
15181496

15191497

1520-
static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1498+
static inline PyObject *
1499+
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
15211500
{
15221501
return PyCodec_XMLCharRefReplaceErrors(exc);
15231502
}

0 commit comments

Comments
 (0)