Skip to content

Commit 04efd0d

Browse files
methaneDinoV
authored and committed
bpo-37348: optimize decoding ASCII string (pythonGH-14283)
`_PyUnicodeWriter` is a relatively complex structure. Initializing it adds significant overhead when decoding a short ASCII string.
1 parent a01a26d commit 04efd0d

File tree

2 files changed

+53
-34
lines changed

2 files changed

+53
-34
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Optimized decoding of short ASCII strings with the UTF-8 and ASCII codecs.
2+
``b"foo".decode()`` is about 15% faster. Patch by Inada Naoki.

Objects/unicodeobject.c

Lines changed: 51 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
265265
/* Forward declaration */
266266
static inline int
267267
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
268+
static inline void
269+
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
268270
static PyObject *
269271
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
270272
const char *errors);
@@ -4877,16 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48774879
_Py_error_handler error_handler, const char *errors,
48784880
Py_ssize_t *consumed)
48794881
{
4880-
_PyUnicodeWriter writer;
4881-
const char *starts = s;
4882-
const char *end = s + size;
4883-
4884-
Py_ssize_t startinpos;
4885-
Py_ssize_t endinpos;
4886-
const char *errmsg = "";
4887-
PyObject *error_handler_obj = NULL;
4888-
PyObject *exc = NULL;
4889-
48904882
if (size == 0) {
48914883
if (consumed)
48924884
*consumed = 0;
@@ -4900,13 +4892,29 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
49004892
return get_latin1_char((unsigned char)s[0]);
49014893
}
49024894

4903-
_PyUnicodeWriter_Init(&writer);
4904-
writer.min_length = size;
4905-
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4906-
goto onError;
4895+
const char *starts = s;
4896+
const char *end = s + size;
4897+
4898+
// fast path: try ASCII string.
4899+
PyObject *u = PyUnicode_New(size, 127);
4900+
if (u == NULL) {
4901+
return NULL;
4902+
}
4903+
s += ascii_decode(s, end, PyUnicode_DATA(u));
4904+
if (s == end) {
4905+
return u;
4906+
}
4907+
4908+
// Fall back to _PyUnicodeWriter when the fast path fails.
4909+
_PyUnicodeWriter writer;
4910+
_PyUnicodeWriter_InitWithBuffer(&writer, u);
4911+
writer.pos = s - starts;
4912+
4913+
Py_ssize_t startinpos, endinpos;
4914+
const char *errmsg = "";
4915+
PyObject *error_handler_obj = NULL;
4916+
PyObject *exc = NULL;
49074917

4908-
writer.pos = ascii_decode(s, end, writer.data);
4909-
s += writer.pos;
49104918
while (s < end) {
49114919
Py_UCS4 ch;
49124920
int kind = writer.kind;
@@ -6451,7 +6459,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
64516459
length after conversion to the true value. (But decoding error
64526460
handler might have to resize the string) */
64536461
_PyUnicodeWriter_Init(&writer);
6454-
writer.min_length = size;
6462+
writer.min_length = size;
64556463
if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
64566464
goto onError;
64576465
}
@@ -6975,13 +6983,7 @@ PyUnicode_DecodeASCII(const char *s,
69756983
const char *errors)
69766984
{
69776985
const char *starts = s;
6978-
_PyUnicodeWriter writer;
6979-
int kind;
6980-
void *data;
6981-
Py_ssize_t startinpos;
6982-
Py_ssize_t endinpos;
6983-
Py_ssize_t outpos;
6984-
const char *e;
6986+
const char *e = s + size;
69856987
PyObject *error_handler_obj = NULL;
69866988
PyObject *exc = NULL;
69876989
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
@@ -6993,20 +6995,25 @@ PyUnicode_DecodeASCII(const char *s,
69936995
if (size == 1 && (unsigned char)s[0] < 128)
69946996
return get_latin1_char((unsigned char)s[0]);
69956997

6996-
_PyUnicodeWriter_Init(&writer);
6997-
writer.min_length = size;
6998-
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6998+
// Shortcut for simple case
6999+
PyObject *u = PyUnicode_New(size, 127);
7000+
if (u == NULL) {
69997001
return NULL;
7002+
}
7003+
Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7004+
if (outpos == size) {
7005+
return u;
7006+
}
70007007

7001-
e = s + size;
7002-
data = writer.data;
7003-
outpos = ascii_decode(s, e, (Py_UCS1 *)data);
7008+
_PyUnicodeWriter writer;
7009+
_PyUnicodeWriter_InitWithBuffer(&writer, u);
70047010
writer.pos = outpos;
7005-
if (writer.pos == size)
7006-
return _PyUnicodeWriter_Finish(&writer);
70077011

7008-
s += writer.pos;
7009-
kind = writer.kind;
7012+
s += outpos;
7013+
int kind = writer.kind;
7014+
void *data = writer.data;
7015+
Py_ssize_t startinpos, endinpos;
7016+
70107017
while (s < e) {
70117018
unsigned char c = (unsigned char)*s;
70127019
if (c < 128) {
@@ -13506,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
1350613513
assert(writer->kind <= PyUnicode_1BYTE_KIND);
1350713514
}
1350813515

13516+
// Initialize a _PyUnicodeWriter around an already-allocated buffer object.
//
// The writer struct is zeroed, `buffer` is stored in writer->buffer, and
// _PyUnicodeWriter_Update() is called on the writer (its body is not shown
// here; presumably it derives the cached kind/data/size fields from the
// buffer -- confirm against its definition). min_length is then set to the
// resulting writer->size.
//
// writer->pos is left at 0 by the memset; both callers in this change
// (unicode_decode_utf8 and PyUnicode_DecodeASCII) assign writer.pos right
// after this call to account for characters already decoded into `buffer`.
//
// NOTE(review): looks like the writer takes over the caller's reference to
// `buffer` (no incref is done here) -- confirm against the writer's
// finish/dealloc paths.
13517+
static inline void
13518+
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13519+
{
13520+
memset(writer, 0, sizeof(*writer));
13521+
writer->buffer = buffer;
13522+
_PyUnicodeWriter_Update(writer);
13523+
writer->min_length = writer->size;
13524+
}
13525+
1350913526
int
1351013527
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
1351113528
Py_ssize_t length, Py_UCS4 maxchar)

0 commit comments

Comments
 (0)