gh-120196: Faster ascii_decode and find_max_char implementations #120212
NEWS entry:

@@ -0,0 +1,2 @@
+Improve performance of ASCII decoding and maximum character checking
+by allowing vectorization by the compiler on suitable platforms.
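For readers new to this code, the core trick in both functions is testing a word's worth of bytes at once: a size_t-wide mask with 0x80 in every byte position ANDs to non-zero exactly when the chunk contains a non-ASCII byte. Below is a minimal standalone sketch of that idea; the mask constant and function names are illustrative, not CPython's (CPython sizes its ASCII_CHAR_MASK / UCS1_ASCII_CHAR_MASK to size_t).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative 64-bit mask: 0x80 repeated in every byte position. */
#define HIGH_BIT_MASK UINT64_C(0x8080808080808080)

/* Return 1 if any of the 8 bytes starting at p is non-ASCII (high bit set). */
static int chunk_has_non_ascii(const unsigned char *p)
{
    uint64_t chunk;
    memcpy(&chunk, p, sizeof chunk);  /* alignment-safe load */
    return (chunk & HIGH_BIT_MASK) != 0;
}

int main(void)
{
    const unsigned char ascii[8]     = "abcdefgh";
    const unsigned char non_ascii[8] = "abc\xE9" "defg";
    printf("%d\n", chunk_has_non_ascii(ascii));      /* prints 0 */
    printf("%d\n", chunk_has_non_ascii(non_ascii));  /* prints 1 */
    return 0;
}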
Objects/stringlib/find_max_char.h:

@@ -20,25 +20,53 @@
 Py_LOCAL_INLINE(Py_UCS4)
 STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
 {
-    const unsigned char *p = (const unsigned char *) begin;
+    const unsigned char *restrict p = (const unsigned char *) begin;
     const unsigned char *_end = (const unsigned char *)end;
-
-    while (p < _end) {
-        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
-            /* Help register allocation */
-            const unsigned char *_p = p;
-            while (_p + SIZEOF_SIZE_T <= _end) {
-                size_t value = *(const size_t *) _p;
-                if (value & UCS1_ASCII_CHAR_MASK)
-                    return 255;
-                _p += SIZEOF_SIZE_T;
-            }
-            p = _p;
-            if (p == _end)
-                break;
-        }
-        if (*p++ & 0x80)
-            return 255;
-    }
+    const unsigned char *size_t_end = _end - SIZEOF_SIZE_T;
+    const unsigned char *unrolled_end = _end - (4 * SIZEOF_SIZE_T - 1);
+    while (p < unrolled_end) {
+        /* Chunks of 4 size_t values allow for compiler optimizations using vectors */
+        const size_t *restrict _p = (const size_t *)p;
+        size_t value0;
+        size_t value1;
+        size_t value2;
+        size_t value3;
+        /* Will be optimized to simple loads for architectures that support
+           unaligned loads. */
+        memcpy(&value0, _p + 0, SIZEOF_SIZE_T);
+        memcpy(&value1, _p + 1, SIZEOF_SIZE_T);
+        memcpy(&value2, _p + 2, SIZEOF_SIZE_T);
+        memcpy(&value3, _p + 3, SIZEOF_SIZE_T);
+        size_t value = value0 | value1 | value2 | value3;
+        if (value & UCS1_ASCII_CHAR_MASK) {
+            return 255;
+        }
+        p += (4 * SIZEOF_SIZE_T);
+    }
+    size_t accumulator = 0;
+    while (p < size_t_end) {
+        size_t value;
+        memcpy(&value, p, SIZEOF_SIZE_T);
+        accumulator |= value;
+        p += SIZEOF_SIZE_T;
+    }
+    /* In the end there will be up to SIZEOF_SIZE_T leftover characters. It is
+       faster to do an unaligned load of a size_t integer at an offset from
+       the end rather than breaking it up into single bytes. However, if the
+       string is smaller than SIZEOF_SIZE_T this strategy is illegal. */
+    if (size_t_end >= (const unsigned char*)begin) {
+        size_t value;
+        memcpy(&value, size_t_end, SIZEOF_SIZE_T);
+        accumulator |= value;
+    } else {
+        /* Fallback for smaller than size_t strings. */
+        while (p < _end) {
+            accumulator |= *p;
+            p += 1;
+        }
+    }
+    if (accumulator & UCS1_ASCII_CHAR_MASK) {
+        return 255;
+    }
     return 127;
 }
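A note on the tail handling above: rather than finishing with a byte loop, the new code re-reads the final SIZEOF_SIZE_T bytes at an offset from the end; OR-ing some bytes twice is harmless because only the presence of high bits matters. Here is a standalone sketch of the same shape, with uint64_t standing in for size_t and illustrative names (not the PR's exact code):

#include <stdint.h>
#include <string.h>

#define MASK UINT64_C(0x8080808080808080)

/* Returns 127 if every byte in [begin, end) is ASCII, else 255. Mirrors the
   structure of the new UCS1 find_max_char: a word loop, then one overlapping
   load of the last word instead of a byte-by-byte tail. */
static unsigned find_max_char_sketch(const unsigned char *begin,
                                     const unsigned char *end)
{
    const unsigned char *p = begin;
    size_t n = (size_t)(end - begin);
    uint64_t acc = 0;

    if (n >= sizeof(uint64_t)) {
        /* Word loop: run while more than one full word remains. */
        while ((size_t)(end - p) > sizeof(uint64_t)) {
            uint64_t v;
            memcpy(&v, p, sizeof v);
            acc |= v;
            p += sizeof(uint64_t);
        }
        /* Overlapping load of the final 8 bytes; re-OR-ing bytes that were
           already accumulated cannot change the answer. */
        uint64_t v;
        memcpy(&v, end - sizeof(uint64_t), sizeof v);
        acc |= v;
    }
    else {
        /* Input shorter than one word: plain byte fallback, as in the PR. */
        while (p < end) {
            acc |= *p++;
        }
    }
    return (acc & MASK) ? 255 : 127;
}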
@@ -71,13 +99,15 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
     Py_UCS4 mask;
     Py_ssize_t n = end - begin;
     const STRINGLIB_CHAR *p = begin;
-    const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4);
+    const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 8);

Review comment: As in the other PR, maybe the 4/8 choice can be chosen at compile time depending on the architecture.

Reply: 99% of production work where performance matters will be done on ARM64 (with 16-byte NEON vectors) and x86-64 (with SSE2 vectors); other platforms will not be hurt by this decision. I think there is no reason to complicate the build with choices like this.

     Py_UCS4 max_char;

     max_char = MAX_CHAR_ASCII;
     mask = MASK_ASCII;
     while (p < unrolled_end) {
-        STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3];
+        /* Loading 8 values at once allows platforms that have 16-byte vectors
+           to do a vector load and vector bitwise OR. */
+        STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];

Review comment: And here, you would have some #if arch == ... to choose between 4 or 8 values.

Review comment: I feel that those vector loads could be macros, for clarity purposes. It would then be optimized by the compiler, but having them as macros might be helpful for future work.

Reply: I would prefer not to have a macro; the code is good as it is.

         if (bits & mask) {
             if (mask == mask_limit) {
                 /* Limit reached */
@@ -96,7 +126,7 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
             /* We check the new mask on the same chars in the next iteration */
             continue;
         }
-        p += 4;
+        p += 8;
     }
     while (p < end) {
         if (p[0] & mask) {
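The 8-element unroll hands the autovectorizer exactly one 16-byte vector of UCS2 values per iteration. Below is a simplified standalone sketch of the reduction, without the PR's early-exit and mask-escalation logic (uint16_t stands in for Py_UCS2; names are illustrative):

#include <stddef.h>
#include <stdint.h>

/* OR-reduce a UCS2-like buffer 8 elements at a time. At -O2/-O3, compilers
   for x86-64 (SSE2) and ARM64 (NEON) typically turn the 8-way OR into one
   16-byte vector load plus vector ORs per iteration. */
static uint16_t or_reduce_ucs2(const uint16_t *p, size_t n)
{
    const uint16_t *unrolled_end = p + (n - (n % 8));
    const uint16_t *end = p + n;
    uint16_t bits = 0;

    while (p < unrolled_end) {
        bits |= (uint16_t)(p[0] | p[1] | p[2] | p[3]
                         | p[4] | p[5] | p[6] | p[7]);
        p += 8;
    }
    while (p < end) {       /* scalar tail */
        bits |= *p++;
    }
    /* (bits & 0xFF80) == 0 would mean the whole buffer is ASCII. */
    return bits;
}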
Objects/unicodeobject.c:

@@ -4973,58 +4973,54 @@ static Py_ssize_t
 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
 {
     const char *p = start;
-
-#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
-    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)
-        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
-    {
-        /* Fast path, see in STRINGLIB(utf8_decode) for
-           an explanation. */
-        /* Help allocation */
-        const char *_p = p;
-        Py_UCS1 * q = dest;
-        while (_p + SIZEOF_SIZE_T <= end) {
-            size_t value = *(const size_t *) _p;
-            if (value & ASCII_CHAR_MASK)
-                break;
-            *((size_t *)q) = value;
-            _p += SIZEOF_SIZE_T;
-            q += SIZEOF_SIZE_T;
-        }
-        p = _p;
-        while (p < end) {
-            if ((unsigned char)*p & 0x80)
-                break;
-            *q++ = *p++;
-        }
-        return p - start;
-    }
-#endif
-    while (p < end) {
-        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
-           for an explanation. */
-        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
-            /* Help allocation */
-            const char *_p = p;
-            while (_p + SIZEOF_SIZE_T <= end) {
-                size_t value = *(const size_t *) _p;
-                if (value & ASCII_CHAR_MASK)
-                    break;
-                _p += SIZEOF_SIZE_T;
-            }
-            p = _p;
-            if (_p == end)
-                break;
-        }
-        if ((unsigned char)*p & 0x80)
-            break;
-        ++p;
-    }
-    memcpy(dest, start, p - start);
+    Py_UCS1 *q = dest;
+    const char *size_t_end = end - SIZEOF_SIZE_T;
+    const char *unrolled_end = end - (4 * SIZEOF_SIZE_T);

Review comment: IMO, we should prefer […]. (There is no C API WG formal guideline yet; this is a personal recommendation. If anyone wants a wider discussion, do that in Discourse.)

+
+    while (p < unrolled_end) {
+        const size_t *restrict _p = (const size_t *)p;
+        size_t *restrict _q = (size_t *)q;
+        size_t value0;
+        size_t value1;
+        size_t value2;
+        size_t value3;
+        /* Memcpy optimizes to unaligned loads on supporting platforms */
+        memcpy(&value0, _p + 0, SIZEOF_SIZE_T);
+        memcpy(&value1, _p + 1, SIZEOF_SIZE_T);
+        memcpy(&value2, _p + 2, SIZEOF_SIZE_T);
+        memcpy(&value3, _p + 3, SIZEOF_SIZE_T);
+        size_t value = value0 | value1 | value2 | value3;
+        if (value & ASCII_CHAR_MASK) {
+            break;
+        }
+        memcpy(_q + 0, &value0, SIZEOF_SIZE_T);
+        memcpy(_q + 1, &value1, SIZEOF_SIZE_T);
+        memcpy(_q + 2, &value2, SIZEOF_SIZE_T);
+        memcpy(_q + 3, &value3, SIZEOF_SIZE_T);
+        p += 4 * SIZEOF_SIZE_T;
+        q += 4 * SIZEOF_SIZE_T;
+    }
+    while (p < size_t_end) {
+        const size_t *restrict _p = (const size_t *)p;
+        size_t *restrict _q = (size_t *)q;
+        size_t value;
+        memcpy(&value, _p, SIZEOF_SIZE_T);
+        if (value & ASCII_CHAR_MASK) {
+            break;
+        }
+        memcpy(_q, &value, SIZEOF_SIZE_T);
+        p += SIZEOF_SIZE_T;
+        q += SIZEOF_SIZE_T;
+    }
+    while (p < end) {
+        if ((unsigned char)*p & 0x80) {
+            break;
+        }
+        *q++ = *p++;
+    }
     return p - start;
 }


 static int
 unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
                          const char *starts, const char *s, const char *end,
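To sanity-check an ascii_decode-style fast path, one can compare it against a naive byte loop across lengths that straddle the word boundaries. Here is a hedged harness sketch (single-word loop only, no 4x unroll, uint64_t standing in for size_t; this is not the PR's test code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MASK UINT64_C(0x8080808080808080)

/* Word-at-a-time ASCII copy in the spirit of the PR. Returns bytes copied. */
static size_t ascii_decode_fast(const char *start, const char *end, uint8_t *dest)
{
    const char *p = start;
    uint8_t *q = dest;

    while ((size_t)(end - p) >= sizeof(uint64_t)) {
        uint64_t v;
        memcpy(&v, p, sizeof v);
        if (v & MASK) {
            break;              /* this word holds a non-ASCII byte */
        }
        memcpy(q, &v, sizeof v);
        p += sizeof(uint64_t);
        q += sizeof(uint64_t);
    }
    while (p < end) {           /* tail, or pinpoint the non-ASCII byte */
        if ((unsigned char)*p & 0x80) {
            break;
        }
        *q++ = (uint8_t)*p++;
    }
    return (size_t)(p - start);
}

/* Naive reference implementation. */
static size_t ascii_decode_ref(const char *start, const char *end, uint8_t *dest)
{
    const char *p = start;
    while (p < end && !((unsigned char)*p & 0x80)) {
        *dest++ = (uint8_t)*p++;
    }
    return (size_t)(p - start);
}

int main(void)
{
    char buf[64];
    uint8_t out1[64], out2[64];
    /* Lengths 0..64 cross the 8-byte word boundary many times; place the
       first non-ASCII byte at every position (bad == len: all ASCII). */
    for (size_t len = 0; len <= sizeof buf; len++) {
        for (size_t bad = 0; bad <= len; bad++) {
            memset(buf, 'a', sizeof buf);
            if (bad < len) {
                buf[bad] = (char)0xC3;
            }
            size_t n1 = ascii_decode_fast(buf, buf + len, out1);
            size_t n2 = ascii_decode_ref(buf, buf + len, out2);
            assert(n1 == n2 && memcmp(out1, out2, n1) == 0);
        }
    }
    puts("ok");
    return 0;
}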
Review comment: I would prefer to start processing bytes until we reach an address aligned to SIZEOF_SIZE_T, and then use size_t* pointers instead of memcpy().

Reply: OK, good to know, thanks. I will close another PR that simplifies the code by using this technique to allow unaligned loads: #123895
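For reference, a sketch of the shape this comment suggests: consume bytes until the pointer is aligned, then use direct word loads with no memcpy(). This is illustrative only; the cast-based load still assumes the platform tolerates this aliasing pattern, which is why CPython's previous code paired it with _Py_IS_ALIGNED checks.

#include <stdint.h>
#include <string.h>

#define MASK UINT64_C(0x8080808080808080)

/* Byte prologue up to the next 8-byte boundary, then aligned uint64_t loads. */
static int has_non_ascii(const unsigned char *p, const unsigned char *end)
{
    /* Byte prologue: advance until p is 8-byte aligned (or input ends). */
    while (p < end && ((uintptr_t)p & (sizeof(uint64_t) - 1)) != 0) {
        if (*p++ & 0x80) {
            return 1;
        }
    }
    /* Aligned word loop: no memcpy needed once alignment is established. */
    while ((size_t)(end - p) >= sizeof(uint64_t)) {
        uint64_t v = *(const uint64_t *)p;
        if (v & MASK) {
            return 1;
        }
        p += sizeof(uint64_t);
    }
    /* Byte epilogue for the remainder. */
    while (p < end) {
        if (*p++ & 0x80) {
            return 1;
        }
    }
    return 0;
}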