gh-120196: Faster ascii_decode and find_max_char implementations #120212


Closed: rhpvorderman wants to merge 24 commits.

Commits
a54bf86
Write an unrolled version of find_max_char
rhpvorderman Jun 5, 2024
71ff457
Unroll 8 characters rather than 4 to improve vectorization in find_ma…
rhpvorderman Jun 7, 2024
98a6449
Bigger chunk ascii decoding
rhpvorderman Jun 7, 2024
10527c6
Fix whitespace issues
rhpvorderman Jun 7, 2024
f0f4139
Fix compiler warnings
rhpvorderman Jun 7, 2024
849a068
Prevent re-read of same value
rhpvorderman Jun 7, 2024
fad19a0
Simplify find_max_char initialization
rhpvorderman Jun 7, 2024
f04bb2c
Add missing increments
rhpvorderman Jun 7, 2024
cd0fc5e
Add blurb
rhpvorderman Jun 7, 2024
8f0fd56
Replace unroll_end with unrolled_end
rhpvorderman Jun 7, 2024
37aee7a
Reword blurb
rhpvorderman Jun 11, 2024
a6fc417
Update Objects/stringlib/find_max_char.h
rhpvorderman Jun 11, 2024
104ca62
Merge branch 'main' into MICROOPTIMIZATIONS
rhpvorderman Jun 12, 2024
d465517
Merge branch 'main' into MICROOPTIMIZATIONS
rhpvorderman Jun 14, 2024
1ce308e
Reuse find_max_char for bytes objects
rhpvorderman Jun 14, 2024
48f1e84
Simplify the find_max_char function by loading unaligned
rhpvorderman Jun 14, 2024
21de804
Allow optimized unaligned loads and simplify ascii_decode
rhpvorderman Jun 14, 2024
1ec2113
Add loop for more optimal assembly
rhpvorderman Jun 14, 2024
0258ae0
Also perform an unaligned load at the end
rhpvorderman Jun 14, 2024
ec76b74
Fix compiler warning
rhpvorderman Jun 14, 2024
845eb4e
Revert "Reuse find_max_char for bytes objects"
rhpvorderman Jun 14, 2024
f8cc68d
Merge branch 'main' into MICROOPTIMIZATIONS
rhpvorderman Jun 17, 2024
89ab2c9
Improve comments
rhpvorderman Jun 17, 2024
002b7ec
Merge branch 'main' into MICROOPTIMIZATIONS
rhpvorderman Sep 10, 2024
New NEWS entry:
@@ -0,0 +1,2 @@
+Improve performance of ASCII decoding and maximum character checking
+by allowing vectorization by the compiler on suitable platforms.
70 changes: 50 additions & 20 deletions Objects/stringlib/find_max_char.h
@@ -20,25 +20,53 @@
 Py_LOCAL_INLINE(Py_UCS4)
 STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
 {
-    const unsigned char *p = (const unsigned char *) begin;
+    const unsigned char *restrict p = (const unsigned char *) begin;
     const unsigned char *_end = (const unsigned char *)end;

-    while (p < _end) {
-        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
-            /* Help register allocation */
-            const unsigned char *_p = p;
-            while (_p + SIZEOF_SIZE_T <= _end) {
-                size_t value = *(const size_t *) _p;
-                if (value & UCS1_ASCII_CHAR_MASK)
-                    return 255;
-                _p += SIZEOF_SIZE_T;
-            }
-            p = _p;
-            if (p == _end)
-                break;
-        }
-        if (*p++ & 0x80)
-            return 255;
-    }
+    const unsigned char *size_t_end = _end - SIZEOF_SIZE_T;
+    const unsigned char *unrolled_end = _end - (4 * SIZEOF_SIZE_T - 1);
+    while (p < unrolled_end) {
Review comment (Member):
I would prefer to start processing bytes until we reach an address aligned to SIZEOF_SIZE_T, and then use size_t* pointers instead of memcpy().

Reply (rhpvorderman, Contributor, Author):
OK, good to know, thanks. I will close another PR that simplifies the code by using this technique to allow unaligned loads: #123895

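Note: for illustration only, the alignment-first shape the reviewer describes might look like the sketch below. It reuses identifiers from this file (p, _end, SIZEOF_SIZE_T), but it is an assumption about the suggestion, not code from this PR:

    /* Hypothetical sketch: consume single bytes until p is aligned,
       then dereference size_t pointers directly instead of memcpy(). */
    size_t accumulator = 0;
    while (p < _end && !_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
        accumulator |= *p++;
    }
    while (p + SIZEOF_SIZE_T <= _end) {
        accumulator |= *(const size_t *)p;  /* aligned load, well-defined */
        p += SIZEOF_SIZE_T;
    }
    while (p < _end) {
        accumulator |= *p++;  /* leftover tail bytes */
    }
    return (accumulator & UCS1_ASCII_CHAR_MASK) ? 255 : 127;
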
+        /* Chunks of 4 size_t values allow for compiler optimizations using vectors */
+        const size_t *restrict _p = (const size_t *)p;
+        size_t value0;
+        size_t value1;
+        size_t value2;
+        size_t value3;
+        /* Will be optimized to simple loads for architectures that support
+           unaligned loads. */
+        memcpy(&value0, _p + 0, SIZEOF_SIZE_T);
+        memcpy(&value1, _p + 1, SIZEOF_SIZE_T);
+        memcpy(&value2, _p + 2, SIZEOF_SIZE_T);
+        memcpy(&value3, _p + 3, SIZEOF_SIZE_T);
+        size_t value = value0 | value1 | value2 | value3;
+        if (value & UCS1_ASCII_CHAR_MASK) {
+            return 255;
+        }
+        p += (4 * SIZEOF_SIZE_T);
+    }
+    size_t accumulator = 0;
+    while (p < size_t_end) {
+        size_t value;
+        memcpy(&value, p, SIZEOF_SIZE_T);
+        accumulator |= value;
+        p += SIZEOF_SIZE_T;
+    }
+    /* In the end there will be up to SIZEOF_SIZE_T leftover characters. It is
+       faster to do an unaligned load of a size_t integer at an offset from
+       the end rather than breaking it up into single bytes. However, if the
+       string is smaller than SIZEOF_SIZE_T this strategy is illegal. */
+    if (size_t_end >= (const unsigned char*)begin) {
+        size_t value;
+        memcpy(&value, size_t_end, SIZEOF_SIZE_T);
+        accumulator |= value;
+    } else {
+        /* Fallback for smaller than size_t strings. */
+        while (p < _end) {
+            accumulator |= *p;
+            p += 1;
+        }
+    }
+    if (accumulator & UCS1_ASCII_CHAR_MASK) {
+        return 255;
+    }
     return 127;
 }
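Note: the overlapping tail load above is easier to see in isolation. A minimal, self-contained sketch; the helper name or_all_bytes is hypothetical and not part of this PR:

    #include <stddef.h>
    #include <string.h>

    /* OR together all bytes of buf using word-sized loads. The final
       word is loaded at an overlapping offset from the end; re-reading
       a few bytes twice is harmless for a bitwise OR. */
    static size_t
    or_all_bytes(const unsigned char *buf, size_t len)
    {
        size_t acc = 0;
        if (len < sizeof(size_t)) {
            /* An overlapping load would read before the buffer here,
               so fall back to single bytes. */
            for (size_t i = 0; i < len; i++) {
                acc |= buf[i];
            }
            return acc;
        }
        const unsigned char *p = buf;
        const unsigned char *word_end = buf + len - sizeof(size_t);
        while (p < word_end) {
            size_t value;
            memcpy(&value, p, sizeof(size_t));  /* one word-sized load */
            acc |= value;
            p += sizeof(size_t);
        }
        size_t value;
        memcpy(&value, word_end, sizeof(size_t));  /* overlapping tail */
        return acc | value;
    }
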
@@ -71,13 +99,15 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
     Py_UCS4 mask;
     Py_ssize_t n = end - begin;
     const STRINGLIB_CHAR *p = begin;
-    const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4);
+    const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 8);
Review comment (Member):
As in the other PR, maybe the 4/8 choice can be chosen at compile time depending on the architecture.

Reply (rhpvorderman, Contributor, Author):
99% of production work where performance matters will be done on ARM64 (with 16-byte NEON vectors) and x86-64 (with SSE2 vectors); other platforms will not be hurt by this decision. I think there is no reason to complicate the build with choices like this.

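Note: for reference, the compile-time selection suggested here could be as small as the sketch below. The STRINGLIB_UNROLL name and the architecture test are assumptions, not part of this PR:

    /* Hypothetical: pick the unroll width per architecture. */
    #if defined(__x86_64__) || defined(__aarch64__)
    /* 16-byte vector units (SSE2 / NEON): OR 8 values per iteration. */
    #define STRINGLIB_UNROLL 8
    #else
    #define STRINGLIB_UNROLL 4
    #endif

    const STRINGLIB_CHAR *unrolled_end =
        begin + _Py_SIZE_ROUND_DOWN(n, STRINGLIB_UNROLL);
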
     Py_UCS4 max_char;
 
     max_char = MAX_CHAR_ASCII;
     mask = MASK_ASCII;
     while (p < unrolled_end) {
-        STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3];
+        /* Loading 8 values at once allows platforms that have 16-byte vectors
+           to do a vector load and vector bitwise OR. */
+        STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
Review comment (Member):
And here, you would have some #if arch == ... to choose between 4 or 8 values.

Review comment (Member):
I feel that those vector loads could be macros, for clarity purposes. They would then be optimized by the compiler, but having them as macros might be helpful for future work.

Review comment (Member):
I would prefer not to have a macro; the code is good as it is.

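Note: the macro floated (and declined) above might have looked like this hypothetical sketch; STRINGLIB_OR8 is an invented name:

    /* Hypothetical: OR eight consecutive values starting at p. */
    #define STRINGLIB_OR8(p) \
        ((p)[0] | (p)[1] | (p)[2] | (p)[3] | \
         (p)[4] | (p)[5] | (p)[6] | (p)[7])

    STRINGLIB_CHAR bits = STRINGLIB_OR8(p);
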
         if (bits & mask) {
             if (mask == mask_limit) {
                 /* Limit reached */
@@ -96,7 +126,7 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
             /* We check the new mask on the same chars in the next iteration */
             continue;
         }
-        p += 4;
+        p += 8;
     }
     while (p < end) {
         if (p[0] & mask) {
82 changes: 39 additions & 43 deletions Objects/unicodeobject.c
@@ -4973,58 +4973,54 @@ static Py_ssize_t
 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
 {
     const char *p = start;

-#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
-    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)
-        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
-    {
-        /* Fast path, see in STRINGLIB(utf8_decode) for
-           an explanation. */
-        /* Help allocation */
-        const char *_p = p;
-        Py_UCS1 * q = dest;
-        while (_p + SIZEOF_SIZE_T <= end) {
-            size_t value = *(const size_t *) _p;
-            if (value & ASCII_CHAR_MASK)
-                break;
-            *((size_t *)q) = value;
-            _p += SIZEOF_SIZE_T;
-            q += SIZEOF_SIZE_T;
+    Py_UCS1 *q = dest;
+    const char *size_t_end = end - SIZEOF_SIZE_T;
+    const char *unrolled_end = end - (4 * SIZEOF_SIZE_T);
Review comment (Member):
@encukou Should we still use macros such as SIZEOF_SIZE_T and ALIGNOF_SIZE_T, or should we prefer sizeof and _Alignof?

Reply (Member):
IMO, we should prefer sizeof and _Alignof here. Things are a bit different in public headers (where sizeof is OK but _Alignof isn't) or in the preprocessor (in #if, neither works). (There is no formal C API WG guideline yet; this is a personal recommendation. If anyone wants a wider discussion, do that on Discourse.)

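Note: concretely, the preference stated above would replace the configure-time macros with C11 operators. A minimal sketch of the substitution (not applied in this PR):

    const char *size_t_end = end - sizeof(size_t);
    const char *unrolled_end = end - (4 * sizeof(size_t));
    /* ... with _Alignof(size_t) replacing ALIGNOF_SIZE_T in any
       remaining runtime alignment checks. */
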

+    while (p < unrolled_end) {
+        const size_t *restrict _p = (const size_t *)p;
+        size_t *restrict _q = (size_t *)q;
+        size_t value0;
+        size_t value1;
+        size_t value2;
+        size_t value3;
+        /* Memcpy optimizes to unaligned loads on supporting platforms. */
+        memcpy(&value0, _p + 0, SIZEOF_SIZE_T);
+        memcpy(&value1, _p + 1, SIZEOF_SIZE_T);
+        memcpy(&value2, _p + 2, SIZEOF_SIZE_T);
+        memcpy(&value3, _p + 3, SIZEOF_SIZE_T);
+        size_t value = value0 | value1 | value2 | value3;
+        if (value & ASCII_CHAR_MASK) {
+            break;
+        }
-        p = _p;
-        while (p < end) {
-            if ((unsigned char)*p & 0x80)
-                break;
-            *q++ = *p++;
+        memcpy(_q + 0, &value0, SIZEOF_SIZE_T);
+        memcpy(_q + 1, &value1, SIZEOF_SIZE_T);
+        memcpy(_q + 2, &value2, SIZEOF_SIZE_T);
+        memcpy(_q + 3, &value3, SIZEOF_SIZE_T);
+        p += 4 * SIZEOF_SIZE_T;
+        q += 4 * SIZEOF_SIZE_T;
+    }
+    while (p < size_t_end) {
+        const size_t *restrict _p = (const size_t *)p;
+        size_t *restrict _q = (size_t *)q;
+        size_t value;
+        memcpy(&value, _p, SIZEOF_SIZE_T);
+        if (value & ASCII_CHAR_MASK) {
+            break;
+        }
-        return p - start;
+        memcpy(_q, &value, SIZEOF_SIZE_T);
+        p += SIZEOF_SIZE_T;
+        q += SIZEOF_SIZE_T;
+    }
-#endif
     while (p < end) {
-        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
-           for an explanation. */
-        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
-            /* Help allocation */
-            const char *_p = p;
-            while (_p + SIZEOF_SIZE_T <= end) {
-                size_t value = *(const size_t *) _p;
-                if (value & ASCII_CHAR_MASK)
-                    break;
-                _p += SIZEOF_SIZE_T;
-            }
-            p = _p;
-            if (_p == end)
-                break;
-        }
-        if ((unsigned char)*p & 0x80)
+        if ((unsigned char)*p & 0x80) {
             break;
-        ++p;
+        }
+        *q++ = *p++;
     }
-    memcpy(dest, start, p - start);
     return p - start;
 }


 static int
 unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
                          const char *starts, const char *s, const char *end,
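Note: the memcpy idiom both rewritten functions rely on can be shown in isolation; load_word is a hypothetical helper name. On x86-64 and AArch64, compilers lower the fixed-size memcpy to a single (possibly unaligned) register load, avoiding the undefined behaviour of dereferencing a misaligned size_t pointer:

    #include <string.h>

    /* Load one machine word from p, regardless of alignment. */
    static inline size_t
    load_word(const char *p)
    {
        size_t value;
        memcpy(&value, p, sizeof(size_t));
        return value;
    }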