gh-120196: Faster ascii_decode and find_max_char implementations #120212

Closed

rhpvorderman wants to merge 24 commits

Changes from 12 commits

Commits
a54bf86
Write an unrolled version of find_max_char
rhpvorderman Jun 5, 2024
71ff457
Unroll 8 characters rather than 4 to improve vectorization in find_ma…
rhpvorderman Jun 7, 2024
98a6449
Bigger chunk ascii decoding
rhpvorderman Jun 7, 2024
10527c6
Fix whitespace issues
rhpvorderman Jun 7, 2024
f0f4139
Fix compiler warnings
rhpvorderman Jun 7, 2024
849a068
Prevent re-read of same value
rhpvorderman Jun 7, 2024
fad19a0
Simplify find_max_char initialization
rhpvorderman Jun 7, 2024
f04bb2c
Add missing increments
rhpvorderman Jun 7, 2024
cd0fc5e
Add blurb
rhpvorderman Jun 7, 2024
8f0fd56
Replace unroll_end with unrolled_end
rhpvorderman Jun 7, 2024
37aee7a
Reword blurb
rhpvorderman Jun 11, 2024
a6fc417
Update Objects/stringlib/find_max_char.h
rhpvorderman Jun 11, 2024
104ca62
Merge branch 'main' into MICROOPTIMIZATIONS
rhpvorderman Jun 12, 2024
d465517
Merge branch 'main' into MICROOPTIMIZATIONS
rhpvorderman Jun 14, 2024
1ce308e
Reuse find_max_char for bytes objects
rhpvorderman Jun 14, 2024
48f1e84
Simplify the find_max_char function by loading unaligned
rhpvorderman Jun 14, 2024
21de804
Allow optimized unaligned loads and simplify ascii_decode
rhpvorderman Jun 14, 2024
1ec2113
Add loop for more optimal assembly
rhpvorderman Jun 14, 2024
0258ae0
Also perform an unaligned load at the end
rhpvorderman Jun 14, 2024
ec76b74
Fix compiler warning
rhpvorderman Jun 14, 2024
845eb4e
Revert "Reuse find_max_char for bytes objects"
rhpvorderman Jun 14, 2024
f8cc68d
Merge branch 'main' into MICROOPTIMIZATIONS
rhpvorderman Jun 17, 2024
89ab2c9
Improve comments
rhpvorderman Jun 17, 2024
002b7ec
Merge branch 'main' into MICROOPTIMIZATIONS
rhpvorderman Sep 10, 2024
@@ -0,0 +1,2 @@
Improve performance of ASCII decoding and maximum character checking
by allowing vectorization by the compiler on suitable platforms.
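
For context, both functions rely on the same word-at-a-time trick: OR many bytes together and test the result against a mask that has only the high bit of each byte set, so a single branch detects any non-ASCII byte. A minimal standalone sketch of the idea (hypothetical names; the real masks are ASCII_CHAR_MASK and UCS1_ASCII_CHAR_MASK in the files below):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Each byte of the mask has only its high bit set; on 32-bit builds
       the cast truncates it to 0x80808080, which is still correct. */
    #define HIGH_BIT_MASK ((size_t)UINT64_C(0x8080808080808080))

    static int
    all_ascii(const unsigned char *p, size_t n)
    {
        size_t acc = 0;
        /* Word-at-a-time accumulation; a vectorizing compiler can widen
           this further into SIMD loads and ORs. */
        for (; n >= sizeof(size_t); n -= sizeof(size_t), p += sizeof(size_t)) {
            size_t value;
            memcpy(&value, p, sizeof(value));   /* alignment-safe load */
            acc |= value;
        }
        for (; n != 0; n--, p++) {
            acc |= *p;                          /* leftover tail bytes */
        }
        return (acc & HIGH_BIT_MASK) == 0;      /* 1 iff all bytes < 0x80 */
    }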
60 changes: 42 additions & 18 deletions Objects/stringlib/find_max_char.h
@@ -20,23 +20,45 @@ Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
{
    const unsigned char *p = (const unsigned char *) begin;
    const unsigned char *_end = (const unsigned char *)end;
    const size_t *aligned_end = (const size_t *)(_end - SIZEOF_SIZE_T);
Contributor:
Just checking whether I understand correctly: aligned_end is not aligned, but good enough to serve as the end of the loop over aligned values. To make it really aligned we would need to do something like aligned_end = _Py_SIZE_ROUND_DOWN(_end, SIZEOF_SIZE_T)?

Contributor Author:
Yes, that is the gist of it. I tried something similar to _Py_SIZE_ROUND_DOWN and got segfaults, so I opted for this simpler, tried-and-true solution. The name aligned_end is probably not the best, but I can't think of a better one right now.

Member:
Should be something like

    Py_ssize_t n = end - begin;
    const STRINGLIB_CHAR *p = begin;
    const STRINGLIB_CHAR *aligned_end = begin + _Py_SIZE_ROUND_DOWN(n, ALIGNOF_SIZE_T);

I would be interested in understanding why you got a segfault, though.
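
For reference, _Py_SIZE_ROUND_DOWN comes from Include/pymacro.h and rounds a size down to a multiple of a power-of-two alignment, roughly:

    #define _Py_SIZE_ROUND_DOWN(n, a)  ((size_t)(n) & ~(size_t)((a) - 1))
    /* e.g. _Py_SIZE_ROUND_DOWN(37, 8) == 32 */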

    const size_t *unrolled_end = aligned_end - 3;
    unsigned char accumulator = 0;
    /* Do not test each character individually, but use bitwise OR and test
       all characters at once. */
    while (p < _end && !_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
        accumulator |= *p;
        p += 1;
    }
    if (accumulator & 0x80) {
        return 255;
    } else if (p == end) {
        return 127;
    }

    while (p < end) {
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help register allocation */
            const unsigned char *_p = p;
            while (_p + SIZEOF_SIZE_T <= end) {
                size_t value = *(const size_t *) _p;
                if (value & UCS1_ASCII_CHAR_MASK)
                    return 255;
                _p += SIZEOF_SIZE_T;
            }
            p = _p;
            if (p == end)
                break;
        }
        if (*p++ & 0x80)
    /* On 64-bit platforms with 128-bit vectors (x86-64, arm64) the
       compiler can load 4 size_t values into two 16-byte vectors and do a
       vector bitwise OR. */
    const size_t *_p = (const size_t *)p;
    while (_p < unrolled_end) {
        size_t value = _p[0] | _p[1] | _p[2] | _p[3];
        if (value & UCS1_ASCII_CHAR_MASK) {
            return 255;
        }
        _p += 4;
    }
    size_t value = 0;
    while (_p < aligned_end) {
        value |= *_p;
        _p += 1;
    }
    p = (const unsigned char *)_p;
    while (p < _end) {
Member:
This can be done in a for-loop instead.

Contributor Author:
The original code is a while loop. Any particular reason why a for loop is preferable?

Member:
I would just say "for clarity", but let's leave it like this since it was the original code (fewer changes that way).
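
For reference, the for-loop spelling of the same tail would be:

    for (; p < _end; p++) {
        value |= *p;
    }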

        value |= *p;
        p += 1;
    }
Contributor:
We could combine these two loops into a single loop, right? Worst case, we do a single loop over 32-1=31 bytes instead of two while loops (the loop over _p, 7 steps, plus the loop over p, 3 steps).

Contributor Author:
The more optimal solution is to do an unaligned load of a size_t at (end - SIZEOF_SIZE_T). That would remove the entire last while loop. I do not know if all architectures support unaligned loads, however; I guess some could theoretically abort?

A single loop over 31 bytes taking 31 steps is less optimal than two while loops taking 10 steps in total (or even 7 in the case of a 64-bit size_t, which covers the majority of platforms).

Member:
You can use the memcpy trick to do an unaligned load; it will be optimized by any reasonable compiler. For example:

  size_t value;
  memcpy(&value, end - sizeof(value), sizeof(value));

Contributor Author:
That's a very nice suggestion! That would simplify the code a lot.
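
Sketched out, the tail of find_max_char could then become something like the following (commit 0258ae0 applies this idea; re-reading bytes already covered by the word loop is harmless because OR is idempotent):

    size_t value;
    /* One unaligned load covers the final bytes and replaces the
       byte-at-a-time tail loop; this requires the buffer to hold at
       least sizeof(size_t) bytes on this path. */
    memcpy(&value, _end - sizeof(value), sizeof(value));
    if (value & UCS1_ASCII_CHAR_MASK) {
        return 255;
    }
    return 127;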

    if (value & UCS1_ASCII_CHAR_MASK) {
        return 255;
    }
    return 127;
}
@@ -69,13 +91,15 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
    Py_UCS4 mask;
    Py_ssize_t n = end - begin;
    const STRINGLIB_CHAR *p = begin;
    const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4);
    const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 8);
Member:
As in the other PR, maybe the 4/8 choice could be made at compile time depending on the architecture.

Contributor Author:
99% of production work where performance matters is done on ARM64 (with 16-byte NEON vectors) and x86-64 (with SSE2 vectors); other platforms will not be hurt by this decision. I think there is no reason to complicate the build with choices like this.
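
For concreteness, the rejected compile-time selection would have looked something like this (hypothetical macro name and architecture test):

    #if defined(__x86_64__) || defined(__aarch64__)
    #  define FIND_MAX_CHAR_UNROLL 8
    #else
    #  define FIND_MAX_CHAR_UNROLL 4
    #endif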

    Py_UCS4 max_char;

    max_char = MAX_CHAR_ASCII;
    mask = MASK_ASCII;
    while (p < unrolled_end) {
        STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3];
        /* Loading 8 values at once allows platforms that have 16-byte vectors
           to do a vector load and vector bitwise OR. */
        STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
Member:
And here you would have some #if arch == ... to choose between 4 or 8 values.

Member:
I feel that those vector loads could be macros, for clarity. They would still be optimized by the compiler, and having them as macros might be helpful for future work.

Member:
I would prefer to not have a macro; the code is good as it is.

        if (bits & mask) {
            if (mask == mask_limit) {
                /* Limit reached */
@@ -94,7 +118,7 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
            /* We check the new mask on the same chars in the next iteration */
            continue;
        }
        p += 4;
        p += 8;
    }
    while (p < end) {
        if (p[0] & mask) {
32 changes: 30 additions & 2 deletions Objects/unicodeobject.c
@@ -4700,6 +4700,8 @@ static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    const char *size_t_end = end - SIZEOF_SIZE_T;
    const char *unrolled_end = end - (4 * SIZEOF_SIZE_T);
Member:
@encukou Should we still use macros such as SIZEOF_SIZE_T and ALIGNOF_SIZE_T, or should we prefer sizeof and _Alignof?

Member:
IMO, we should prefer sizeof and _Alignof here.
Things are a bit different in public headers (where sizeof is OK but _Alignof isn't) and in the preprocessor (in #if, neither works).

(There is no C API WG formal guideline yet; this is a personal recommendation. If anyone wants a wider discussion, do that in Discourse.)
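
A quick way to check the equivalence in a .c file (the configure-time macros additionally work in preprocessor #if checks, where sizeof and _Alignof cannot be evaluated):

    #include <stddef.h>
    _Static_assert(SIZEOF_SIZE_T == sizeof(size_t), "configure matches compiler");
    _Static_assert(ALIGNOF_SIZE_T == _Alignof(size_t), "configure matches compiler");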


#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
Member:
Is CPython supported on some platform where size_t is strictly greater than void*? I know this PR doesn't touch this conditional, but I'm curious. @encukou

Member:
I'm not aware of any platform where sizeof(size_t) != sizeof(void*).

Member:
This is not really about supported platforms (i.e. where we run the tests) or only about currently supported platforms.
We should aim for standard C whenever it's reasonable; if a speedup needs an extra assumption, it should document that with this kind of #if.

    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)
@@ -4710,7 +4712,25 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p + SIZEOF_SIZE_T <= end) {
        while (_p <= unrolled_end) {
            const size_t *restrict __p = (const size_t *)_p;
Contributor:
Are the restrict keyword and the assignments to value0, ..., value3 required for the compiler? If not, this code can be written more compactly.

Contributor Author:
In general I think the compiler can judge from the function that no writes happen through either __p or _q, so pointer aliasing is not problematic, but I'd rather make this explicit than rely on the compiler to deduce it. MSVC and Clang compile this scalarly despite the hint; GCC correctly vectorizes the load/store and does not need it. I cannot judge all possible compilers out there, but I think it is more likely that Clang and MSVC will do the correct thing in the future with the hint than without it.

It would only save the restrict keyword, since the (size_t *) cast is still needed and makes the rest of the code more readable. Are the eight extra characters of width that problematic?
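
For comparison, the compact variant under discussion would read roughly as follows (a sketch; the scan-only loop later in this diff uses the no-store version of it):

    /* Compact form: no named value0..value3 locals; restrict on both
       pointers still promises the load and store streams don't alias. */
    const size_t *restrict __p = (const size_t *)_p;
    size_t *restrict _q = (size_t *)q;
    size_t value = __p[0] | __p[1] | __p[2] | __p[3];
    if (value & ASCII_CHAR_MASK) {
        break;
    }
    _q[0] = __p[0];
    _q[1] = __p[1];
    _q[2] = __p[2];
    _q[3] = __p[3];
    _p += (4 * SIZEOF_SIZE_T);
    q += (4 * SIZEOF_SIZE_T);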

            size_t value0 = __p[0];
            size_t value1 = __p[1];
            size_t value2 = __p[2];
            size_t value3 = __p[3];
            size_t value = value0 | value1 | value2 | value3;
            if (value & ASCII_CHAR_MASK) {
                break;
            }
            size_t *restrict _q = (size_t *)q;
            _q[0] = value0;
            _q[1] = value1;
            _q[2] = value2;
            _q[3] = value3;
            _p += (4 * SIZEOF_SIZE_T);
            q += (4 * SIZEOF_SIZE_T);
        }
        while (_p <= size_t_end) {
Member:
You technically have at most 3 blocks of SIZEOF_SIZE_T until you reach size_t_end. In this case, I'd suggest creating a macro that unrolls that (I'm not sure whether the compiler knows this, but maybe it does).

Contributor Author:
Good suggestion to check if the compiler unrolls this. After all, I removed the if statement and only check at the end.

Contributor Author:
Unfortunately this leads to very suboptimal assembly, so the extra while loop is useful here.

            size_t value = *(const size_t *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
@@ -4733,7 +4753,15 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help allocation */
            const char *_p = p;
            while (_p + SIZEOF_SIZE_T <= end) {
            while (_p <= unrolled_end) {
                const size_t *restrict __p = (const size_t *)_p;
                size_t value = __p[0] | __p[1] | __p[2] | __p[3];
                if (value & ASCII_CHAR_MASK) {
                    break;
                }
                _p += (4 * SIZEOF_SIZE_T);
            }
            while (_p <= size_t_end) {
Member:
Again, here you have only 3 chunks, so maybe check whether you can unroll the loop once more (actually, just check whether the assembly at -O2 or -O3 unrolls it).

                size_t value = *(const size_t *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;