From b2f9fb52d3cdec8f56944574a1c2ac152925fea1 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 13:30:57 +0200 Subject: [PATCH 01/15] Faster counting of characters due to autovectorization --- Objects/stringlib/fastsearch.h | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 309ed1554f4699..6461d6831fcef8 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -740,14 +740,31 @@ static inline Py_ssize_t STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, const STRINGLIB_CHAR p0, Py_ssize_t maxcount) { - Py_ssize_t i, count = 0; - for (i = 0; i < n; i++) { - if (s[i] == p0) { - count++; - if (count == maxcount) { - return maxcount; + Py_ssize_t count = 0; + const STRINGLIB_CHAR *restrict cursor = s; + const STRINGLIB_CHAR *end_ptr = s + n; + const STRINGLIB_CHAR *unroll_end_ptr = end_ptr - 31; + /* By unrolling in chunks of 32, the compiler can auto vectorize, resulting + in much better performance. 
*/ + while (cursor < unroll_end_ptr) { + for(size_t i=0; i<32; i++) { + if (cursor[i] == p0) { + count += 1; } } + if (count >= maxcount) { + return maxcount; + } + cursor += 32; + } + while (cursor < end_ptr) { + if (*cursor == p0) { + count += 1; + } + cursor += 1; + } + if (count >= maxcount) { + return maxcount; } return count; } From cb564438a1afc062d9e4b220ce16e87c3a859596 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 13:47:31 +0200 Subject: [PATCH 02/15] Add blurb entry for faster count method --- .../2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst new file mode 100644 index 00000000000000..2ec0da3d67ba96 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst @@ -0,0 +1,3 @@ +Significantly improve the speed of the str.count, bytes.count and +bytearray.count method when the argument is a single character and the +target architecture and compiler support vectorization. 
From fff610bd4cf152a7c0ac8ebb3ef6446718f17870 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 14:28:05 +0200 Subject: [PATCH 03/15] Rewrite count function as a for loop --- Objects/stringlib/fastsearch.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 6461d6831fcef8..dfb7296a999bc3 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -740,28 +740,25 @@ static inline Py_ssize_t STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, const STRINGLIB_CHAR p0, Py_ssize_t maxcount) { - Py_ssize_t count = 0; - const STRINGLIB_CHAR *restrict cursor = s; - const STRINGLIB_CHAR *end_ptr = s + n; - const STRINGLIB_CHAR *unroll_end_ptr = end_ptr - 31; + Py_ssize_t i, count = 0; + Py_ssize_t unroll_length = n - 31; /* By unrolling in chunks of 32, the compiler can auto vectorize, resulting in much better performance. */ - while (cursor < unroll_end_ptr) { - for(size_t i=0; i<32; i++) { - if (cursor[i] == p0) { + for (i = 0; i < unroll_length; i+=32) { + const STRINGLIB_CHAR *restrict cursor = s + i; + for(size_t j = 0; j < 32; j++) { + if (cursor[j] == p0) { count += 1; } } if (count >= maxcount) { return maxcount; } - cursor += 32; } - while (cursor < end_ptr) { - if (*cursor == p0) { + for (; i < n; i++) { + if (s[i] == p0) { count += 1; } - cursor += 1; } if (count >= maxcount) { return maxcount; @@ -770,6 +767,7 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, } + Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, From 1359366d20b4768d7ec89150c826ea20d12f6d2c Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 14:42:26 +0200 Subject: [PATCH 04/15] Do the max count check in the inner loop --- Objects/stringlib/fastsearch.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index dfb7296a999bc3..d362c1f7a49fc6 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -757,12 +757,12 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, } for (; i < n; i++) { if (s[i] == p0) { - count += 1; + count++; + if (count == maxcount) { + return maxcount; + } } } - if (count >= maxcount) { - return maxcount; - } return count; } From 5837e412f2fad138cf8030060f3f1e80b263b851 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 14:44:23 +0200 Subject: [PATCH 05/15] Remove extraneous newline --- Objects/stringlib/fastsearch.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index d362c1f7a49fc6..09aa5b035a9791 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -767,7 +767,6 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, } - Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, From c6f40682b444c9ca007dcf333a4a4323e0ffd9c9 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 14:53:09 +0200 Subject: [PATCH 06/15] Add an explanatory comment --- Objects/stringlib/fastsearch.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 09aa5b035a9791..a3628daf4e0015 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -744,13 +744,17 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, Py_ssize_t unroll_length = n - 31; /* By unrolling in chunks of 32, the compiler can auto vectorize, resulting in much better performance. 
*/ - for (i = 0; i < unroll_length; i+=32) { + for (i = 0; i < unroll_length; i += 32) { const STRINGLIB_CHAR *restrict cursor = s + i; for(size_t j = 0; j < 32; j++) { if (cursor[j] == p0) { count += 1; } } + /* By performing the check outside of the read/compare loop the + compiler is guaranteed that 32 bytes can be read and counted. + As a result it can vectorize. + */ if (count >= maxcount) { return maxcount; } From 0eaccd590ae575b2c38cab13b88402d17984b025 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 15:32:58 +0200 Subject: [PATCH 07/15] Revert changes --- Objects/stringlib/fastsearch.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index a3628daf4e0015..309ed1554f4699 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -741,25 +741,7 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, const STRINGLIB_CHAR p0, Py_ssize_t maxcount) { Py_ssize_t i, count = 0; - Py_ssize_t unroll_length = n - 31; - /* By unrolling in chunks of 32, the compiler can auto vectorize, resulting - in much better performance. */ - for (i = 0; i < unroll_length; i += 32) { - const STRINGLIB_CHAR *restrict cursor = s + i; - for(size_t j = 0; j < 32; j++) { - if (cursor[j] == p0) { - count += 1; - } - } - /* By performing the check outside of the read/compare loop the - compiler is guaranteed that 32 bytes can be read and counted. - As a result it can vectorize. 
- */ - if (count >= maxcount) { - return maxcount; - } - } - for (; i < n; i++) { + for (i = 0; i < n; i++) { if (s[i] == p0) { count++; if (count == maxcount) { From a85202cc36333c50b57c2e0b065e481b7f55ce0f Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 15:38:18 +0200 Subject: [PATCH 08/15] Use a no maximum count function --- Objects/stringlib/fastsearch.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 309ed1554f4699..29df3dadf79bd2 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -753,6 +753,22 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, } +static inline Py_ssize_t +STRINGLIB(count_char_no_maximum)(const STRINGLIB_CHAR *s, Py_ssize_t n, + const STRINGLIB_CHAR p0) +/* By removing the maximum out of the loop, the compiler can optimize using + vectors */ +{ + Py_ssize_t i, count = 0; + for (i = 0; i < n; i++) { + if (s[i] == p0) { + count++; + } + } + return count; +} + + Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, @@ -773,6 +789,9 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, else if (mode == FAST_RSEARCH) return STRINGLIB(rfind_char)(s, n, p[0]); else { + if (maxcount == PY_SSIZE_T_MAX) { + return STRINGLIB(count_char_no_maximum)(s, n, p[0]); + } return STRINGLIB(count_char)(s, n, p[0], maxcount); } } From 807706d005fb8cedbe97afab16a5333a92e2cc9f Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 16:24:22 +0200 Subject: [PATCH 09/15] Update Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- .../2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff 
--git a/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst index 2ec0da3d67ba96..a20ce045060bf1 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst @@ -1,3 +1,2 @@ -Significantly improve the speed of the str.count, bytes.count and -bytearray.count method when the argument is a single character and the -target architecture and compiler support vectorization. +Improve the :meth:`str.count`, :meth:`bytes.count` and :meth:`bytearray.count` +methods for counting single characters. From fb83c6adb6eec0fd3f0d76cd6f741d6e054344ed Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 12 Jun 2024 16:26:43 +0200 Subject: [PATCH 10/15] Formatting --- Objects/stringlib/fastsearch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 29df3dadf79bd2..7e62fd678ea3ff 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -755,7 +755,7 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, static inline Py_ssize_t STRINGLIB(count_char_no_maximum)(const STRINGLIB_CHAR *s, Py_ssize_t n, - const STRINGLIB_CHAR p0) + const STRINGLIB_CHAR p0) /* By removing the maximum out of the loop, the compiler can optimize using vectors */ { From 05a1fc200991764d15c21cdc6172a5ae8a288152 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Thu, 13 Jun 2024 07:10:11 +0200 Subject: [PATCH 11/15] Give a speed indication --- .../2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst index a20ce045060bf1..05c55e8a45eb12 
100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-06-12-13-47-25.gh-issue-120397.n-I_cc.rst @@ -1,2 +1,2 @@ -Improve the :meth:`str.count`, :meth:`bytes.count` and :meth:`bytearray.count` +Improve the throughput by up to two times for the :meth:`str.count`, :meth:`bytes.count` and :meth:`bytearray.count` methods for counting single characters. From 2fab99b0a915611546dd191f9fc440b14a8fbb1b Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Thu, 13 Jun 2024 07:13:38 +0200 Subject: [PATCH 12/15] Update comment --- Objects/stringlib/fastsearch.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 7e62fd678ea3ff..a22076f09d08a7 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -756,11 +756,10 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, static inline Py_ssize_t STRINGLIB(count_char_no_maximum)(const STRINGLIB_CHAR *s, Py_ssize_t n, const STRINGLIB_CHAR p0) -/* By removing the maximum out of the loop, the compiler can optimize using - vectors */ +/* A specialized function of count_char that does not cut off at a maximum. + As a result, the compiler is able to vectorize the loop. 
*/ { - Py_ssize_t i, count = 0; - for (i = 0; i < n; i++) { + for (Py_ssize_t i = 0; i < n; i++) { if (s[i] == p0) { count++; } From ce9ab9beacea812c42c2d930e59bad1371642386 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Thu, 13 Jun 2024 07:19:27 +0200 Subject: [PATCH 13/15] rename function --- Objects/stringlib/fastsearch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index a22076f09d08a7..30e73ebee1c035 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -754,9 +754,9 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, static inline Py_ssize_t -STRINGLIB(count_char_no_maximum)(const STRINGLIB_CHAR *s, Py_ssize_t n, - const STRINGLIB_CHAR p0) -/* A specialized function of count_char that does not cut off at a maximum. +STRINGLIB(count_char_no_maxcount)(const STRINGLIB_CHAR *s, Py_ssize_t n, + const STRINGLIB_CHAR p0) +/* A specialized function of count_char that does not cut off at a maximum. As a result, the compiler is able to vectorize the loop. */ { for (Py_ssize_t i = 0; i < n; i++) { From 0cc9369fc22836bee234d43a2e5415089dcb9dd9 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 13 Jun 2024 11:38:32 +0200 Subject: [PATCH 14/15] Update Objects/stringlib/fastsearch.h Co-authored-by: Nice Zombies --- Objects/stringlib/fastsearch.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 30e73ebee1c035..a7f19c5dfdda60 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -759,6 +759,7 @@ STRINGLIB(count_char_no_maxcount)(const STRINGLIB_CHAR *s, Py_ssize_t n, /* A specialized function of count_char that does not cut off at a maximum. As a result, the compiler is able to vectorize the loop. 
*/ { + Py_ssize_t count = 0; for (Py_ssize_t i = 0; i < n; i++) { if (s[i] == p0) { count++; From 30a65a7f1efadb01ec4b95b6eefd9e0a1b0dfe54 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Thu, 13 Jun 2024 14:06:02 +0200 Subject: [PATCH 15/15] Fix changing oversights --- Objects/stringlib/fastsearch.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 30e73ebee1c035..05e700b06258f0 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -759,6 +759,7 @@ STRINGLIB(count_char_no_maxcount)(const STRINGLIB_CHAR *s, Py_ssize_t n, /* A specialized function of count_char that does not cut off at a maximum. As a result, the compiler is able to vectorize the loop. */ { + Py_ssize_t count = 0; for (Py_ssize_t i = 0; i < n; i++) { if (s[i] == p0) { count++; @@ -789,7 +790,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return STRINGLIB(rfind_char)(s, n, p[0]); else { if (maxcount == PY_SSIZE_T_MAX) { - return STRINGLIB(count_char_no_maximum)(s, n, p[0]); + return STRINGLIB(count_char_no_maxcount)(s, n, p[0]); } return STRINGLIB(count_char)(s, n, p[0], maxcount); }