Skip to content

Commit 1711bc3

Browse files
Vectorize basic_string::rfind (the single character overload) (#5087)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent: 64d143d; commit: 1711bc3

File tree

5 files changed: +107 -45 lines changed

benchmarks/src/find_and_count.cpp

+40 -12
Original file line number | Diff line number | Diff line change
@@ -7,25 +7,38 @@
77
#include <cstdint>
88
#include <cstdlib>
99
#include <ranges>
10+
#include <string>
11+
#include <type_traits>
1012
#include <vector>
1113

14+
#include "skewed_allocator.hpp"
15+
1216
enum class Op {
1317
FindSized,
1418
FindUnsized,
1519
Count,
20+
StringFind,
21+
StringRFind,
1622
};
1723

1824
using namespace std;
1925

20-
template <class T, Op Operation>
26+
template <class T, template <class> class Alloc, Op Operation>
2127
void bm(benchmark::State& state) {
2228
const auto size = static_cast<size_t>(state.range(0));
2329
const auto pos = static_cast<size_t>(state.range(1));
2430

25-
vector<T> a(size, T{'0'});
31+
using Container = conditional_t<Operation == Op::StringFind || Operation == Op::StringRFind,
32+
basic_string<T, char_traits<T>, Alloc<T>>, vector<T, Alloc<T>>>;
33+
34+
Container a(size, T{'0'});
2635

2736
if (pos < size) {
28-
a[pos] = T{'1'};
37+
if constexpr (Operation == Op::StringRFind) {
38+
a[size - pos - 1] = T{'1'};
39+
} else {
40+
a[pos] = T{'1'};
41+
}
2942
} else {
3043
if constexpr (Operation == Op::FindUnsized) {
3144
abort();
@@ -39,6 +52,10 @@ void bm(benchmark::State& state) {
3952
benchmark::DoNotOptimize(ranges::find(a.begin(), unreachable_sentinel, T{'1'}));
4053
} else if constexpr (Operation == Op::Count) {
4154
benchmark::DoNotOptimize(ranges::count(a.begin(), a.end(), T{'1'}));
55+
} else if constexpr (Operation == Op::StringFind) {
56+
benchmark::DoNotOptimize(a.find(T{'1'}));
57+
} else if constexpr (Operation == Op::StringRFind) {
58+
benchmark::DoNotOptimize(a.rfind(T{'1'}));
4259
}
4360
}
4461
}
@@ -50,17 +67,28 @@ void common_args(auto bm) {
5067
}
5168

5269

53-
BENCHMARK(bm<uint8_t, Op::FindSized>)->Apply(common_args);
54-
BENCHMARK(bm<uint8_t, Op::FindUnsized>)->Apply(common_args);
55-
BENCHMARK(bm<uint8_t, Op::Count>)->Apply(common_args);
70+
BENCHMARK(bm<uint8_t, not_highly_aligned_allocator, Op::FindSized>)->Apply(common_args);
71+
BENCHMARK(bm<uint8_t, highly_aligned_allocator, Op::FindSized>)->Apply(common_args);
72+
BENCHMARK(bm<uint8_t, not_highly_aligned_allocator, Op::FindUnsized>)->Apply(common_args);
73+
BENCHMARK(bm<uint8_t, highly_aligned_allocator, Op::FindUnsized>)->Apply(common_args);
74+
BENCHMARK(bm<uint8_t, not_highly_aligned_allocator, Op::Count>)->Apply(common_args);
75+
BENCHMARK(bm<uint8_t, highly_aligned_allocator, Op::Count>)->Apply(common_args);
76+
BENCHMARK(bm<char, not_highly_aligned_allocator, Op::StringFind>)->Apply(common_args);
77+
BENCHMARK(bm<char, highly_aligned_allocator, Op::StringFind>)->Apply(common_args);
78+
BENCHMARK(bm<char, not_highly_aligned_allocator, Op::StringRFind>)->Apply(common_args);
79+
BENCHMARK(bm<char, highly_aligned_allocator, Op::StringRFind>)->Apply(common_args);
5680

57-
BENCHMARK(bm<uint16_t, Op::FindSized>)->Apply(common_args);
58-
BENCHMARK(bm<uint16_t, Op::Count>)->Apply(common_args);
81+
BENCHMARK(bm<uint16_t, not_highly_aligned_allocator, Op::FindSized>)->Apply(common_args);
82+
BENCHMARK(bm<uint16_t, not_highly_aligned_allocator, Op::Count>)->Apply(common_args);
83+
BENCHMARK(bm<wchar_t, not_highly_aligned_allocator, Op::StringFind>)->Apply(common_args);
84+
BENCHMARK(bm<wchar_t, not_highly_aligned_allocator, Op::StringRFind>)->Apply(common_args);
5985

60-
BENCHMARK(bm<uint32_t, Op::FindSized>)->Apply(common_args);
61-
BENCHMARK(bm<uint32_t, Op::Count>)->Apply(common_args);
86+
BENCHMARK(bm<uint32_t, not_highly_aligned_allocator, Op::FindSized>)->Apply(common_args);
87+
BENCHMARK(bm<uint32_t, not_highly_aligned_allocator, Op::Count>)->Apply(common_args);
88+
BENCHMARK(bm<char32_t, not_highly_aligned_allocator, Op::StringFind>)->Apply(common_args);
89+
BENCHMARK(bm<char32_t, not_highly_aligned_allocator, Op::StringRFind>)->Apply(common_args);
6290

63-
BENCHMARK(bm<uint64_t, Op::FindSized>)->Apply(common_args);
64-
BENCHMARK(bm<uint64_t, Op::Count>)->Apply(common_args);
91+
BENCHMARK(bm<uint64_t, not_highly_aligned_allocator, Op::FindSized>)->Apply(common_args);
92+
BENCHMARK(bm<uint64_t, not_highly_aligned_allocator, Op::Count>)->Apply(common_args);
6593

6694
BENCHMARK_MAIN();

stl/inc/__msvc_string_view.hpp

+18 -1
Original file line number | Diff line number | Diff line change
@@ -724,7 +724,24 @@ constexpr size_t _Traits_rfind_ch(_In_reads_(_Hay_size) const _Traits_ptr_t<_Tra
724724
return static_cast<size_t>(-1);
725725
}
726726

727-
for (auto _Match_try = _Haystack + (_STD min)(_Start_at, _Hay_size - 1);; --_Match_try) {
727+
const size_t _Actual_start_at = (_STD min)(_Start_at, _Hay_size - 1);
728+
729+
#if _USE_STD_VECTOR_ALGORITHMS
730+
if constexpr (_Is_implementation_handled_char_traits<_Traits>) {
731+
if (!_STD _Is_constant_evaluated()) {
732+
const auto _End = _Haystack + _Actual_start_at + 1;
733+
const auto _Ptr = _STD _Find_last_vectorized(_Haystack, _End, _Ch);
734+
735+
if (_Ptr != _End) {
736+
return static_cast<size_t>(_Ptr - _Haystack);
737+
} else {
738+
return static_cast<size_t>(-1);
739+
}
740+
}
741+
}
742+
#endif // _USE_STD_VECTOR_ALGORITHMS
743+
744+
for (auto _Match_try = _Haystack + _Actual_start_at;; --_Match_try) {
728745
if (_Traits::eq(*_Match_try, _Ch)) {
729746
return static_cast<size_t>(_Match_try - _Haystack); // found a match
730747
}

stl/inc/algorithm

-32
Original file line number | Diff line number | Diff line change
@@ -54,11 +54,6 @@ _Min_max_element_t __stdcall __std_minmax_element_8(const void* _First, const vo
5454
_Min_max_element_t __stdcall __std_minmax_element_f(const void* _First, const void* _Last, bool _Unused) noexcept;
5555
_Min_max_element_t __stdcall __std_minmax_element_d(const void* _First, const void* _Last, bool _Unused) noexcept;
5656

57-
const void* __stdcall __std_find_last_trivial_1(const void* _First, const void* _Last, uint8_t _Val) noexcept;
58-
const void* __stdcall __std_find_last_trivial_2(const void* _First, const void* _Last, uint16_t _Val) noexcept;
59-
const void* __stdcall __std_find_last_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
60-
const void* __stdcall __std_find_last_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;
61-
6257
__declspec(noalias) _Min_max_1i __stdcall __std_minmax_1i(const void* _First, const void* _Last) noexcept;
6358
__declspec(noalias) _Min_max_1u __stdcall __std_minmax_1u(const void* _First, const void* _Last) noexcept;
6459
__declspec(noalias) _Min_max_2i __stdcall __std_minmax_2i(const void* _First, const void* _Last) noexcept;
@@ -162,33 +157,6 @@ auto _Minmax_vectorized(_Ty* const _First, _Ty* const _Last) noexcept {
162157
}
163158
}
164159

165-
template <class _Ty, class _TVal>
166-
_Ty* _Find_last_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val) noexcept {
167-
if constexpr (is_pointer_v<_TVal> || is_null_pointer_v<_TVal>) {
168-
#ifdef _WIN64
169-
return const_cast<_Ty*>(
170-
static_cast<const _Ty*>(::__std_find_last_trivial_8(_First, _Last, reinterpret_cast<uint64_t>(_Val))));
171-
#else
172-
return const_cast<_Ty*>(
173-
static_cast<const _Ty*>(::__std_find_last_trivial_4(_First, _Last, reinterpret_cast<uint32_t>(_Val))));
174-
#endif
175-
} else if constexpr (sizeof(_Ty) == 1) {
176-
return const_cast<_Ty*>(
177-
static_cast<const _Ty*>(::__std_find_last_trivial_1(_First, _Last, static_cast<uint8_t>(_Val))));
178-
} else if constexpr (sizeof(_Ty) == 2) {
179-
return const_cast<_Ty*>(
180-
static_cast<const _Ty*>(::__std_find_last_trivial_2(_First, _Last, static_cast<uint16_t>(_Val))));
181-
} else if constexpr (sizeof(_Ty) == 4) {
182-
return const_cast<_Ty*>(
183-
static_cast<const _Ty*>(::__std_find_last_trivial_4(_First, _Last, static_cast<uint32_t>(_Val))));
184-
} else if constexpr (sizeof(_Ty) == 8) {
185-
return const_cast<_Ty*>(
186-
static_cast<const _Ty*>(::__std_find_last_trivial_8(_First, _Last, static_cast<uint64_t>(_Val))));
187-
} else {
188-
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
189-
}
190-
}
191-
192160
template <class _Ty, class _TVal1, class _TVal2>
193161
__declspec(noalias) void _Replace_vectorized(
194162
_Ty* const _First, _Ty* const _Last, const _TVal1 _Old_val, const _TVal2 _New_val) noexcept {

stl/inc/xutility

+32
Original file line number | Diff line number | Diff line change
@@ -93,6 +93,11 @@ const void* __stdcall __std_find_trivial_2(const void* _First, const void* _Last
9393
const void* __stdcall __std_find_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
9494
const void* __stdcall __std_find_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;
9595

96+
const void* __stdcall __std_find_last_trivial_1(const void* _First, const void* _Last, uint8_t _Val) noexcept;
97+
const void* __stdcall __std_find_last_trivial_2(const void* _First, const void* _Last, uint16_t _Val) noexcept;
98+
const void* __stdcall __std_find_last_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
99+
const void* __stdcall __std_find_last_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;
100+
96101
const void* __stdcall __std_find_first_of_trivial_1(
97102
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
98103
const void* __stdcall __std_find_first_of_trivial_2(
@@ -217,6 +222,33 @@ _Ty* _Find_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val) noe
217222
}
218223
}
219224

225+
template <class _Ty, class _TVal>
226+
_Ty* _Find_last_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val) noexcept {
227+
if constexpr (is_pointer_v<_TVal> || is_null_pointer_v<_TVal>) {
228+
#ifdef _WIN64
229+
return const_cast<_Ty*>(
230+
static_cast<const _Ty*>(::__std_find_last_trivial_8(_First, _Last, reinterpret_cast<uint64_t>(_Val))));
231+
#else
232+
return const_cast<_Ty*>(
233+
static_cast<const _Ty*>(::__std_find_last_trivial_4(_First, _Last, reinterpret_cast<uint32_t>(_Val))));
234+
#endif
235+
} else if constexpr (sizeof(_Ty) == 1) {
236+
return const_cast<_Ty*>(
237+
static_cast<const _Ty*>(::__std_find_last_trivial_1(_First, _Last, static_cast<uint8_t>(_Val))));
238+
} else if constexpr (sizeof(_Ty) == 2) {
239+
return const_cast<_Ty*>(
240+
static_cast<const _Ty*>(::__std_find_last_trivial_2(_First, _Last, static_cast<uint16_t>(_Val))));
241+
} else if constexpr (sizeof(_Ty) == 4) {
242+
return const_cast<_Ty*>(
243+
static_cast<const _Ty*>(::__std_find_last_trivial_4(_First, _Last, static_cast<uint32_t>(_Val))));
244+
} else if constexpr (sizeof(_Ty) == 8) {
245+
return const_cast<_Ty*>(
246+
static_cast<const _Ty*>(::__std_find_last_trivial_8(_First, _Last, static_cast<uint64_t>(_Val))));
247+
} else {
248+
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
249+
}
250+
}
251+
220252
// find_first_of vectorization is likely to be a win after this size (in elements)
221253
_INLINE_VAR constexpr ptrdiff_t _Threshold_find_first_of = 16;
222254

tests/std/tests/VSO_0000000_vector_algorithms/test.cpp

+17
Original file line number | Diff line number | Diff line change
@@ -1128,6 +1128,22 @@ void test_case_string_rfind_str(const basic_string<T>& input_haystack, const bas
11281128
assert(expected == actual);
11291129
}
11301130

1131+
template <class T>
1132+
void test_case_string_rfind_ch(const basic_string<T>& input_haystack, const T value) {
1133+
ptrdiff_t expected;
1134+
1135+
const auto expected_iter = last_known_good_find_last(input_haystack.begin(), input_haystack.end(), value);
1136+
1137+
if (expected_iter != input_haystack.end()) {
1138+
expected = expected_iter - input_haystack.begin();
1139+
} else {
1140+
expected = -1;
1141+
}
1142+
1143+
const auto actual = static_cast<ptrdiff_t>(input_haystack.rfind(value));
1144+
assert(expected == actual);
1145+
}
1146+
11311147
template <class T, class D>
11321148
void test_basic_string_dis(mt19937_64& gen, D& dis) {
11331149
basic_string<T> input_haystack;
@@ -1144,6 +1160,7 @@ void test_basic_string_dis(mt19937_64& gen, D& dis) {
11441160
test_case_string_find_last_of(input_haystack, input_needle);
11451161
test_case_string_find_str(input_haystack, input_needle);
11461162
test_case_string_rfind_str(input_haystack, input_needle);
1163+
test_case_string_rfind_ch(input_haystack, static_cast<T>(dis(gen)));
11471164

11481165
for (size_t attempts = 0; attempts < needleDataCount; ++attempts) {
11491166
input_needle.push_back(static_cast<T>(dis(gen)));

0 commit comments

Comments
 (0)