Skip to content

Commit 0377ba6

Browse files
author
zhongxiao.yzx
committed
[simd_neon] opt find_first/last_set for neon and setup benchmark for it(to #32373348)
Summary: The find_first_set and find_last_set methods are not optimal for NEON; they can be improved by synthesizing them with horizontal adds (vaddv), which reduces the generated assembly code. In the following case, vaddvq_s16 generates 2 instructions but vpadd_s16 generates 4 instructions: ``` #ifdef __aarch64__ return vaddvq_s16(__asint); // addv h0, v1.8h // smov w1, v0.h[0] #else return vpadd_s16( vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero), __zero)[0]; // addp v1.8h,v1.8h,v2.8h // addp v1.8h,v1.8h,v2.8h // addp v1.8h,v1.8h,v2.8h // smov w1, v1.h[0] #endif ``` For further discussion, see the following links: [1]. WebAssembly/simd#201 [2]. WebAssembly/simd#131 Test Plan: test_run.sh Reviewers: chengbin.cb, liangbin.mj, yifeng.dongyifeng, longfei.alf, chuanqi.xcq Issue: https://aone.alibaba-inc.com/req/32373348 CR: https://code.aone.alibaba-inc.com/cpp_libs/std-simd/codereview/4534679
1 parent 7541b23 commit 0377ba6

10 files changed

+420
-8
lines changed

ci/benchmark/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
# limitations under the License.
1515
#
1616

17-
set(BENCHMARK_SRCS test.cpp)
17+
set(BENCHMARK_SRCS
18+
benchmark_simdmask_find.cpp
19+
)
1820

1921
add_benchmark_ctest(bm_test ${BENCHMARK_SRCS})
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
#include "benchmark/benchmark.h"

#include <string.h>

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

#include "experimental/simd"
8+
9+
using namespace std::experimental;
10+
using namespace std::experimental::parallelism_v2;
11+
12+
// Shared, lazily-initialised state owned by getTestData().
static bool has_init = false;
// Copy of one randomly chosen element of TestData, set on first initialisation.
static int search_value;
static std::vector<int> TestData;

/// Returns the shared random test vector, filling it on the first call.
///
/// The first call appends `max_size` values drawn uniformly from
/// [1, max_size] and stores one randomly chosen element in `search_value`.
/// Every later call ignores `max_size` and returns the vector built earlier.
///
/// @param max_size number of elements to generate on the first call.
/// @return reference to the shared vector.
std::vector<int>&
getTestData(size_t max_size = 10000)
{
  if (has_init)
    {
      return TestData;
    }
  has_init = true;
  if (max_size == 0)
    {
      // uniform_int_distribution requires a <= b, so there is nothing valid
      // to generate for an empty request.
      return TestData;
    }

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> distrib(1, static_cast<int>(max_size));

  TestData.reserve(max_size);
  // Bound the loop by the requested size: reserve() may allocate more than
  // asked for, so capacity() is not a reliable element count.
  for (size_t i = 0; i < max_size; ++i)
    {
      TestData.emplace_back(distrib(gen));
    }

  // The value distribution spans [1, max_size]; using it as an index could
  // read one past the end. Draw the index from [0, size - 1] instead.
  std::uniform_int_distribution<size_t> pick(0, TestData.size() - 1);
  search_value = TestData[pick(gen)];
  return TestData;
}
36+
37+
// BM_SimpleFind {{{
38+
/// Scalar linear search over [first, last).
///
/// @return zero-based position of the first element comparing equal to
///         `val`, or the number of elements in the range when none matches.
template <typename InputIt, typename ValueT>
inline unsigned int
simple_find(InputIt first, InputIt last, const ValueT& val)
{
  unsigned int position = 0;
  while (first != last && !(*first == val))
    {
      ++first;
      ++position;
    }
  return position;
}
52+
53+
void
54+
BM_SimpleFind(benchmark::State& state)
55+
{
56+
int64_t Size = state.range(0);
57+
std::vector<int> TestData(Size, 0xff);
58+
int search_value = 9999;
59+
TestData[Size - 1] = search_value;
60+
for (auto _ : state)
61+
{
62+
benchmark::DoNotOptimize(
63+
simple_find(TestData.begin(), TestData.end(), search_value));
64+
}
65+
}
66+
BENCHMARK(BM_SimpleFind)->Arg(10000);
67+
// }}}
68+
69+
// BM_FindFirst {{{
70+
template <class Iterator, class T>
71+
inline unsigned int
72+
simd_first(Iterator first, Iterator last, const T& value)
73+
{
74+
std::experimental::native_simd<T> DataV;
75+
unsigned int i = 0;
76+
for (; first != last; i += std::experimental::native_simd<T>::size(),
77+
first += std::experimental::native_simd<T>::size())
78+
{
79+
DataV.copy_from(&(*first), std::experimental::vector_aligned);
80+
const auto mask = DataV == value;
81+
if (std::experimental::any_of(mask))
82+
{
83+
return i + std::experimental::find_first_set(mask);
84+
}
85+
}
86+
return i;
87+
}
88+
89+
void
90+
BM_SimdFind(benchmark::State& state)
91+
{
92+
int64_t Size = state.range(0);
93+
std::vector<int> TestData(Size, 0xff);
94+
int search_value = 9999;
95+
TestData[Size - 1] = search_value;
96+
for (auto _ : state)
97+
{
98+
benchmark::DoNotOptimize(
99+
simd_first(TestData.begin(), TestData.end(), search_value));
100+
}
101+
}
102+
BENCHMARK(BM_SimdFind)->Arg(10000);
103+
// }}}
104+
105+
// BM_StdFindFirstSet {{{
106+
/// Scalar reference implementation: walks the mask lane by lane.
///
/// @return index of the first lane that converts to true, or
///         MaskType::size() when no lane is set.
template <typename MaskType>
inline int
std_find_first_set(MaskType& mask)
{
  size_t lane = 0;
  while (lane < MaskType::size() && !static_cast<bool>(mask[lane]))
    {
      ++lane;
    }
  return lane;
}
120+
121+
template <typename ElemT>
122+
void
123+
BM_StdFindFirstSet(benchmark::State& state)
124+
{
125+
// fprintf(stderr, "benchmark setup");
126+
native_simd_mask<ElemT> nat_simd(0);
127+
nat_simd[state.range(0)] = 1;
128+
for (auto _ : state)
129+
{
130+
benchmark::DoNotOptimize(std_find_first_set(nat_simd));
131+
}
132+
}
133+
BENCHMARK_TEMPLATE(BM_StdFindFirstSet, uint8_t)
134+
->DenseRange(1, native_simd<uint8_t>::size(), 1);
135+
// }}}
136+
137+
// BM_SimdFindFirstSet {{{
138+
/// Thin forwarding wrapper around find_first_set(), giving the SIMD
/// benchmark the same call shape as std_find_first_set().
template <typename MaskType>
inline int
simd_find_first_set(MaskType& mask)
{
  const int first_lane = find_first_set(mask);
  return first_lane;
}
144+
145+
template <typename ElemT>
146+
void
147+
BM_SimdFindFirstSet(benchmark::State& state)
148+
{
149+
native_simd_mask<ElemT> nat_simd_mask(0);
150+
nat_simd_mask[state.range(0)] = 1;
151+
for (auto _ : state)
152+
{
153+
benchmark::DoNotOptimize(simd_find_first_set(nat_simd_mask));
154+
}
155+
}
156+
BENCHMARK_TEMPLATE(BM_SimdFindFirstSet, uint8_t)
157+
->DenseRange(0, native_simd<uint8_t>::size(), 1);
158+
BENCHMARK_TEMPLATE(BM_SimdFindFirstSet, uint16_t)
159+
->DenseRange(0, native_simd<uint16_t>::size(), 1);
160+
BENCHMARK_TEMPLATE(BM_SimdFindFirstSet, uint32_t)
161+
->DenseRange(0, native_simd<uint32_t>::size(), 1);
162+
BENCHMARK_TEMPLATE(BM_SimdFindFirstSet, uint64_t)
163+
->DenseRange(0, native_simd<uint64_t>::size(), 1);
164+
165+
// }}}
166+
167+
// BM_SimdFindLastSet {{{
168+
template <typename ElemT>
169+
void
170+
BM_SimdFindLastSet(benchmark::State& state)
171+
{
172+
native_simd_mask<ElemT> nat_simd_mask(0);
173+
nat_simd_mask[state.range(0)] = 1;
174+
for (auto _ : state)
175+
{
176+
benchmark::DoNotOptimize(simd_find_first_set(nat_simd_mask));
177+
}
178+
}
179+
BENCHMARK_TEMPLATE(BM_SimdFindLastSet, uint8_t)
180+
->DenseRange(0, native_simd<uint8_t>::size(), 1);
181+
BENCHMARK_TEMPLATE(BM_SimdFindLastSet, uint16_t)
182+
->DenseRange(0, native_simd<uint16_t>::size(), 1);
183+
BENCHMARK_TEMPLATE(BM_SimdFindLastSet, uint32_t)
184+
->DenseRange(0, native_simd<uint32_t>::size(), 1);
185+
BENCHMARK_TEMPLATE(BM_SimdFindLastSet, uint64_t)
186+
->DenseRange(0, native_simd<uint64_t>::size(), 1);
187+
// }}}

ci/cmake/FindBenchmark.cmake

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ set(GTEST_INCLUDE_DIR "${GTEST_SRC}/include")
8787
set(GTEST_LIB_DIR "${BINARY_DIR}/lib")
8888
set(GTEST_LIB "${GTEST_LIB_DIR}/libgtest.a")
8989
set(GTEST_MAIN "${GTEST_LIB_DIR}/libgtest_main.a")
90+
# Debug configuration: use the "d"-suffixed GoogleTest archives instead.
# The variable is named rather than expanded so that an empty or unset
# CMAKE_BUILD_TYPE compares as false instead of leaving if() with a missing
# left-hand operand.
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  set(GTEST_LIB "${GTEST_LIB_DIR}/libgtestd.a")
  set(GTEST_MAIN "${GTEST_LIB_DIR}/libgtest_maind.a")
endif()
9094

9195
add_library(gtest_ STATIC IMPORTED GLOBAL)
9296
set_property(TARGET gtest_ PROPERTY IMPORTED_LOCATION ${GTEST_LIB})
@@ -108,6 +112,10 @@ set(GMOCK_INCLUDE_DIR "${GMOCK_SRC}/include")
108112
set(GMOCK_LIB_DIR "${BINARY_DIR}/lib")
109113
set(GMOCK_LIB "${GMOCK_LIB_DIR}/libgmock.a")
110114
set(GMOCK_MAIN "${GMOCK_LIB_DIR}/libgmock_main.a")
115+
# Debug configuration: use the "d"-suffixed GoogleMock archives instead.
# The variable is named rather than expanded so that an empty or unset
# CMAKE_BUILD_TYPE compares as false instead of leaving if() with a missing
# left-hand operand.
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  set(GMOCK_LIB "${GMOCK_LIB_DIR}/libgmockd.a")
  set(GMOCK_MAIN "${GMOCK_LIB_DIR}/libgmock_maind.a")
endif()
111119

112120
add_library(gmock_ STATIC IMPORTED GLOBAL)
113121
set_property(TARGET gmock_ PROPERTY IMPORTED_LOCATION ${GMOCK_LIB})

ci/cmake/FindGTest.cmake

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,10 @@ set(GTEST_INCLUDE_DIR "${GTEST_SRC}/include")
7474
set(GTEST_LIB_DIR "${BINARY_DIR}/lib")
7575
set(GTEST_LIB "${GTEST_LIB_DIR}/libgtest.a")
7676
set(GTEST_MAIN "${GTEST_LIB_DIR}/libgtest_main.a")
77-
77+
# Debug configuration: use the "d"-suffixed GoogleTest archives instead.
# The variable is named rather than expanded so that an empty or unset
# CMAKE_BUILD_TYPE compares as false instead of leaving if() with a missing
# left-hand operand.
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  set(GTEST_LIB "${GTEST_LIB_DIR}/libgtestd.a")
  set(GTEST_MAIN "${GTEST_LIB_DIR}/libgtest_maind.a")
endif()
7881
# FIXME(): IMPORTED target can not set interface_directories with non-existing path
7982
# we use INTERFACE target to workaroud
8083
# target_include_directories(T INTERFACE DIR) DIR must existed
@@ -99,6 +102,10 @@ set(GMOCK_INCLUDE_DIR "${GMOCK_SRC}/include")
99102
set(GMOCK_LIB_DIR "${BINARY_DIR}/lib")
100103
set(GMOCK_LIB "${GMOCK_LIB_DIR}/libgmock.a")
101104
set(GMOCK_MAIN "${GMOCK_LIB_DIR}/libgmock_main.a")
105+
# Debug configuration: use the "d"-suffixed GoogleMock archives instead.
# The variable is named rather than expanded so that an empty or unset
# CMAKE_BUILD_TYPE compares as false instead of leaving if() with a missing
# left-hand operand.
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  set(GMOCK_LIB "${GMOCK_LIB_DIR}/libgmockd.a")
  set(GMOCK_MAIN "${GMOCK_LIB_DIR}/libgmock_maind.a")
endif()
102109

103110
add_library(gmock_ STATIC IMPORTED GLOBAL)
104111
set_property(TARGET gmock_ PROPERTY IMPORTED_LOCATION ${GMOCK_LIB})

ci/unittest/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ set(UNITTEST_SRCS
1919
test_simd_add.cpp
2020
test_simd_popcount.cpp
2121
test_simd_struct.cpp
22+
test_simdmask_find.cpp
23+
neon/test_simd_neon.cpp
2224
)
2325

2426
add_ctest(ut_gtest ${UNITTEST_SRCS})

ci/unittest/neon/test_simd_neon.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#include "gtest/gtest.h"
2+
3+
#include <stdio.h>
4+
#include <stdint.h>
5+
6+
#include "experimental/simd"
7+
8+
#if _GLIBCXX_SIMD_HAVE_NEON
9+
10+
using namespace std::experimental;
11+
using namespace std::experimental::parallelism_v2;
12+
13+
// Sets lane 7 of a native uint8_t mask, calls the NEON move-mask helper on
// it, and checks that the implementation-level __find_first_set reports
// lane 7.
TEST(_MaskImplNeonMixin, Test01_MoveMask)
{
  native_simd_mask<uint8_t> mask8(0);
  mask8[7] = 1;
  using MaskImplNeon
    = _MaskImplNeon<typename native_simd_mask<uint8_t>::abi_type>;
  int bits = MaskImplNeon::__move_mask_aarch64(__data(mask8));
  // Empty asm statement that takes `bits` as an input operand, so the value
  // computed above counts as used. NOTE(review): `bits` itself is never
  // asserted on; confirm whether a value check is wanted here.
  asm volatile("" : : "r,m"(bits) : "memory");

  int set_bit = MaskImplNeon::__find_first_set(mask8);
  // EXPECT_EQ prints both values on failure, unlike EXPECT_TRUE(a == b).
  EXPECT_EQ(7, set_bit);
}
25+
26+
// For each element type and each lane index `idx`, sets lane `idx` and its
// mirror lane `(size - idx) % size`, then checks that find_first_set returns
// the lower of the two and find_last_set the higher.
TEST(_MaskImplNeon, Test01_FindFirstSet)
{
  // A generic lambda replaces the former function-local macro: it does not
  // leak a macro name into the rest of the translation unit and does not
  // depend on the GNU statement-expression extension.
  auto check_all_lanes = [](auto elem_tag, const char* banner) {
    using ElemT = decltype(elem_tag);
    using MaskT = native_simd_mask<ElemT>;
    const int lanes = static_cast<int>(MaskT::size());
    printf("%s", banner);
    for (int idx = 0; idx < lanes; ++idx)
      {
        MaskT MaskInst(0);
        const int last_idx = (lanes - idx) % lanes;
        MaskInst[idx] = 1;
        MaskInst[last_idx] = 1;
        printf("MaskInst[%d] = MaskInst[%d] = 1\n", idx, last_idx);
        int bit_set = find_first_set(MaskInst);
        printf("first set %d ", bit_set);
        // EXPECT_EQ reports expected and actual lane on failure.
        EXPECT_EQ(std::min(idx, last_idx), bit_set);
        bit_set = find_last_set(MaskInst);
        printf("last set %d\n", bit_set);
        EXPECT_EQ(std::max(idx, last_idx), bit_set);
      }
  };

  check_all_lanes(uint8_t{}, "\nnative_simd_mask<uint8_t>\n");
  check_all_lanes(uint16_t{}, "\nnative_simd_mask<uint16_t>\n");
  check_all_lanes(uint32_t{}, "\nnative_simd_mask<uint32_t>\n");
  check_all_lanes(uint64_t{}, "\nnative_simd_mask<uint64_t>\n");
}
66+
67+
#endif

ci/unittest/test_simd_popcount.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ TEST(PopCntTest, Test02_simdmask_popcount)
4848
printf("%d bits <-- vec[%lu] :", bit_count, Size); \
4949
for (size_t i = 0; i < Size; ++i) \
5050
{ \
51-
printf("0x%lx ", static_cast<uint64_t>(simd_val[i])); \
51+
printf("0x%lx ", static_cast<uint64_t>(simd_val[i])); \
5252
} \
5353
printf("\n"); \
5454
bit_count; \

ci/unittest/test_simdmask_find.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#include "gtest/gtest.h"
2+
3+
#include <stdio.h>
4+
5+
#include "experimental/simd"
6+
7+
using namespace std::experimental;
8+
using namespace std::experimental::parallelism_v2;
9+
10+
// Builds a mask with only lane 5 set (via an element-wise compare) and checks
// that find_first_set reports lane 5.
TEST(SimdMaskTest, Test01_find_first_set)
{
  native_simd<uint8_t> DataV(0);
  DataV[5] = 1;
  native_simd_mask<uint8_t> mask = DataV == 1;
  int idx = find_first_set(mask);
  // printf comes from <stdio.h>, which this file includes; std::cout was used
  // here without <iostream> being included.
  printf("first set bit is %d\n", idx);
  // EXPECT_EQ reports the actual lane on failure.
  EXPECT_EQ(5, idx);
}
19+
20+
// Builds a mask with only lane 5 set (via an element-wise compare) and checks
// that find_last_set reports lane 5.
TEST(SimdMaskTest, Test02_find_last_set)
{
  native_simd<uint8_t> DataV(0);
  DataV[5] = 1;
  native_simd_mask<uint8_t> mask = DataV == 1;
  int idx = find_last_set(mask);
  // The message now says "last": it previously said "first set bit" although
  // the value comes from find_last_set. printf comes from <stdio.h>, which
  // this file includes; std::cout was used without <iostream>.
  printf("last set bit is %d\n", idx);
  // EXPECT_EQ reports the actual lane on failure.
  EXPECT_EQ(5, idx);
}

0 commit comments

Comments
 (0)