1
+ #include < util/generic/ptr.h>
2
+ #include < util/system/cpu_id.h>
3
+ #include < util/system/types.h>
4
+ #include < util/stream/output.h>
5
+ #include < util/generic/string.h>
6
+ #include < vector>
7
+ #include < immintrin.h>
8
+ #include < avxintrin.h>
9
+ #include < chrono>
10
+ #include < ydb/library/yql/utils/simd/simd.h>
11
+
12
+
13
// Number of elements in every benchmark column (6.4 million).
// Written as an exact integer literal so no double -> size_t conversion is involved.
const size_t size = 6'400'000;
14
+
15
+ template <typename T>
16
+ inline double GetSum (std::vector<std::vector<T>>& columns, std::vector<T>& result) {
17
+ const size_t SIZE_OF_TYPE = 256 / (sizeof (T) * 8 );
18
+ const size_t align_size = columns[0 ].size ();
19
+
20
+ std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now ();
21
+
22
+ for (size_t i = 0 ; i < align_size; i += SIZE_OF_TYPE) {
23
+ NSimd::NAVX2::TSimd8<T> final_register (&columns[0 ][i]);
24
+
25
+ for (size_t j = 1 ; j < columns.size (); ++j) {
26
+ final_register.Add64 (&columns[j][i]);
27
+ }
28
+
29
+ final_register.Store (&result[i]);
30
+ }
31
+
32
+ std::chrono::steady_clock::time_point finish = std::chrono::steady_clock::now ();
33
+
34
+ return std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count ();
35
+
36
+ }
37
+
38
+ double StandartAdding (std::vector<std::vector<ui64>>& columns, std::vector<ui64>& result) {
39
+ std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now ();
40
+
41
+ for (size_t j = 0 ; j < columns[0 ].size (); ++j) {
42
+
43
+ for (size_t i = 0 ; i < columns[i].size (); ++i) {
44
+ result[j] += columns[i][j];
45
+ }
46
+
47
+ }
48
+ std::chrono::steady_clock::time_point finish = std::chrono::steady_clock::now ();
49
+
50
+ return std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count ();
51
+ }
52
+
53
+ int main () {
54
+ std::vector<std::vector<ui64>> vec1 (10 , std::vector<ui64>(size, 1e12 + 3 ));
55
+
56
+ std::vector<ui64> result1 (size, 0 );
57
+ std::vector<ui64> result2 (size, 0 );
58
+
59
+ double ans1 = GetSum (vec1, result1);
60
+ double ans2 = StandartAdding (vec1, result2);
61
+
62
+ for (size_t i = 0 ; i < result2.size (); ++i) {
63
+ if (result2[i] != result1[i]) {
64
+ Cerr << " something went wrong..." ;
65
+ return 0 ;
66
+ }
67
+ }
68
+
69
+ Cerr << " The results are the same. Let's compare times:\n " ;
70
+ Cerr << " Time, using AVX2: " << ans1 << " ms\n " ;
71
+ Cerr << " Time, using standart adding: " << ans2 << " ms" ;
72
+ }
0 commit comments