Skip to content

Commit 63a2b20

Browse files
committed
[libc++, std::vector] call the optimized version of __uninitialized_allocator_copy for trivial types
See: llvm/llvm-project#61987

Fix suggested by: @philnik and @var-const
Reviewers: philnik, ldionne, EricWF, var-const
Differential Revision: https://reviews.llvm.org/D147741

Testing: ninja check-cxx check-clang check-llvm
Benchmark test cases (BM_CopyConstruct and BM_Assignment) added.

Performance improvement:

Run on (8 X 4800 MHz CPU s)
CPU Caches:
  L1 Data 48 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 1280 KiB (x4)
  L3 Unified 12288 KiB (x1)
Load Average: 1.66, 3.02, 2.43

Comparing build-runtimes-base/libcxx/benchmarks/vector_operations.libcxx.out to build-runtimes/libcxx/benchmarks/vector_operations.libcxx.out

Benchmark                                        Time      CPU   Time Old   Time New    CPU Old    CPU New
----------------------------------------------------------------------------------------------------------
BM_ConstructSize/vector_byte/5140480          +0.0362  +0.0362     116906     121132     116902     121131
BM_CopyConstruct/vector_int/5140480           -0.4563  -0.4577    1755224     954241    1755330     951987
BM_Assignment/vector_int/5140480              -0.0222  -0.0220     990045     968095     989917     968125
BM_ConstructSizeValue/vector_byte/5140480     +0.0308  +0.0307     116970     120567     116977     120573
BM_ConstructIterIter/vector_char/1024         -0.0831  -0.0831         19         17         19         17
BM_ConstructIterIter/vector_size_t/1024       +0.0129  +0.0131         88         89         88         89
BM_ConstructIterIter/vector_string/1024       -0.0064  -0.0018      54455      54109      54208      54112
OVERALL_GEOMEAN                               -0.0845  -0.0842          0          0          0          0

FYI, the performance improvement for BM_CopyConstruct from this patch is mostly subsumed by https://reviews.llvm.org/D149826. However, this patch still adds value by converting the copy into a memmove (the second test case).
Before the patch:

```
define linkonce_odr dso_local void @_ZNSt3__16vectorIiNS_9allocatorIiEEE18__construct_at_endIPiS5_EEvT_T0_m(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, i64 noundef %3) local_unnamed_addr #4 comdat align 2 {
  %5 = getelementptr inbounds %"class.std::__1::vector", ptr %0, i64 0, i32 1
  %6 = load ptr, ptr %5, align 8, !tbaa !12
  %7 = icmp eq ptr %1, %2
  br i1 %7, label %16, label %8

8:                                                ; preds = %4, %8
  %9 = phi ptr [ %13, %8 ], [ %1, %4 ]
  %10 = phi ptr [ %14, %8 ], [ %6, %4 ]
  %11 = icmp ne ptr %10, null
  tail call void @llvm.assume(i1 %11)
  %12 = load i32, ptr %9, align 4, !tbaa !14
  store i32 %12, ptr %10, align 4, !tbaa !14
  %13 = getelementptr inbounds i32, ptr %9, i64 1
  %14 = getelementptr inbounds i32, ptr %10, i64 1
  %15 = icmp eq ptr %13, %2
  br i1 %15, label %16, label %8, !llvm.loop !16

16:                                               ; preds = %8, %4
  %17 = phi ptr [ %6, %4 ], [ %14, %8 ]
  store ptr %17, ptr %5, align 8, !tbaa !12
  ret void
}
```

After the patch:

```
define linkonce_odr dso_local void @_ZNSt3__16vectorIiNS_9allocatorIiEEE18__construct_at_endIPiS5_EEvT_T0_m(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, i64 noundef %3) local_unnamed_addr #4 comdat align 2 {
  %5 = getelementptr inbounds %"class.std::__1::vector", ptr %0, i64 0, i32 1
  %6 = load ptr, ptr %5, align 8, !tbaa !12
  %7 = ptrtoint ptr %2 to i64
  %8 = ptrtoint ptr %1 to i64
  %9 = sub i64 %7, %8
  %10 = ashr exact i64 %9, 2
  tail call void @llvm.memmove.p0.p0.i64(ptr align 4 %6, ptr align 4 %1, i64 %9, i1 false)
  %11 = getelementptr inbounds i32, ptr %6, i64 %10
  store ptr %11, ptr %5, align 8, !tbaa !12
  ret void
}
```

The element-by-element copy loop is replaced by a single memmove. This is due to the optimized version of the __uninitialized_allocator_copy function.
1 parent fa58f32 commit 63a2b20

File tree

3 files changed

+50
-9
lines changed

3 files changed

+50
-9
lines changed

libcxx/benchmarks/ContainerBenchmarks.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,28 @@ void BM_ConstructSize(benchmark::State& st, Container) {
2626
}
2727
}
2828

29+
// Benchmark for copy construction of a container.
// A source container `c` is constructed once with st.range(0) as its size
// argument; each timed iteration then copy-constructs a new container `v`
// from `c`. The second (unnamed) parameter is only used to deduce Container.
template <class Container>
30+
void BM_CopyConstruct(benchmark::State& st, Container) {
31+
auto size = st.range(0);
32+
Container c(size);
33+
for (auto _ : st) {
34+
auto v = c;
35+
// DoNotOptimizeData is defined elsewhere; presumably it keeps the compiler
// from discarding the otherwise-unused copy -- confirm against its definition.
DoNotOptimizeData(v);
36+
}
37+
}
38+
39+
// Benchmark for copy assignment of a container.
// `c2` is constructed once with st.range(0) as its size argument and `c1` is
// default-constructed; each timed iteration assigns c2 to c1. The second
// (unnamed) parameter is only used to deduce Container.
template <class Container>
40+
void BM_Assignment(benchmark::State& st, Container) {
41+
auto size = st.range(0);
42+
Container c1;
43+
Container c2(size);
44+
for (auto _ : st) {
45+
// c1 lives outside the loop, so only the first iteration assigns into an
// empty container; later iterations overwrite a previous copy of c2.
c1 = c2;
46+
// DoNotOptimizeData is defined elsewhere; presumably it keeps the compiler
// from optimizing the assignment away -- confirm against its definition.
DoNotOptimizeData(c1);
47+
DoNotOptimizeData(c2);
48+
}
49+
}
50+
2951
template <class Container>
3052
void BM_ConstructSizeValue(benchmark::State& st, Container, typename Container::value_type const& val) {
3153
const auto size = st.range(0);

libcxx/benchmarks/vector_operations.bench.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,14 @@ BENCHMARK_CAPTURE(BM_ConstructSize,
1717
vector_byte,
1818
std::vector<unsigned char>{})->Arg(5140480);
1919

20+
// Register the copy-construction and copy-assignment benchmarks for
// std::vector<int>, using the same Arg value (5140480) as the neighbouring
// vector_byte registrations.
BENCHMARK_CAPTURE(BM_CopyConstruct,
21+
vector_int,
22+
std::vector<int>{})->Arg(5140480);
23+
24+
BENCHMARK_CAPTURE(BM_Assignment,
25+
vector_int,
26+
std::vector<int>{})->Arg(5140480);
27+
2028
BENCHMARK_CAPTURE(BM_ConstructSizeValue,
2129
vector_byte,
2230
std::vector<unsigned char>{}, 0)->Arg(5140480);

libcxx/include/__memory/uninitialized_algorithms.h

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
#include <__algorithm/copy.h>
1414
#include <__algorithm/move.h>
15+
#include <__algorithm/unwrap_iter.h>
16+
#include <__algorithm/unwrap_range.h>
1517
#include <__config>
1618
#include <__iterator/iterator_traits.h>
1719
#include <__iterator/reverse_iterator.h>
@@ -545,7 +547,7 @@ class _AllocatorDestroyRangeReverse {
545547
// already copied elements are destroyed in reverse order of their construction.
546548
template <class _Alloc, class _Iter1, class _Sent1, class _Iter2>
547549
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter2
548-
__uninitialized_allocator_copy(_Alloc& __alloc, _Iter1 __first1, _Sent1 __last1, _Iter2 __first2) {
550+
__uninitialized_allocator_copy_impl(_Alloc& __alloc, _Iter1 __first1, _Sent1 __last1, _Iter2 __first2) {
549551
auto __destruct_first = __first2;
550552
auto __guard =
551553
std::__make_exception_guard(_AllocatorDestroyRangeReverse<_Alloc, _Iter2>(__alloc, __destruct_first, __first2));
@@ -565,14 +567,16 @@ template <class _Type>
565567
struct __allocator_has_trivial_copy_construct<allocator<_Type>, _Type> : true_type {};
566568

567569
template <class _Alloc,
568-
class _Type,
569-
class _RawType = __remove_const_t<_Type>,
570+
class _In,
571+
class _RawTypeIn = __remove_const_t<_In>,
572+
class _Out,
570573
__enable_if_t<
571-
// using _RawType because of the allocator<T const> extension
572-
is_trivially_copy_constructible<_RawType>::value && is_trivially_copy_assignable<_RawType>::value &&
573-
__allocator_has_trivial_copy_construct<_Alloc, _RawType>::value>* = nullptr>
574-
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Type*
575-
__uninitialized_allocator_copy(_Alloc&, const _Type* __first1, const _Type* __last1, _Type* __first2) {
574+
// using _RawTypeIn because of the allocator<T const> extension
575+
is_trivially_copy_constructible<_RawTypeIn>::value && is_trivially_copy_assignable<_RawTypeIn>::value &&
576+
is_same<__remove_cv_t<_In>, __remove_cv_t<_Out> >::value &&
577+
__allocator_has_trivial_copy_construct<_Alloc, _RawTypeIn>::value>* = nullptr>
578+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Out*
579+
__uninitialized_allocator_copy_impl(_Alloc&, _In* __first1, _In* __last1, _Out* __first2) {
576580
// TODO: Remove the const_cast once we drop support for std::allocator<T const>
577581
if (__libcpp_is_constant_evaluated()) {
578582
while (__first1 != __last1) {
@@ -582,10 +586,17 @@ __uninitialized_allocator_copy(_Alloc&, const _Type* __first1, const _Type* __la
582586
}
583587
return __first2;
584588
} else {
585-
return std::copy(__first1, __last1, const_cast<_RawType*>(__first2));
589+
return std::copy(__first1, __last1, const_cast<_RawTypeIn*>(__first2));
586590
}
587591
}
588592

593+
// Entry point for allocator-aware uninitialized copy.
// Unwraps the input range [__first1, __last1) and the output iterator
// __first2, forwards the unwrapped values to
// __uninitialized_allocator_copy_impl (so overload resolution there sees the
// unwrapped iterator types), and converts the returned position back into an
// _Iter2 before returning it.
template <class _Alloc, class _Iter1, class _Sent1, class _Iter2>
594+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter2 __uninitialized_allocator_copy(_Alloc& __alloc, _Iter1 __first1, _Sent1 __last1, _Iter2 __first2) {
595+
// __unwrapped_range.first / .second are the unwrapped begin / end of the input.
auto __unwrapped_range = std::__unwrap_range(__first1, __last1);
596+
auto __result = std::__uninitialized_allocator_copy_impl(__alloc, __unwrapped_range.first, __unwrapped_range.second, std::__unwrap_iter(__first2));
597+
// Re-wrap the result using the original __first2.
return std::__rewrap_iter(__first2, __result);
598+
}
599+
589600
// Move-construct the elements [__first1, __last1) into [__first2, __first2 + N)
590601
// if the move constructor is noexcept, where N is distance(__first1, __last1).
591602
//

0 commit comments

Comments
 (0)