-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[libc++] Optimize ranges::copy{, _n} for vector<bool>::iterator #121013
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
578bf8b
to
6995237
Compare
@llvm/pr-subscribers-libcxx Author: Peng Liu (winner245) ChangesThis PR optimizes the performance of
ranges::copy
ranges::copy_n
Patch is 33.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121013.diff 9 Files Affected:
diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst
index c8a07fb8b73348..4cb2eff0688176 100644
--- a/libcxx/docs/ReleaseNotes/20.rst
+++ b/libcxx/docs/ReleaseNotes/20.rst
@@ -73,6 +73,9 @@ Improvements and New Features
optimized, resulting in a performance improvement of up to 2x for trivial element types (e.g., `std::vector<int>`),
and up to 3.4x for non-trivial element types (e.g., `std::vector<std::vector<int>>`).
+- The ``std::ranges::copy`` and ``std::ranges::copy_n`` algorithms have been optimized for ``std::vector<bool>::iterator``\s,
+ resulting in a performance improvement of up to 2000x.
+
Deprecations and Removals
-------------------------
diff --git a/libcxx/include/__algorithm/copy.h b/libcxx/include/__algorithm/copy.h
index 4f30b2050abbaf..745056fa9ec058 100644
--- a/libcxx/include/__algorithm/copy.h
+++ b/libcxx/include/__algorithm/copy.h
@@ -13,8 +13,10 @@
#include <__algorithm/for_each_segment.h>
#include <__algorithm/min.h>
#include <__config>
+#include <__fwd/bit_reference.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
+#include <__memory/pointer_traits.h>
#include <__type_traits/common_type.h>
#include <__type_traits/enable_if.h>
#include <__utility/move.h>
@@ -29,9 +31,129 @@ _LIBCPP_PUSH_MACROS
_LIBCPP_BEGIN_NAMESPACE_STD
+template <class _InputIterator, class _OutputIterator>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result);
+
template <class _InIter, class _Sent, class _OutIter>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter);
+template <class _Cp, bool _IsConst>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
+ __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
+ using _In = __bit_iterator<_Cp, _IsConst>;
+ using difference_type = typename _In::difference_type;
+ using __storage_type = typename _In::__storage_type;
+
+ const int __bits_per_word = _In::__bits_per_word;
+ difference_type __n = __last - __first;
+ if (__n > 0) {
+ // do first word
+ if (__first.__ctz_ != 0) {
+ unsigned __clz = __bits_per_word - __first.__ctz_;
+ difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
+ __n -= __dn;
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __storage_type __b = *__first.__seg_ & __m;
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b;
+ __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
+ __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
+ ++__first.__seg_;
+ // __first.__ctz_ = 0;
+ }
+ // __first.__ctz_ == 0;
+ // do middle words
+ __storage_type __nw = __n / __bits_per_word;
+ std::copy(std::__to_address(__first.__seg_),
+ std::__to_address(__first.__seg_ + __nw),
+ std::__to_address(__result.__seg_));
+ __n -= __nw * __bits_per_word;
+ __result.__seg_ += __nw;
+ // do last word
+ if (__n > 0) {
+ __first.__seg_ += __nw;
+ __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+ __storage_type __b = *__first.__seg_ & __m;
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b;
+ __result.__ctz_ = static_cast<unsigned>(__n);
+ }
+ }
+ return __result;
+}
+
+template <class _Cp, bool _IsConst>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
+ __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
+ using _In = __bit_iterator<_Cp, _IsConst>;
+ using difference_type = typename _In::difference_type;
+ using __storage_type = typename _In::__storage_type;
+
+ const int __bits_per_word = _In::__bits_per_word;
+ difference_type __n = __last - __first;
+ if (__n > 0) {
+ // do first word
+ if (__first.__ctz_ != 0) {
+ unsigned __clz_f = __bits_per_word - __first.__ctz_;
+ difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
+ __n -= __dn;
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+ __storage_type __b = *__first.__seg_ & __m;
+ unsigned __clz_r = __bits_per_word - __result.__ctz_;
+ __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
+ __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
+ *__result.__seg_ &= ~__m;
+ if (__result.__ctz_ > __first.__ctz_)
+ *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
+ else
+ *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_);
+ __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
+ __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
+ __dn -= __ddn;
+ if (__dn > 0) {
+ __m = ~__storage_type(0) >> (__bits_per_word - __dn);
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
+ __result.__ctz_ = static_cast<unsigned>(__dn);
+ }
+ ++__first.__seg_;
+ // __first.__ctz_ = 0;
+ }
+ // __first.__ctz_ == 0;
+ // do middle words
+ unsigned __clz_r = __bits_per_word - __result.__ctz_;
+ __storage_type __m = ~__storage_type(0) << __result.__ctz_;
+ for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
+ __storage_type __b = *__first.__seg_;
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b << __result.__ctz_;
+ ++__result.__seg_;
+ *__result.__seg_ &= __m;
+ *__result.__seg_ |= __b >> __clz_r;
+ }
+ // do last word
+ if (__n > 0) {
+ __m = ~__storage_type(0) >> (__bits_per_word - __n);
+ __storage_type __b = *__first.__seg_ & __m;
+ __storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
+ __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b << __result.__ctz_;
+ __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
+ __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
+ __n -= __dn;
+ if (__n > 0) {
+ __m = ~__storage_type(0) >> (__bits_per_word - __n);
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b >> __dn;
+ __result.__ctz_ = static_cast<unsigned>(__n);
+ }
+ }
+ }
+ return __result;
+}
+
struct __copy_impl {
template <class _InIter, class _Sent, class _OutIter>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
@@ -95,6 +217,16 @@ struct __copy_impl {
}
}
+ template <class _Cp, bool _IsConst>
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
+ operator()(__bit_iterator<_Cp, _IsConst> __first,
+ __bit_iterator<_Cp, _IsConst> __last,
+ __bit_iterator<_Cp, false> __result) {
+ if (__first.__ctz_ == __result.__ctz_)
+ return std::make_pair(__last, std::__copy_aligned(__first, __last, __result));
+ return std::make_pair(__last, std::__copy_unaligned(__first, __last, __result));
+ }
+
// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
@@ -110,7 +242,7 @@ __copy(_InIter __first, _Sent __last, _OutIter __result) {
}
template <class _InputIterator, class _OutputIterator>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result) {
return std::__copy(__first, __last, __result).second;
}
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 9fa24c98d493fd..b51ee1c58dc009 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -10,6 +10,7 @@
#ifndef _LIBCPP___BIT_REFERENCE
#define _LIBCPP___BIT_REFERENCE
+#include <__algorithm/copy.h>
#include <__algorithm/copy_n.h>
#include <__algorithm/min.h>
#include <__bit/countr.h>
@@ -22,6 +23,7 @@
#include <__memory/pointer_traits.h>
#include <__type_traits/conditional.h>
#include <__type_traits/is_constant_evaluated.h>
+#include <__utility/pair.h>
#include <__utility/swap.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -169,130 +171,6 @@ private:
__mask_(__m) {}
};
-// copy
-
-template <class _Cp, bool _IsConst>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
- __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
- using _In = __bit_iterator<_Cp, _IsConst>;
- using difference_type = typename _In::difference_type;
- using __storage_type = typename _In::__storage_type;
-
- const int __bits_per_word = _In::__bits_per_word;
- difference_type __n = __last - __first;
- if (__n > 0) {
- // do first word
- if (__first.__ctz_ != 0) {
- unsigned __clz = __bits_per_word - __first.__ctz_;
- difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
- __n -= __dn;
- __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
- __storage_type __b = *__first.__seg_ & __m;
- *__result.__seg_ &= ~__m;
- *__result.__seg_ |= __b;
- __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
- __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
- ++__first.__seg_;
- // __first.__ctz_ = 0;
- }
- // __first.__ctz_ == 0;
- // do middle words
- __storage_type __nw = __n / __bits_per_word;
- std::copy_n(std::__to_address(__first.__seg_), __nw, std::__to_address(__result.__seg_));
- __n -= __nw * __bits_per_word;
- __result.__seg_ += __nw;
- // do last word
- if (__n > 0) {
- __first.__seg_ += __nw;
- __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
- __storage_type __b = *__first.__seg_ & __m;
- *__result.__seg_ &= ~__m;
- *__result.__seg_ |= __b;
- __result.__ctz_ = static_cast<unsigned>(__n);
- }
- }
- return __result;
-}
-
-template <class _Cp, bool _IsConst>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
- __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
- using _In = __bit_iterator<_Cp, _IsConst>;
- using difference_type = typename _In::difference_type;
- using __storage_type = typename _In::__storage_type;
-
- const int __bits_per_word = _In::__bits_per_word;
- difference_type __n = __last - __first;
- if (__n > 0) {
- // do first word
- if (__first.__ctz_ != 0) {
- unsigned __clz_f = __bits_per_word - __first.__ctz_;
- difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
- __n -= __dn;
- __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
- __storage_type __b = *__first.__seg_ & __m;
- unsigned __clz_r = __bits_per_word - __result.__ctz_;
- __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
- __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
- *__result.__seg_ &= ~__m;
- if (__result.__ctz_ > __first.__ctz_)
- *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
- else
- *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_);
- __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
- __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
- __dn -= __ddn;
- if (__dn > 0) {
- __m = ~__storage_type(0) >> (__bits_per_word - __dn);
- *__result.__seg_ &= ~__m;
- *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
- __result.__ctz_ = static_cast<unsigned>(__dn);
- }
- ++__first.__seg_;
- // __first.__ctz_ = 0;
- }
- // __first.__ctz_ == 0;
- // do middle words
- unsigned __clz_r = __bits_per_word - __result.__ctz_;
- __storage_type __m = ~__storage_type(0) << __result.__ctz_;
- for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
- __storage_type __b = *__first.__seg_;
- *__result.__seg_ &= ~__m;
- *__result.__seg_ |= __b << __result.__ctz_;
- ++__result.__seg_;
- *__result.__seg_ &= __m;
- *__result.__seg_ |= __b >> __clz_r;
- }
- // do last word
- if (__n > 0) {
- __m = ~__storage_type(0) >> (__bits_per_word - __n);
- __storage_type __b = *__first.__seg_ & __m;
- __storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
- __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
- *__result.__seg_ &= ~__m;
- *__result.__seg_ |= __b << __result.__ctz_;
- __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
- __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
- __n -= __dn;
- if (__n > 0) {
- __m = ~__storage_type(0) >> (__bits_per_word - __n);
- *__result.__seg_ &= ~__m;
- *__result.__seg_ |= __b >> __dn;
- __result.__ctz_ = static_cast<unsigned>(__n);
- }
- }
- }
- return __result;
-}
-
-template <class _Cp, bool _IsConst>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
-copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
- if (__first.__ctz_ == __result.__ctz_)
- return std::__copy_aligned(__first, __last, __result);
- return std::__copy_unaligned(__first, __last, __result);
-}
-
// copy_backward
template <class _Cp, bool _IsConst>
@@ -975,8 +853,9 @@ private:
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_unaligned(
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
template <class _Dp, bool _IC>
- _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
- copy(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 friend pair<__bit_iterator<_Dp, _IC>, __bit_iterator<_Dp, false> >
+ __copy_impl::operator()(
+ __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_aligned(
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
diff --git a/libcxx/include/bitset b/libcxx/include/bitset
index 8b361824805571..bb09bc5415fb03 100644
--- a/libcxx/include/bitset
+++ b/libcxx/include/bitset
@@ -129,6 +129,8 @@ template <size_t N> struct hash<std::bitset<N>>;
#if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
# include <__cxx03/bitset>
#else
+# include <__algorithm/copy.h>
+# include <__algorithm/copy_backward.h>
# include <__algorithm/count.h>
# include <__algorithm/fill.h>
# include <__algorithm/fill_n.h>
diff --git a/libcxx/test/benchmarks/algorithms/copy.bench.cpp b/libcxx/test/benchmarks/algorithms/copy.bench.cpp
new file mode 100644
index 00000000000000..54006d5a72edea
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/copy.bench.cpp
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <vector>
+
+static void bm_ranges_copy(benchmark::State& state, bool aligned) {
+ auto n = state.range();
+ std::vector<bool> in(n, true);
+ std::vector<bool> out(aligned ? n : n + 8);
+ benchmark::DoNotOptimize(&in);
+ auto dst = aligned ? out.begin() : out.begin() + 4;
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(std::ranges::copy(in, dst));
+ benchmark::DoNotOptimize(&out);
+ }
+}
+
+static void bm_ranges_copy_n(benchmark::State& state, bool aligned) {
+ auto n = state.range();
+ std::vector<bool> in(n, true);
+ std::vector<bool> out(aligned ? n : n + 8);
+ benchmark::DoNotOptimize(&in);
+ auto src = in.begin();
+ auto dst = aligned ? out.begin() : out.begin() + 4;
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(std::ranges::copy_n(src, n, dst));
+ benchmark::DoNotOptimize(&out);
+ }
+}
+
+static void bm_copy(benchmark::State& state, bool aligned) {
+ auto n = state.range();
+ std::vector<bool> in(n, true);
+ std::vector<bool> out(aligned ? n : n + 8);
+ benchmark::DoNotOptimize(&in);
+ auto beg = in.begin();
+ auto end = in.end();
+ auto dst = aligned ? out.begin() : out.begin() + 4;
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(std::copy(beg, end, dst));
+ benchmark::DoNotOptimize(&out);
+ }
+}
+
+static void bm_copy_n(benchmark::State& state, bool aligned) {
+ auto n = state.range();
+ std::vector<bool> in(n, true);
+ std::vector<bool> out(aligned ? n : n + 8);
+ benchmark::DoNotOptimize(&in);
+ auto src = in.begin();
+ auto dst = aligned ? out.begin() : out.begin() + 4;
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(std::copy_n(src, n, dst));
+ benchmark::DoNotOptimize(&out);
+ }
+}
+
+static void bm_ranges_copy_aligned(benchmark::State& state) { bm_ranges_copy(state, true); }
+static void bm_ranges_copy_unaligned(benchmark::State& state) { bm_ranges_copy(state, false); }
+static void bm_ranges_copy_n_aligned(benchmark::State& state) { bm_ranges_copy_n(state, true); }
+static void bm_ranges_copy_n_unaligned(benchmark::State& state) { bm_ranges_copy_n(state, false); }
+
+static void bm_copy_aligned(benchmark::State& state) { bm_copy(state, true); }
+static void bm_copy_unaligned(benchmark::State& state) { bm_copy(state, false); }
+static void bm_copy_n_aligned(benchmark::State& state) { bm_copy_n(state, true); }
+static void bm_copy_n_unaligned(benchmark::State& state) { bm_copy_n(state, false); }
+
+// Test the range version of std::copy for vector<bool>::iterator
+BENCHMARK(bm_ranges_copy_aligned)->Range(8, 1 << 16)->DenseRange(102400, 204800, 4096);
+BENCHMARK(bm_ranges_copy_n_aligned)->Range(8, 1 << 20);
+BENCHMARK(bm_ranges_copy_unaligned)->Range(8, 1 << 20);
+BENCHMARK(bm_ranges_copy_n_unaligned)->Range(8, 1 << 20);
+
+// Test the iterator-pair version of std::copy for vector<bool>::iterator
+BENCHMARK(bm_copy_aligned)->Range(8, 1 << 20);
+BENCHMARK(bm_copy_n_aligned)->Range(8, 1 << 20);
+BENCHMARK(bm_copy_unaligned)->Range(8, 1 << 20);
+BENCHMARK(bm_copy_n_unaligned)->Range(8, 1 << 20);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
index b5f0a32b986a03..553e4c206ac08a 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
@@ -14,6 +14,7 @@
#include <algorithm>
#include <cassert>
+#include <vector>
#...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This almost LGTM, I'd like to see again but this is a great improvement. Thanks!
libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
Outdated
Show resolved
Hide resolved
23da6d9
to
87d762b
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This LGTM with comments applied!
libcxx/docs/ReleaseNotes/20.rst
Outdated
@@ -111,6 +111,9 @@ Improvements and New Features | |||
std::errc::not_a_directory``, or use ``err.default_error_condition()`` to map to an ``error_condition``, and then test | |||
its ``value()`` and ``category()``. | |||
|
|||
- The ``std::ranges::copy`` and ``std::ranges::copy_n`` algorithms have been optimized for ``std::vector<bool>::iterator``\s, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's move this to 21.rst
since the LLVM 20 release branch was just cut.
@@ -78,13 +99,21 @@ TEST_CONSTEXPR_CXX20 bool test() { | |||
assert(std::equal(a, a + 10, expected)); | |||
} | |||
|
|||
{ // Test vector<bool>::iterator optimization | |||
assert(test_vector_bool(8)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about testing with an odd-sized vector? I'm sure it doesn't matter for performance, but it could matter for correctness and we don't seem to have coverage for that right now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added more tests cases for odd-sized vector. Same as below.
test_copy_n<const int*, int*>(); | ||
|
||
{ // Test vector<bool>::iterator optimization | ||
assert(test_vector_bool(8)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same comment here for odd-sizes.
@@ -204,6 +227,16 @@ constexpr bool test() { | |||
} | |||
} | |||
|
|||
#if TEST_STD_VER >= 23 | |||
{ // Test vector<bool>::iterator optimization | |||
assert(test_vector_bool(8)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same.
87d762b
to
7cc72af
Compare
…121026) As a follow-up to #121013 (which focused on `std::ranges::copy`), this PR optimizes the performance of `std::ranges::copy_backward` for `vector<bool>::iterator`, addressing a subtask outlined in issue #64038. The optimizations yield performance improvements of up to 2000x for aligned copies and 60x for unaligned copies.
…#121109) As a follow-up to #121013 (which optimized `ranges::copy`) and #121026 (which optimized `ranges::copy_backward`), this PR enhances the performance of `std::ranges::{move, move_backward}` for `vector<bool>::iterator`, addressing a subtask outlined in issue #64038. The optimizations bring performance improvements analogous to those achieved for the `{copy, copy_backward}` algorithms: up to 2000x for aligned moves and 60x for unaligned moves. Moreover, comprehensive tests covering up to 4 storage words (256 bytes) with odd and even bit sizes are provided, which validate the proposed optimizations in this patch.
This PR optimizes the performance of
std::ranges::copy
andstd::ranges::copy_n
specifically forvector<bool>::iterator
, addressing a subtask outlined in issue #64038. The optimizations yield performance improvements of up to 2000x for aligned copies and 60x for unaligned copies. Additionally, new tests have been added to validate these enhancements.ranges::copy
ranges::copy_n