Skip to content

Commit edc3dc6

Browse files
authored
[libc++] Optimize ranges::copy_backward for vector<bool>::iterator (#121026)
As a follow-up to #121013 (which focused on `std::ranges::copy`), this PR optimizes the performance of `std::ranges::copy_backward` for `vector<bool>::iterator`, addressing a subtask outlined in issue #64038. The optimizations yield performance improvements of up to 2000x for aligned copies and 60x for unaligned copies.
1 parent e058c73 commit edc3dc6

File tree

8 files changed

+321
-199
lines changed

8 files changed

+321
-199
lines changed

libcxx/docs/ReleaseNotes/21.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ Implemented Papers
4444
Improvements and New Features
4545
-----------------------------
4646

47-
- The ``std::ranges::copy`` and ``std::ranges::copy_n`` algorithms have been optimized for ``std::vector<bool>::iterator``\s,
47+
- The ``std::ranges::{copy, copy_n, copy_backward}`` algorithms have been optimized for ``std::vector<bool>::iterator``\s,
4848
resulting in a performance improvement of up to 2000x.
4949

5050

libcxx/include/__algorithm/copy_backward.h

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@
1010
#define _LIBCPP___ALGORITHM_COPY_BACKWARD_H
1111

1212
#include <__algorithm/copy_move_common.h>
13+
#include <__algorithm/copy_n.h>
1314
#include <__algorithm/iterator_operations.h>
1415
#include <__algorithm/min.h>
1516
#include <__config>
17+
#include <__fwd/bit_reference.h>
1618
#include <__iterator/iterator_traits.h>
1719
#include <__iterator/segmented_iterator.h>
20+
#include <__memory/pointer_traits.h>
1821
#include <__type_traits/common_type.h>
1922
#include <__type_traits/enable_if.h>
2023
#include <__type_traits/is_constructible.h>
@@ -34,6 +37,124 @@ template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter>
3437
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter>
3538
__copy_backward(_InIter __first, _Sent __last, _OutIter __result);
3639

// Copies the bits of [__first, __last) backward so that the copy ends at __result, for the case where
// __last and __result have the same bit offset (__ctz_) inside their storage words. Because the offsets
// match, source bits land on the same bit positions of the destination words and no shifting is needed.
//
// The range is processed from its end towards its beginning in three steps: the partial word that
// __last points into, then whole words, then the partial word at the beginning of the range.
//
// Returns __result after it has been moved back to the first bit that was written.
40+
template <class _Cp, bool _IsConst>
41+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
42+
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
43+
using _In = __bit_iterator<_Cp, _IsConst>;
44+
using difference_type = typename _In::difference_type;
45+
using __storage_type = typename _In::__storage_type;
46+
47+
const int __bits_per_word = _In::__bits_per_word;
// Number of bits still to be copied.
48+
difference_type __n = __last - __first;
49+
if (__n > 0) {
50+
// do first word
51+
if (__last.__ctz_ != 0) {
// __dn: how many bits of the word __last points into belong to the range (at most the __last.__ctz_
// bits below __last, and never more than the whole range).
52+
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
53+
__n -= __dn;
54+
unsigned __clz = __bits_per_word - __last.__ctz_;
// Mask selecting bit positions [__last.__ctz_ - __dn, __last.__ctz_).
55+
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
56+
__storage_type __b = *__last.__seg_ & __m;
// Clear the masked destination bits, then merge in the source bits; bits outside the mask are kept.
57+
*__result.__seg_ &= ~__m;
58+
*__result.__seg_ |= __b;
// Move the destination offset back by __dn, modulo the word size.
59+
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
60+
// __last.__ctz_ = 0
61+
}
62+
// __last.__ctz_ == 0 || __n == 0
63+
// __result.__ctz_ == 0 || __n == 0
64+
// do middle words
// __nw whole words remain; both word pointers are moved back by __nw and the words are copied in one call.
65+
__storage_type __nw = __n / __bits_per_word;
66+
__result.__seg_ -= __nw;
67+
__last.__seg_ -= __nw;
68+
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
69+
__n -= __nw * __bits_per_word;
70+
// do last word
71+
if (__n > 0) {
// Fewer than a word's worth of bits remain: they are the top __n bits of the preceding word.
72+
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
73+
__storage_type __b = *--__last.__seg_ & __m;
74+
*--__result.__seg_ &= ~__m;
75+
*__result.__seg_ |= __b;
// Offset of the lowest bit written, i.e. __bits_per_word - __n.
76+
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
77+
}
78+
}
79+
return __result;
80+
}
81+
// Copies the bits of [__first, __last) backward so that the copy ends at __result, for the case where
// __last and __result have different bit offsets (__ctz_) inside their storage words. Source bits
// therefore have to be shifted, and the bits taken from one source word may be split across two
// neighbouring destination words.
//
// The range is processed from its end towards its beginning in three steps: the partial word that
// __last points into, then whole source words, then the partial word at the beginning of the range.
//
// Returns __result after it has been moved back to the first bit that was written.
82+
template <class _Cp, bool _IsConst>
83+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
84+
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
85+
using _In = __bit_iterator<_Cp, _IsConst>;
86+
using difference_type = typename _In::difference_type;
87+
using __storage_type = typename _In::__storage_type;
88+
89+
const int __bits_per_word = _In::__bits_per_word;
// Number of bits still to be copied.
90+
difference_type __n = __last - __first;
91+
if (__n > 0) {
92+
// do first word
93+
if (__last.__ctz_ != 0) {
// __dn: how many bits of the word __last points into belong to the range.
94+
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
95+
__n -= __dn;
96+
unsigned __clz_l = __bits_per_word - __last.__ctz_;
// Mask selecting source bit positions [__last.__ctz_ - __dn, __last.__ctz_).
97+
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
98+
__storage_type __b = *__last.__seg_ & __m;
99+
unsigned __clz_r = __bits_per_word - __result.__ctz_;
// __ddn: how many of those bits fit below __result.__ctz_ in the current destination word.
100+
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
101+
if (__ddn > 0) {
// Mask selecting destination bit positions [__result.__ctz_ - __ddn, __result.__ctz_).
102+
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
103+
*__result.__seg_ &= ~__m;
// Shift the source bits so that their upper end lines up with __result.__ctz_; the direction depends
// on which of the two offsets is larger.
104+
if (__result.__ctz_ > __last.__ctz_)
105+
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
106+
else
107+
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
// Move the destination offset back by __ddn, modulo the word size.
108+
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
109+
__dn -= __ddn;
110+
}
111+
if (__dn > 0) {
// The current destination word is exhausted; the remaining __dn bits go to the top of the previous one.
112+
// __result.__ctz_ == 0
113+
--__result.__seg_;
114+
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
115+
__m = ~__storage_type(0) << __result.__ctz_;
116+
*__result.__seg_ &= ~__m;
// __dn + __ddn is the original bit count of this step, so __last.__ctz_ moves back past every bit consumed.
117+
__last.__ctz_ -= __dn + __ddn;
118+
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
119+
}
120+
// __last.__ctz_ = 0
121+
}
122+
// __last.__ctz_ == 0 || __n == 0
123+
// __result.__ctz_ != 0 || __n == 0
// Nothing left to copy. Returning here matters: when __n == 0 the first-word step may have left
// __result.__ctz_ == 0, in which case __clz_r below would equal __bits_per_word and the right shift
// that builds the mask would have a shift count equal to the width of the type (undefined behaviour).
if (__n == 0)
return __result;
124+
// do middle words
125+
unsigned __clz_r = __bits_per_word - __result.__ctz_;
// Mask selecting the low __result.__ctz_ bits of a destination word.
126+
__storage_type __m = ~__storage_type(0) >> __clz_r;
127+
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
128+
__storage_type __b = *--__last.__seg_;
// The high bits of the source word fill the low __result.__ctz_ bits of the current destination word ...
129+
*__result.__seg_ &= ~__m;
130+
*__result.__seg_ |= __b >> __clz_r;
// ... and its remaining low bits fill the top of the previous destination word.
131+
*--__result.__seg_ &= __m;
132+
*__result.__seg_ |= __b << __result.__ctz_;
133+
}
134+
// do last word
135+
if (__n > 0) {
// Fewer than a word's worth of bits remain: they are the top __n bits of the preceding source word.
136+
__m = ~__storage_type(0) << (__bits_per_word - __n);
137+
__storage_type __b = *--__last.__seg_ & __m;
138+
__clz_r = __bits_per_word - __result.__ctz_;
// __dn: how many of the remaining bits fit below __result.__ctz_ in the current destination word.
139+
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
140+
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
141+
*__result.__seg_ &= ~__m;
142+
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
143+
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
144+
__n -= __dn;
145+
if (__n > 0) {
// The rest spills into the top of the previous destination word.
146+
// __result.__ctz_ == 0
147+
--__result.__seg_;
148+
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
149+
__m = ~__storage_type(0) << __result.__ctz_;
150+
*__result.__seg_ &= ~__m;
151+
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
152+
}
153+
}
154+
}
155+
return __result;
156+
}
157+
37158
template <class _AlgPolicy>
38159
struct __copy_backward_impl {
39160
template <class _InIter, class _Sent, class _OutIter>
@@ -107,6 +228,16 @@ struct __copy_backward_impl {
107228
}
108229
}
109230

// Call-operator overload for __bit_iterator arguments. It forwards to __copy_backward_aligned when
// __last and __result have the same bit offset (__ctz_) within their storage words, and to
// __copy_backward_unaligned otherwise. The returned pair holds __last and the iterator produced by
// the selected helper.
231+
template <class _Cp, bool _IsConst>
232+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
233+
operator()(__bit_iterator<_Cp, _IsConst> __first,
234+
__bit_iterator<_Cp, _IsConst> __last,
235+
__bit_iterator<_Cp, false> __result) {
// Equal offsets: source and destination bits occupy the same positions inside their words.
236+
if (__last.__ctz_ == __result.__ctz_)
237+
return std::make_pair(__last, std::__copy_backward_aligned(__first, __last, __result));
238+
return std::make_pair(__last, std::__copy_backward_unaligned(__first, __last, __result));
239+
}
240+
110241
// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
111242
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
112243
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>

libcxx/include/__bit_reference

Lines changed: 3 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#define _LIBCPP___BIT_REFERENCE
1212

1313
#include <__algorithm/copy.h>
14+
#include <__algorithm/copy_backward.h>
1415
#include <__algorithm/copy_n.h>
1516
#include <__algorithm/min.h>
1617
#include <__bit/countr.h>
@@ -185,134 +186,6 @@ private:
185186
__mask_(__m) {}
186187
};
187188

188-
// copy_backward
189-
190-
template <class _Cp, bool _IsConst>
191-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
192-
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
193-
using _In = __bit_iterator<_Cp, _IsConst>;
194-
using difference_type = typename _In::difference_type;
195-
using __storage_type = typename _In::__storage_type;
196-
197-
const int __bits_per_word = _In::__bits_per_word;
198-
difference_type __n = __last - __first;
199-
if (__n > 0) {
200-
// do first word
201-
if (__last.__ctz_ != 0) {
202-
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
203-
__n -= __dn;
204-
unsigned __clz = __bits_per_word - __last.__ctz_;
205-
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
206-
__storage_type __b = *__last.__seg_ & __m;
207-
*__result.__seg_ &= ~__m;
208-
*__result.__seg_ |= __b;
209-
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
210-
// __last.__ctz_ = 0
211-
}
212-
// __last.__ctz_ == 0 || __n == 0
213-
// __result.__ctz_ == 0 || __n == 0
214-
// do middle words
215-
__storage_type __nw = __n / __bits_per_word;
216-
__result.__seg_ -= __nw;
217-
__last.__seg_ -= __nw;
218-
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
219-
__n -= __nw * __bits_per_word;
220-
// do last word
221-
if (__n > 0) {
222-
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
223-
__storage_type __b = *--__last.__seg_ & __m;
224-
*--__result.__seg_ &= ~__m;
225-
*__result.__seg_ |= __b;
226-
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
227-
}
228-
}
229-
return __result;
230-
}
231-
232-
template <class _Cp, bool _IsConst>
233-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
234-
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
235-
using _In = __bit_iterator<_Cp, _IsConst>;
236-
using difference_type = typename _In::difference_type;
237-
using __storage_type = typename _In::__storage_type;
238-
239-
const int __bits_per_word = _In::__bits_per_word;
240-
difference_type __n = __last - __first;
241-
if (__n > 0) {
242-
// do first word
243-
if (__last.__ctz_ != 0) {
244-
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
245-
__n -= __dn;
246-
unsigned __clz_l = __bits_per_word - __last.__ctz_;
247-
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
248-
__storage_type __b = *__last.__seg_ & __m;
249-
unsigned __clz_r = __bits_per_word - __result.__ctz_;
250-
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
251-
if (__ddn > 0) {
252-
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
253-
*__result.__seg_ &= ~__m;
254-
if (__result.__ctz_ > __last.__ctz_)
255-
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
256-
else
257-
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
258-
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
259-
__dn -= __ddn;
260-
}
261-
if (__dn > 0) {
262-
// __result.__ctz_ == 0
263-
--__result.__seg_;
264-
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
265-
__m = ~__storage_type(0) << __result.__ctz_;
266-
*__result.__seg_ &= ~__m;
267-
__last.__ctz_ -= __dn + __ddn;
268-
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
269-
}
270-
// __last.__ctz_ = 0
271-
}
272-
// __last.__ctz_ == 0 || __n == 0
273-
// __result.__ctz_ != 0 || __n == 0
274-
// do middle words
275-
unsigned __clz_r = __bits_per_word - __result.__ctz_;
276-
__storage_type __m = ~__storage_type(0) >> __clz_r;
277-
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
278-
__storage_type __b = *--__last.__seg_;
279-
*__result.__seg_ &= ~__m;
280-
*__result.__seg_ |= __b >> __clz_r;
281-
*--__result.__seg_ &= __m;
282-
*__result.__seg_ |= __b << __result.__ctz_;
283-
}
284-
// do last word
285-
if (__n > 0) {
286-
__m = ~__storage_type(0) << (__bits_per_word - __n);
287-
__storage_type __b = *--__last.__seg_ & __m;
288-
__clz_r = __bits_per_word - __result.__ctz_;
289-
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
290-
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
291-
*__result.__seg_ &= ~__m;
292-
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
293-
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
294-
__n -= __dn;
295-
if (__n > 0) {
296-
// __result.__ctz_ == 0
297-
--__result.__seg_;
298-
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
299-
__m = ~__storage_type(0) << __result.__ctz_;
300-
*__result.__seg_ &= ~__m;
301-
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
302-
}
303-
}
304-
}
305-
return __result;
306-
}
307-
308-
template <class _Cp, bool _IsConst>
309-
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> copy_backward(
310-
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
311-
if (__last.__ctz_ == __result.__ctz_)
312-
return std::__copy_backward_aligned(__first, __last, __result);
313-
return std::__copy_backward_unaligned(__first, __last, __result);
314-
}
315-
316189
// move
317190

318191
template <class _Cp, bool _IsConst>
@@ -876,9 +749,8 @@ private:
876749
template <class _Dp, bool _IC>
877750
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_unaligned(
878751
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
879-
template <class _Dp, bool _IC>
880-
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
881-
copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
752+
template <class _AlgPolicy>
753+
friend struct __copy_backward_impl;
882754
template <class _Cl, class _Cr>
883755
friend __bit_iterator<_Cr, false>
884756
__swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);

libcxx/include/__vector/vector_bool.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define _LIBCPP___VECTOR_VECTOR_BOOL_H
1111

1212
#include <__algorithm/copy.h>
13+
#include <__algorithm/copy_backward.h>
1314
#include <__algorithm/fill_n.h>
1415
#include <__algorithm/iterator_operations.h>
1516
#include <__algorithm/max.h>

libcxx/include/bitset

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ template <size_t N> struct hash<std::bitset<N>>;
130130
# include <__cxx03/bitset>
131131
#else
132132
# include <__algorithm/copy.h>
133+
# include <__algorithm/copy_backward.h>
133134
# include <__algorithm/count.h>
134135
# include <__algorithm/fill.h>
135136
# include <__algorithm/fill_n.h>
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
10+
11+
#include <algorithm>
12+
#include <benchmark/benchmark.h>
13+
#include <vector>
14+
// Benchmark body for std::ranges::copy_backward on std::vector<bool>.
// `state.range()` supplies the number of bits to copy. `aligned` selects the destination: the end of
// an n-element vector when true, or 4 positions before the end of an (n + 8)-element vector when false.
15+
static void bm_ranges_copy_backward_vb(benchmark::State& state, bool aligned) {
16+
auto n = state.range();
17+
std::vector<bool> in(n, true);
18+
std::vector<bool> out(aligned ? n : n + 8);
19+
benchmark::DoNotOptimize(&in);
// NOTE(review): the 4-position shift presumably puts the destination end at a different bit offset
// than the source end, so that the unaligned code path is measured — confirm.
20+
auto dst = aligned ? out.end() : out.end() - 4;
21+
for (auto _ : state) {
22+
benchmark::DoNotOptimize(std::ranges::copy_backward(in, dst));
23+
benchmark::DoNotOptimize(&out);
24+
}
25+
}
26+
// Benchmark body for the iterator-based std::copy_backward on std::vector<bool>.
// `state.range()` supplies the number of bits to copy. `aligned` selects the destination: the end of
// an n-element vector when true, or 4 positions before the end of an (n + 8)-element vector when false.
27+
static void bm_copy_backward_vb(benchmark::State& state, bool aligned) {
28+
auto n = state.range();
29+
std::vector<bool> in(n, true);
30+
std::vector<bool> out(aligned ? n : n + 8);
31+
benchmark::DoNotOptimize(&in);
// The source iterators are obtained once, outside the timed loop.
32+
auto beg = in.begin();
33+
auto end = in.end();
// NOTE(review): the 4-position shift presumably puts the destination end at a different bit offset
// than the source end, so that the unaligned code path is measured — confirm.
34+
auto dst = aligned ? out.end() : out.end() - 4;
35+
for (auto _ : state) {
36+
benchmark::DoNotOptimize(std::copy_backward(beg, end, dst));
37+
benchmark::DoNotOptimize(&out);
38+
}
39+
}
40+
// Single-argument wrappers that fix the `aligned` flag, so each variant can be passed to BENCHMARK.
41+
static void bm_ranges_copy_backward_vb_aligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, true); }
42+
static void bm_ranges_copy_backward_vb_unaligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, false); }
43+
44+
static void bm_copy_backward_vb_aligned(benchmark::State& state) { bm_copy_backward_vb(state, true); }
45+
static void bm_copy_backward_vb_unaligned(benchmark::State& state) { bm_copy_backward_vb(state, false); }
46+
47+
// Test std::ranges::copy_backward for vector<bool>::iterator
// The aligned variant is given an additional dense set of sizes from 102400 to 204800 in steps of 4096.
48+
BENCHMARK(bm_ranges_copy_backward_vb_aligned)->Range(8, 1 << 16)->DenseRange(102400, 204800, 4096);
49+
BENCHMARK(bm_ranges_copy_backward_vb_unaligned)->Range(8, 1 << 20);
50+
51+
// Test std::copy_backward for vector<bool>::iterator
52+
BENCHMARK(bm_copy_backward_vb_aligned)->Range(8, 1 << 20);
53+
BENCHMARK(bm_copy_backward_vb_unaligned)->Range(8, 1 << 20);
54+
55+
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)