Skip to content

Commit 03028d1

Browse files
committed
Optimize ranges::copy_backward for vector<bool>::iterator
1 parent 1cbfac0 commit 03028d1

File tree

8 files changed

+321
-199
lines changed

8 files changed

+321
-199
lines changed

libcxx/docs/ReleaseNotes/21.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ Implemented Papers
4444
Improvements and New Features
4545
-----------------------------
4646

47-
- TODO
47+
- The ``std::ranges::copy_backward`` algorithm has been optimized for ``std::vector<bool>::iterator``\s, resulting in
48+
a performance improvement of up to 2000x.
4849

4950

5051
Deprecations and Removals

libcxx/include/__algorithm/copy_backward.h

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@
1010
#define _LIBCPP___ALGORITHM_COPY_BACKWARD_H
1111

1212
#include <__algorithm/copy_move_common.h>
13+
#include <__algorithm/copy_n.h>
1314
#include <__algorithm/iterator_operations.h>
1415
#include <__algorithm/min.h>
1516
#include <__config>
17+
#include <__fwd/bit_reference.h>
1618
#include <__iterator/iterator_traits.h>
1719
#include <__iterator/segmented_iterator.h>
20+
#include <__memory/pointer_traits.h>
1821
#include <__type_traits/common_type.h>
1922
#include <__type_traits/enable_if.h>
2023
#include <__type_traits/is_constructible.h>
@@ -34,6 +37,124 @@ template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter>
3437
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter>
3538
__copy_backward(_InIter __first, _Sent __last, _OutIter __result);
3639

40+
// Backward copy of the bit range [__first, __last) so that it ends at __result, for the case where the
// source end and the destination end sit at the same bit offset inside their words (the caller visible in
// this file only selects this function when __last.__ctz_ == __result.__ctz_).
// Works from the high end of the range towards the low end in three phases: a partial word at the end,
// a run of whole words, and a partial word at the beginning.
// Returns __result after it has been moved back to the first bit written.
template <class _Cp, bool _IsConst>
41+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
42+
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
43+
using _In = __bit_iterator<_Cp, _IsConst>;
44+
using difference_type = typename _In::difference_type;
45+
using __storage_type = typename _In::__storage_type;
46+
47+
const int __bits_per_word = _In::__bits_per_word;
48+
// __n is the number of bits still to be copied.
difference_type __n = __last - __first;
49+
if (__n > 0) {
50+
// do first word
51+
if (__last.__ctz_ != 0) {
52+
// __dn: how many bits of the partially covered end word belong to the range.
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
53+
__n -= __dn;
54+
unsigned __clz = __bits_per_word - __last.__ctz_;
55+
// __m selects bit positions [__last.__ctz_ - __dn, __last.__ctz_) of a word.
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
56+
__storage_type __b = *__last.__seg_ & __m;
57+
// Replace only the masked bits of the destination word; offsets match, so no shift is needed.
*__result.__seg_ &= ~__m;
58+
*__result.__seg_ |= __b;
59+
// Step the destination offset back by __dn bits, modulo the word size.
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
60+
// __last.__ctz_ = 0
61+
}
62+
// __last.__ctz_ == 0 || __n == 0
63+
// __result.__ctz_ == 0 || __n == 0
64+
// do middle words
65+
// __nw whole words remain; both word pointers are moved back first and the words are then copied
// with a single copy_n over the underlying storage.
__storage_type __nw = __n / __bits_per_word;
66+
__result.__seg_ -= __nw;
67+
__last.__seg_ -= __nw;
68+
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
69+
__n -= __nw * __bits_per_word;
70+
// do last word
71+
if (__n > 0) {
72+
// The remaining __n bits are the highest __n bits of the preceding word on both sides.
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
73+
__storage_type __b = *--__last.__seg_ & __m;
74+
*--__result.__seg_ &= ~__m;
75+
*__result.__seg_ |= __b;
76+
// Destination offset becomes __bits_per_word - __n (the lowest bit just written).
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
77+
}
78+
}
79+
return __result;
80+
}
81+
82+
// Backward copy of the bit range [__first, __last) so that it ends at __result, for the case where the
// source end and the destination end sit at different bit offsets inside their words (the caller visible
// in this file uses this function whenever __last.__ctz_ != __result.__ctz_).
// Because the offsets differ, each chunk of source bits may straddle two destination words and is written
// in up to two shifted pieces. The range is processed from its high end towards its low end: the partial
// end word, then whole source words, then the partial beginning word.
// Returns __result after it has been moved back to the first bit written.
template <class _Cp, bool _IsConst>
83+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
84+
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
85+
using _In = __bit_iterator<_Cp, _IsConst>;
86+
using difference_type = typename _In::difference_type;
87+
using __storage_type = typename _In::__storage_type;
88+
89+
const int __bits_per_word = _In::__bits_per_word;
90+
// __n is the number of bits still to be copied.
difference_type __n = __last - __first;
91+
if (__n > 0) {
92+
// do first word
93+
if (__last.__ctz_ != 0) {
94+
// __dn: how many bits of the partially covered source end word belong to the range.
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
95+
__n -= __dn;
96+
unsigned __clz_l = __bits_per_word - __last.__ctz_;
97+
// Extract source bits [__last.__ctz_ - __dn, __last.__ctz_) into __b (still at their source positions).
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
98+
__storage_type __b = *__last.__seg_ & __m;
99+
unsigned __clz_r = __bits_per_word - __result.__ctz_;
100+
// __ddn: how many of those bits fit below __result.__ctz_ in the current destination word.
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
101+
if (__ddn > 0) {
102+
// Clear destination bits [__result.__ctz_ - __ddn, __result.__ctz_) and fill them from __b,
// shifting by the difference between the two offsets.
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
103+
*__result.__seg_ &= ~__m;
104+
if (__result.__ctz_ > __last.__ctz_)
105+
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
106+
else
107+
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
108+
// Step the destination offset back by __ddn bits, modulo the word size.
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
109+
__dn -= __ddn;
110+
}
111+
if (__dn > 0) {
112+
// __result.__ctz_ == 0
113+
// The rest of the chunk goes into the highest __dn bits of the preceding destination word.
--__result.__seg_;
114+
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
115+
__m = ~__storage_type(0) << __result.__ctz_;
116+
*__result.__seg_ &= ~__m;
117+
// Move the source offset down to the lowest bit of the chunk so the shift below lines that
// bit up with the new destination offset.
__last.__ctz_ -= __dn + __ddn;
118+
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
119+
}
120+
// __last.__ctz_ = 0
121+
}
122+
// __last.__ctz_ == 0 || __n == 0
123+
// __result.__ctz_ != 0 || __n == 0
124+
// do middle words
125+
// __m selects the low __result.__ctz_ bits of a word (positions below the destination offset).
unsigned __clz_r = __bits_per_word - __result.__ctz_;
126+
__storage_type __m = ~__storage_type(0) >> __clz_r;
127+
// Each whole source word is split across two destination words; the destination offset is unchanged
// by a full-word step, so __m and __clz_r stay valid for the whole loop.
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
128+
__storage_type __b = *--__last.__seg_;
129+
// High part of the source word -> low bits of the current destination word.
*__result.__seg_ &= ~__m;
130+
*__result.__seg_ |= __b >> __clz_r;
131+
// Low part of the source word -> high bits of the preceding destination word.
*--__result.__seg_ &= __m;
132+
*__result.__seg_ |= __b << __result.__ctz_;
133+
}
134+
// do last word
135+
if (__n > 0) {
136+
// The remaining __n bits are the highest __n bits of the preceding source word.
__m = ~__storage_type(0) << (__bits_per_word - __n);
137+
__storage_type __b = *--__last.__seg_ & __m;
138+
__clz_r = __bits_per_word - __result.__ctz_;
139+
// __dn (a new variable, distinct from the one in the first-word branch): how many of the remaining
// bits fit below __result.__ctz_ in the current destination word.
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
140+
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
141+
*__result.__seg_ &= ~__m;
142+
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
143+
// Step the destination offset back by __dn bits, modulo the word size.
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
144+
__n -= __dn;
145+
if (__n > 0) {
146+
// __result.__ctz_ == 0
147+
// Whatever is left goes into the highest __n bits of the preceding destination word.
--__result.__seg_;
148+
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
149+
__m = ~__storage_type(0) << __result.__ctz_;
150+
*__result.__seg_ &= ~__m;
151+
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
152+
}
153+
}
154+
}
155+
return __result;
156+
}
157+
37158
template <class _AlgPolicy>
38159
struct __copy_backward_impl {
39160
template <class _InIter, class _Sent, class _OutIter>
@@ -107,6 +228,16 @@ struct __copy_backward_impl {
107228
}
108229
}
109230

231+
// Overload of the call operator for __bit_iterator arguments.
// Chooses the aligned routine when the source end and the destination end have the same bit offset
// (__ctz_) and the unaligned routine otherwise.
// Returns a pair of (__last, iterator returned by the chosen routine).
template <class _Cp, bool _IsConst>
232+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
233+
operator()(__bit_iterator<_Cp, _IsConst> __first,
234+
__bit_iterator<_Cp, _IsConst> __last,
235+
__bit_iterator<_Cp, false> __result) {
236+
// Equal offsets: bits can be moved without shifting.
if (__last.__ctz_ == __result.__ctz_)
237+
return std::make_pair(__last, std::__copy_backward_aligned(__first, __last, __result));
238+
// Different offsets: bits have to be shifted while they are moved.
return std::make_pair(__last, std::__copy_backward_unaligned(__first, __last, __result));
239+
}
240+
110241
// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
111242
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
112243
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>

libcxx/include/__bit_reference

Lines changed: 5 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#ifndef _LIBCPP___BIT_REFERENCE
1111
#define _LIBCPP___BIT_REFERENCE
1212

13+
#include <__algorithm/copy_backward.h>
1314
#include <__algorithm/copy_n.h>
1415
#include <__algorithm/min.h>
1516
#include <__bit/countr.h>
@@ -307,134 +308,6 @@ copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last
307308
return std::__copy_unaligned(__first, __last, __result);
308309
}
309310

310-
// copy_backward
311-
312-
// Copy of __copy_backward_aligned as it appears on the `-` side of this diff (every line in this span
// carries a `NNN-` marker).
// Backward copy of the bit range [__first, __last) ending at __result for equal source-end and
// destination-end bit offsets. Handles a partial end word, then whole words, then a partial beginning
// word, and returns __result moved back to the first bit written.
template <class _Cp, bool _IsConst>
313-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
314-
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
315-
using _In = __bit_iterator<_Cp, _IsConst>;
316-
using difference_type = typename _In::difference_type;
317-
using __storage_type = typename _In::__storage_type;
318-
319-
const int __bits_per_word = _In::__bits_per_word;
320-
// __n is the number of bits still to be copied.
difference_type __n = __last - __first;
321-
if (__n > 0) {
322-
// do first word
323-
if (__last.__ctz_ != 0) {
324-
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
325-
__n -= __dn;
326-
unsigned __clz = __bits_per_word - __last.__ctz_;
327-
// __m selects bit positions [__last.__ctz_ - __dn, __last.__ctz_) of a word.
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
328-
__storage_type __b = *__last.__seg_ & __m;
329-
*__result.__seg_ &= ~__m;
330-
*__result.__seg_ |= __b;
331-
// Step the destination offset back by __dn bits, modulo the word size.
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
332-
// __last.__ctz_ = 0
333-
}
334-
// __last.__ctz_ == 0 || __n == 0
335-
// __result.__ctz_ == 0 || __n == 0
336-
// do middle words
337-
// Whole words: move both word pointers back, then copy __nw words in one copy_n call.
__storage_type __nw = __n / __bits_per_word;
338-
__result.__seg_ -= __nw;
339-
__last.__seg_ -= __nw;
340-
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
341-
__n -= __nw * __bits_per_word;
342-
// do last word
343-
if (__n > 0) {
344-
// The remaining __n bits are the highest __n bits of the preceding word on both sides.
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
345-
__storage_type __b = *--__last.__seg_ & __m;
346-
*--__result.__seg_ &= ~__m;
347-
*__result.__seg_ |= __b;
348-
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
349-
}
350-
}
351-
return __result;
352-
}
353-
354-
// Copy of __copy_backward_unaligned as it appears on the `-` side of this diff (every line in this span
// carries a `NNN-` marker).
// Backward copy of the bit range [__first, __last) ending at __result for differing source-end and
// destination-end bit offsets. Each chunk of source bits may straddle two destination words and is
// written in up to two shifted pieces. Returns __result moved back to the first bit written.
template <class _Cp, bool _IsConst>
355-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
356-
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
357-
using _In = __bit_iterator<_Cp, _IsConst>;
358-
using difference_type = typename _In::difference_type;
359-
using __storage_type = typename _In::__storage_type;
360-
361-
const int __bits_per_word = _In::__bits_per_word;
362-
// __n is the number of bits still to be copied.
difference_type __n = __last - __first;
363-
if (__n > 0) {
364-
// do first word
365-
if (__last.__ctz_ != 0) {
366-
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
367-
__n -= __dn;
368-
unsigned __clz_l = __bits_per_word - __last.__ctz_;
369-
// Extract source bits [__last.__ctz_ - __dn, __last.__ctz_) into __b (still at their source positions).
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
370-
__storage_type __b = *__last.__seg_ & __m;
371-
unsigned __clz_r = __bits_per_word - __result.__ctz_;
372-
// __ddn: how many of those bits fit below __result.__ctz_ in the current destination word.
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
373-
if (__ddn > 0) {
374-
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
375-
*__result.__seg_ &= ~__m;
376-
if (__result.__ctz_ > __last.__ctz_)
377-
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
378-
else
379-
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
380-
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
381-
__dn -= __ddn;
382-
}
383-
if (__dn > 0) {
384-
// __result.__ctz_ == 0
385-
// The rest of the chunk goes into the highest __dn bits of the preceding destination word.
--__result.__seg_;
386-
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
387-
__m = ~__storage_type(0) << __result.__ctz_;
388-
*__result.__seg_ &= ~__m;
389-
__last.__ctz_ -= __dn + __ddn;
390-
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
391-
}
392-
// __last.__ctz_ = 0
393-
}
394-
// __last.__ctz_ == 0 || __n == 0
395-
// __result.__ctz_ != 0 || __n == 0
396-
// do middle words
397-
// __m selects the low __result.__ctz_ bits of a word (positions below the destination offset).
unsigned __clz_r = __bits_per_word - __result.__ctz_;
398-
__storage_type __m = ~__storage_type(0) >> __clz_r;
399-
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
400-
__storage_type __b = *--__last.__seg_;
401-
// High part of the source word -> low bits of the current destination word.
*__result.__seg_ &= ~__m;
402-
*__result.__seg_ |= __b >> __clz_r;
403-
// Low part of the source word -> high bits of the preceding destination word.
*--__result.__seg_ &= __m;
404-
*__result.__seg_ |= __b << __result.__ctz_;
405-
}
406-
// do last word
407-
if (__n > 0) {
408-
// The remaining __n bits are the highest __n bits of the preceding source word.
__m = ~__storage_type(0) << (__bits_per_word - __n);
409-
__storage_type __b = *--__last.__seg_ & __m;
410-
__clz_r = __bits_per_word - __result.__ctz_;
411-
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
412-
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
413-
*__result.__seg_ &= ~__m;
414-
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
415-
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
416-
__n -= __dn;
417-
if (__n > 0) {
418-
// __result.__ctz_ == 0
419-
// Whatever is left goes into the highest __n bits of the preceding destination word.
--__result.__seg_;
420-
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
421-
__m = ~__storage_type(0) << __result.__ctz_;
422-
*__result.__seg_ &= ~__m;
423-
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
424-
}
425-
}
426-
}
427-
return __result;
428-
}
429-
430-
// copy_backward overload for __bit_iterator as it appears on the `-` side of this diff (every line in
// this span carries a `NNN-` marker).
// Dispatches to the aligned routine when the source end and destination end share the same bit offset,
// and to the unaligned routine otherwise; returns the iterator produced by the chosen routine.
template <class _Cp, bool _IsConst>
431-
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> copy_backward(
432-
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
433-
if (__last.__ctz_ == __result.__ctz_)
434-
return std::__copy_backward_aligned(__first, __last, __result);
435-
return std::__copy_backward_unaligned(__first, __last, __result);
436-
}
437-
438311
// move
439312

440313
template <class _Cp, bool _IsConst>
@@ -997,9 +870,10 @@ private:
997870
template <class _Dp, bool _IC>
998871
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_unaligned(
999872
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
1000-
template <class _Dp, bool _IC>
1001-
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
1002-
copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
873+
// Note: dependent nested name specifier __copy_backward_impl<_AlgPolicy>::operator() for friend declaration
874+
// is not supported in clang. Thus, we use a friend declaration for the entire class.
875+
template <class _AlgPolicy>
876+
friend struct __copy_backward_impl;
1003877
template <class _Cl, class _Cr>
1004878
friend __bit_iterator<_Cr, false>
1005879
__swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);

libcxx/include/__vector/vector_bool.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define _LIBCPP___VECTOR_VECTOR_BOOL_H
1111

1212
#include <__algorithm/copy.h>
13+
#include <__algorithm/copy_backward.h>
1314
#include <__algorithm/fill_n.h>
1415
#include <__algorithm/iterator_operations.h>
1516
#include <__algorithm/max.h>

libcxx/include/bitset

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ template <size_t N> struct hash<std::bitset<N>>;
129129
#if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
130130
# include <__cxx03/bitset>
131131
#else
132+
# include <__algorithm/copy.h>
133+
# include <__algorithm/copy_backward.h>
132134
# include <__algorithm/count.h>
133135
# include <__algorithm/fill.h>
134136
# include <__algorithm/fill_n.h>
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
10+
11+
#include <algorithm>
12+
#include <benchmark/benchmark.h>
13+
#include <vector>
14+
15+
// Benchmark body for std::ranges::copy_backward on std::vector<bool>.
//   state   - benchmark state; state.range() supplies the number of bits n in the source vector.
//   aligned - when true the destination has n bits and the copy ends at out.end(); when false the
//             destination has n + 8 bits and the copy ends at out.end() - 4.
static void bm_ranges_copy_backward_vb(benchmark::State& state, bool aligned) {
16+
auto n = state.range();
17+
std::vector<bool> in(n, true);
18+
// The unaligned variant gets 8 extra bits so that ending 4 bits before out.end() still leaves room
// for n bits in front of the destination end.
std::vector<bool> out(aligned ? n : n + 8);
19+
benchmark::DoNotOptimize(&in);
20+
// NOTE(review): presumably the 4-bit shift is what makes the source-end and destination-end bit
// offsets differ in the unaligned case - confirm against the vector<bool> storage layout.
auto dst = aligned ? out.end() : out.end() - 4;
21+
for (auto _ : state) {
22+
benchmark::DoNotOptimize(std::ranges::copy_backward(in, dst));
23+
benchmark::DoNotOptimize(&out);
24+
}
25+
}
26+
27+
// Benchmark body for the iterator-based std::copy_backward on std::vector<bool>.
//   state   - benchmark state; state.range() supplies the number of bits n in the source vector.
//   aligned - when true the destination has n bits and the copy ends at out.end(); when false the
//             destination has n + 8 bits and the copy ends at out.end() - 4.
static void bm_copy_backward_vb(benchmark::State& state, bool aligned) {
28+
auto n = state.range();
29+
std::vector<bool> in(n, true);
30+
// The unaligned variant gets 8 extra bits so that ending 4 bits before out.end() still leaves room
// for n bits in front of the destination end.
std::vector<bool> out(aligned ? n : n + 8);
31+
benchmark::DoNotOptimize(&in);
32+
// Iterators are obtained once, outside the timed loop.
auto beg = in.begin();
33+
auto end = in.end();
34+
auto dst = aligned ? out.end() : out.end() - 4;
35+
for (auto _ : state) {
36+
benchmark::DoNotOptimize(std::copy_backward(beg, end, dst));
37+
benchmark::DoNotOptimize(&out);
38+
}
39+
}
40+
41+
// Single-argument wrappers that fix the `aligned` flag of the two benchmark bodies, giving each
// aligned/unaligned combination its own function name for registration.
static void bm_ranges_copy_backward_vb_aligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, true); }
42+
static void bm_ranges_copy_backward_vb_unaligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, false); }
43+
44+
static void bm_copy_backward_vb_aligned(benchmark::State& state) { bm_copy_backward_vb(state, true); }
45+
static void bm_copy_backward_vb_unaligned(benchmark::State& state) { bm_copy_backward_vb(state, false); }
46+
47+
// Test std::ranges::copy_backward for vector<bool>::iterator
48+
// The aligned ranges benchmark uses sizes between 8 and 1 << 16 plus a dense sweep from 102400 to
// 204800 in steps of 4096; the other three benchmarks use sizes between 8 and 1 << 20.
BENCHMARK(bm_ranges_copy_backward_vb_aligned)->Range(8, 1 << 16)->DenseRange(102400, 204800, 4096);
49+
BENCHMARK(bm_ranges_copy_backward_vb_unaligned)->Range(8, 1 << 20);
50+
51+
// Test std::copy_backward for vector<bool>::iterator
52+
BENCHMARK(bm_copy_backward_vb_aligned)->Range(8, 1 << 20);
53+
BENCHMARK(bm_copy_backward_vb_unaligned)->Range(8, 1 << 20);
54+
55+
// Expands to the program entry point supplied by the benchmark library.
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)