Skip to content

Commit 4173a6e

Browse files
committed
Optimize ranges::copy_backward for vector<bool>::iterator
1 parent 6e3631d commit 4173a6e

File tree

8 files changed

+326
-198
lines changed

8 files changed

+326
-198
lines changed

libcxx/docs/ReleaseNotes/20.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ Improvements and New Features
7373
optimized, resulting in a performance improvement of up to 2x for trivial element types (e.g., `std::vector<int>`),
7474
and up to 3.4x for non-trivial element types (e.g., `std::vector<std::vector<int>>`).
7575

76+
- The ``std::ranges::copy_backward`` algorithm has been optimized for ``std::vector<bool>::iterator``\s, resulting in
77+
a performance improvement of up to 2000x.
78+
7679
Deprecations and Removals
7780
-------------------------
7881

libcxx/include/__algorithm/copy_backward.h

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@
1010
#define _LIBCPP___ALGORITHM_COPY_BACKWARD_H
1111

1212
#include <__algorithm/copy_move_common.h>
13+
#include <__algorithm/copy_n.h>
1314
#include <__algorithm/iterator_operations.h>
1415
#include <__algorithm/min.h>
1516
#include <__config>
17+
#include <__fwd/bit_reference.h>
1618
#include <__iterator/iterator_traits.h>
1719
#include <__iterator/segmented_iterator.h>
20+
#include <__memory/pointer_traits.h>
1821
#include <__type_traits/common_type.h>
1922
#include <__type_traits/enable_if.h>
2023
#include <__type_traits/is_constructible.h>
@@ -34,6 +37,124 @@ template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter>
3437
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter>
3538
__copy_backward(_InIter __first, _Sent __last, _OutIter __result);
3639

40+
// Copies the bits of [__first, __last) backward into the range that ends at __result, one storage
// word at a time. The __bit_iterator overload of __copy_backward_impl::operator() calls this when
// __last.__ctz_ == __result.__ctz_, i.e. when source end and destination end sit at the same bit
// offset inside their words, so source bits can be written without shifting.
// Returns __result as updated while copying.
// NOTE(review): the masks are built from ~__storage_type(0); if __storage_type can be narrower than
// int, that expression is promoted to int before shifting - confirm the masks are still right then.
template <class _Cp, bool _IsConst>
41+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
42+
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
43+
using _In = __bit_iterator<_Cp, _IsConst>;
44+
using difference_type = typename _In::difference_type;
45+
using __storage_type = typename _In::__storage_type;
46+
47+
const int __bits_per_word = _In::__bits_per_word;
48+
// __n counts the bits that still have to be copied.
difference_type __n = __last - __first;
49+
if (__n > 0) {
50+
// do first word
// (the partially used word that __last points into; skipped when __last is at bit offset 0)
51+
if (__last.__ctz_ != 0) {
52+
// At most __last.__ctz_ bits of this word lie below __last.
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
53+
__n -= __dn;
54+
unsigned __clz = __bits_per_word - __last.__ctz_;
55+
// __m selects bit positions [__last.__ctz_ - __dn, __last.__ctz_).
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
56+
__storage_type __b = *__last.__seg_ & __m;
57+
// Replace only the masked bits of the destination word; the other bits are kept.
*__result.__seg_ &= ~__m;
58+
*__result.__seg_ |= __b;
59+
// Move the destination bit offset back by __dn, modulo the word size.
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
60+
// __last.__ctz_ = 0
61+
}
62+
// __last.__ctz_ == 0 || __n == 0
63+
// __result.__ctz_ == 0 || __n == 0
64+
// do middle words
// (__nw whole words are copied with a single std::copy_n after stepping both word pointers back)
65+
__storage_type __nw = __n / __bits_per_word;
66+
__result.__seg_ -= __nw;
67+
__last.__seg_ -= __nw;
68+
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
69+
__n -= __nw * __bits_per_word;
70+
// do last word
// (fewer than __bits_per_word bits remain; they are the top __n bits of the preceding word)
71+
if (__n > 0) {
72+
// __m selects bit positions [__bits_per_word - __n, __bits_per_word).
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
73+
__storage_type __b = *--__last.__seg_ & __m;
74+
*--__result.__seg_ &= ~__m;
75+
*__result.__seg_ |= __b;
76+
// Equivalent to __bits_per_word - __n here, since 0 < __n < __bits_per_word.
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
77+
}
78+
}
79+
return __result;
80+
}
81+
82+
// Copies the bits of [__first, __last) backward into the range that ends at __result when source
// end and destination end are at different bit offsets inside their words (the __bit_iterator
// overload of __copy_backward_impl::operator() calls this when __last.__ctz_ != __result.__ctz_).
// Every chunk taken from one source word may straddle two destination words, so it is shifted and
// written in up to two pieces. Returns __result as updated while copying.
// NOTE(review): the masks are built from ~__storage_type(0); if __storage_type can be narrower than
// int, that expression is promoted to int before shifting - confirm the masks are still right then.
template <class _Cp, bool _IsConst>
83+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
84+
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
85+
using _In = __bit_iterator<_Cp, _IsConst>;
86+
using difference_type = typename _In::difference_type;
87+
using __storage_type = typename _In::__storage_type;
88+
89+
const int __bits_per_word = _In::__bits_per_word;
90+
// __n counts the bits that still have to be copied.
difference_type __n = __last - __first;
91+
if (__n > 0) {
92+
// do first word
// (the partially used source word that __last points into)
93+
if (__last.__ctz_ != 0) {
94+
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
95+
__n -= __dn;
96+
unsigned __clz_l = __bits_per_word - __last.__ctz_;
97+
// __m selects source bit positions [__last.__ctz_ - __dn, __last.__ctz_).
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
98+
__storage_type __b = *__last.__seg_ & __m;
99+
unsigned __clz_r = __bits_per_word - __result.__ctz_;
100+
// __ddn: how many of the __dn bits fit below __result.__ctz_ in the current destination word.
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
101+
if (__ddn > 0) {
102+
// __m now selects destination bit positions [__result.__ctz_ - __ddn, __result.__ctz_).
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
103+
*__result.__seg_ &= ~__m;
104+
// Shift the source bits so that source offset __last.__ctz_ lines up with __result.__ctz_.
if (__result.__ctz_ > __last.__ctz_)
105+
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
106+
else
107+
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
108+
// Move the destination bit offset back by __ddn, modulo the word size.
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
109+
__dn -= __ddn;
110+
}
// Any bits left over go into the top of the preceding destination word.
111+
if (__dn > 0) {
112+
// __result.__ctz_ == 0
113+
--__result.__seg_;
114+
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
115+
// __m selects destination bit positions [__result.__ctz_, __bits_per_word).
__m = ~__storage_type(0) << __result.__ctz_;
116+
*__result.__seg_ &= ~__m;
117+
// After this, __last.__ctz_ is the offset of the lowest source bit taken from this word.
__last.__ctz_ -= __dn + __ddn;
118+
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
119+
}
120+
// __last.__ctz_ = 0
121+
}
122+
// __last.__ctz_ == 0 || __n == 0
123+
// __result.__ctz_ != 0 || __n == 0
124+
// do middle words
// (each full source word is split between the current and the preceding destination word)
125+
unsigned __clz_r = __bits_per_word - __result.__ctz_;
126+
// __m selects the low __result.__ctz_ bit positions of a destination word.
__storage_type __m = ~__storage_type(0) >> __clz_r;
127+
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
128+
__storage_type __b = *--__last.__seg_;
129+
// High part of __b fills the low __result.__ctz_ bits of the current destination word.
*__result.__seg_ &= ~__m;
130+
*__result.__seg_ |= __b >> __clz_r;
131+
// Low part of __b fills the bits at and above __result.__ctz_ of the preceding destination word.
*--__result.__seg_ &= __m;
132+
*__result.__seg_ |= __b << __result.__ctz_;
133+
}
134+
// do last word
// (fewer than __bits_per_word bits remain; they are the top __n bits of the preceding source word)
135+
if (__n > 0) {
136+
__m = ~__storage_type(0) << (__bits_per_word - __n);
137+
__storage_type __b = *--__last.__seg_ & __m;
138+
__clz_r = __bits_per_word - __result.__ctz_;
139+
// __dn: how many of the remaining bits fit below __result.__ctz_ in the current destination word.
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
140+
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
141+
*__result.__seg_ &= ~__m;
142+
// Shift so that the top of the source word lines up with __result.__ctz_.
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
143+
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
144+
__n -= __dn;
// Whatever did not fit goes into the top of the preceding destination word.
145+
if (__n > 0) {
146+
// __result.__ctz_ == 0
147+
--__result.__seg_;
148+
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
149+
__m = ~__storage_type(0) << __result.__ctz_;
150+
*__result.__seg_ &= ~__m;
151+
// __bits_per_word - __n - __dn is the offset of the lowest source bit still to be written.
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
152+
}
153+
}
154+
}
155+
return __result;
156+
}
157+
37158
template <class _AlgPolicy>
38159
struct __copy_backward_impl {
39160
template <class _InIter, class _Sent, class _OutIter>
@@ -107,6 +228,16 @@ struct __copy_backward_impl {
107228
}
108229
}
109230

231+
// Overload for __bit_iterator arguments. Picks the word-wise helper according to whether the
// source end and the destination end share the same bit offset (__ctz_) within their words, and
// returns a pair of the unchanged __last and the iterator returned by that helper.
template <class _Cp, bool _IsConst>
232+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
233+
operator()(__bit_iterator<_Cp, _IsConst> __first,
234+
__bit_iterator<_Cp, _IsConst> __last,
235+
__bit_iterator<_Cp, false> __result) {
236+
// Equal offsets: source bits can be stored without shifting.
if (__last.__ctz_ == __result.__ctz_)
237+
return std::make_pair(__last, std::__copy_backward_aligned(__first, __last, __result));
238+
// Different offsets: fall back to the shifting implementation.
return std::make_pair(__last, std::__copy_backward_unaligned(__first, __last, __result));
239+
}
240+
110241
// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
111242
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
112243
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>

libcxx/include/__bit_reference

Lines changed: 3 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#ifndef _LIBCPP___BIT_REFERENCE
1111
#define _LIBCPP___BIT_REFERENCE
1212

13+
#include <__algorithm/copy_backward.h>
1314
#include <__algorithm/copy_n.h>
1415
#include <__algorithm/min.h>
1516
#include <__bit/countr.h>
@@ -293,134 +294,6 @@ copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last
293294
return std::__copy_unaligned(__first, __last, __result);
294295
}
295296

296-
// copy_backward
297-
298-
template <class _Cp, bool _IsConst>
299-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
300-
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
301-
using _In = __bit_iterator<_Cp, _IsConst>;
302-
using difference_type = typename _In::difference_type;
303-
using __storage_type = typename _In::__storage_type;
304-
305-
const int __bits_per_word = _In::__bits_per_word;
306-
difference_type __n = __last - __first;
307-
if (__n > 0) {
308-
// do first word
309-
if (__last.__ctz_ != 0) {
310-
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
311-
__n -= __dn;
312-
unsigned __clz = __bits_per_word - __last.__ctz_;
313-
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
314-
__storage_type __b = *__last.__seg_ & __m;
315-
*__result.__seg_ &= ~__m;
316-
*__result.__seg_ |= __b;
317-
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
318-
// __last.__ctz_ = 0
319-
}
320-
// __last.__ctz_ == 0 || __n == 0
321-
// __result.__ctz_ == 0 || __n == 0
322-
// do middle words
323-
__storage_type __nw = __n / __bits_per_word;
324-
__result.__seg_ -= __nw;
325-
__last.__seg_ -= __nw;
326-
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
327-
__n -= __nw * __bits_per_word;
328-
// do last word
329-
if (__n > 0) {
330-
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
331-
__storage_type __b = *--__last.__seg_ & __m;
332-
*--__result.__seg_ &= ~__m;
333-
*__result.__seg_ |= __b;
334-
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
335-
}
336-
}
337-
return __result;
338-
}
339-
340-
template <class _Cp, bool _IsConst>
341-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
342-
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
343-
using _In = __bit_iterator<_Cp, _IsConst>;
344-
using difference_type = typename _In::difference_type;
345-
using __storage_type = typename _In::__storage_type;
346-
347-
const int __bits_per_word = _In::__bits_per_word;
348-
difference_type __n = __last - __first;
349-
if (__n > 0) {
350-
// do first word
351-
if (__last.__ctz_ != 0) {
352-
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
353-
__n -= __dn;
354-
unsigned __clz_l = __bits_per_word - __last.__ctz_;
355-
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
356-
__storage_type __b = *__last.__seg_ & __m;
357-
unsigned __clz_r = __bits_per_word - __result.__ctz_;
358-
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
359-
if (__ddn > 0) {
360-
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
361-
*__result.__seg_ &= ~__m;
362-
if (__result.__ctz_ > __last.__ctz_)
363-
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
364-
else
365-
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
366-
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
367-
__dn -= __ddn;
368-
}
369-
if (__dn > 0) {
370-
// __result.__ctz_ == 0
371-
--__result.__seg_;
372-
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
373-
__m = ~__storage_type(0) << __result.__ctz_;
374-
*__result.__seg_ &= ~__m;
375-
__last.__ctz_ -= __dn + __ddn;
376-
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
377-
}
378-
// __last.__ctz_ = 0
379-
}
380-
// __last.__ctz_ == 0 || __n == 0
381-
// __result.__ctz_ != 0 || __n == 0
382-
// do middle words
383-
unsigned __clz_r = __bits_per_word - __result.__ctz_;
384-
__storage_type __m = ~__storage_type(0) >> __clz_r;
385-
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
386-
__storage_type __b = *--__last.__seg_;
387-
*__result.__seg_ &= ~__m;
388-
*__result.__seg_ |= __b >> __clz_r;
389-
*--__result.__seg_ &= __m;
390-
*__result.__seg_ |= __b << __result.__ctz_;
391-
}
392-
// do last word
393-
if (__n > 0) {
394-
__m = ~__storage_type(0) << (__bits_per_word - __n);
395-
__storage_type __b = *--__last.__seg_ & __m;
396-
__clz_r = __bits_per_word - __result.__ctz_;
397-
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
398-
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
399-
*__result.__seg_ &= ~__m;
400-
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
401-
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
402-
__n -= __dn;
403-
if (__n > 0) {
404-
// __result.__ctz_ == 0
405-
--__result.__seg_;
406-
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
407-
__m = ~__storage_type(0) << __result.__ctz_;
408-
*__result.__seg_ &= ~__m;
409-
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
410-
}
411-
}
412-
}
413-
return __result;
414-
}
415-
416-
template <class _Cp, bool _IsConst>
417-
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> copy_backward(
418-
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
419-
if (__last.__ctz_ == __result.__ctz_)
420-
return std::__copy_backward_aligned(__first, __last, __result);
421-
return std::__copy_backward_unaligned(__first, __last, __result);
422-
}
423-
424297
// move
425298

426299
template <class _Cp, bool _IsConst>
@@ -983,9 +856,8 @@ private:
983856
template <class _Dp, bool _IC>
984857
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_unaligned(
985858
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
986-
template <class _Dp, bool _IC>
987-
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
988-
copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
859+
template <class _AlgPolicy>
860+
friend struct __copy_backward_impl;
989861
template <class _Cl, class _Cr>
990862
friend __bit_iterator<_Cr, false>
991863
__swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);

libcxx/include/__vector/vector_bool.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define _LIBCPP___VECTOR_VECTOR_BOOL_H
1111

1212
#include <__algorithm/copy.h>
13+
#include <__algorithm/copy_backward.h>
1314
#include <__algorithm/fill_n.h>
1415
#include <__algorithm/iterator_operations.h>
1516
#include <__algorithm/max.h>

libcxx/include/bitset

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ template <size_t N> struct hash<std::bitset<N>>;
129129
#if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
130130
# include <__cxx03/bitset>
131131
#else
132+
# include <__algorithm/copy.h>
133+
# include <__algorithm/copy_backward.h>
132134
# include <__algorithm/count.h>
133135
# include <__algorithm/fill.h>
134136
# include <__algorithm/fill_n.h>
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
10+
11+
#include <algorithm>
12+
#include <benchmark/benchmark.h>
13+
#include <vector>
14+
15+
// Benchmarks std::ranges::copy_backward from a std::vector<bool> of state.range() bits.
// aligned == true : the destination has n bits and the copy ends at out.end().
// aligned == false: the destination has n + 8 bits and the copy ends at out.end() - 4, so the
//                   destination end is 4 bits further from its vector's start than the source end.
static void bm_ranges_copy_backward(benchmark::State& state, bool aligned) {
16+
auto n = state.range();
17+
std::vector<bool> in(n, true);
18+
std::vector<bool> out(aligned ? n : n + 8);
19+
benchmark::DoNotOptimize(&in);
20+
auto dst = aligned ? out.end() : out.end() - 4;
21+
for (auto _ : state) {
22+
// Both the returned result and the output vector are passed to DoNotOptimize.
benchmark::DoNotOptimize(std::ranges::copy_backward(in, dst));
23+
benchmark::DoNotOptimize(&out);
24+
}
25+
}
26+
27+
// Benchmarks the iterator-pair std::copy_backward from a std::vector<bool> of state.range() bits.
// aligned == true : the destination has n bits and the copy ends at out.end().
// aligned == false: the destination has n + 8 bits and the copy ends at out.end() - 4, so the
//                   destination end is 4 bits further from its vector's start than the source end.
static void bm_copy_backward(benchmark::State& state, bool aligned) {
28+
auto n = state.range();
29+
std::vector<bool> in(n, true);
30+
std::vector<bool> out(aligned ? n : n + 8);
31+
benchmark::DoNotOptimize(&in);
32+
// The iterators are obtained once, outside the timed loop.
auto beg = in.begin();
33+
auto end = in.end();
34+
auto dst = aligned ? out.end() : out.end() - 4;
35+
for (auto _ : state) {
36+
benchmark::DoNotOptimize(std::copy_backward(beg, end, dst));
37+
benchmark::DoNotOptimize(&out);
38+
}
39+
}
40+
41+
// Single-argument wrappers that fix the `aligned` flag so each variant can be registered with BENCHMARK.
static void bm_ranges_copy_backward_aligned(benchmark::State& state) { bm_ranges_copy_backward(state, true); }
42+
static void bm_ranges_copy_backward_unaligned(benchmark::State& state) { bm_ranges_copy_backward(state, false); }
43+
44+
static void bm_copy_backward_aligned(benchmark::State& state) { bm_copy_backward(state, true); }
45+
static void bm_copy_backward_unaligned(benchmark::State& state) { bm_copy_backward(state, false); }
46+
47+
// Test std::ranges::copy_backward for vector<bool>::iterator
// (the aligned variant is registered with Range(8, 1 << 16) plus DenseRange(102400, 204800, 4096),
// unlike the other three, which use Range(8, 1 << 20))
48+
BENCHMARK(bm_ranges_copy_backward_aligned)->Range(8, 1 << 16)->DenseRange(102400, 204800, 4096);
49+
BENCHMARK(bm_ranges_copy_backward_unaligned)->Range(8, 1 << 20);
50+
51+
// Test the iterator-pair version of std::copy_backward for vector<bool>::iterator
52+
BENCHMARK(bm_copy_backward_aligned)->Range(8, 1 << 20);
53+
BENCHMARK(bm_copy_backward_unaligned)->Range(8, 1 << 20);
54+
55+
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)