Skip to content

[libc++] Optimize ranges::copy_backward for vector<bool>::iterator #121026

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libcxx/docs/ReleaseNotes/21.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Implemented Papers
Improvements and New Features
-----------------------------

- The ``std::ranges::copy`` and ``std::ranges::copy_n`` algorithms have been optimized for ``std::vector<bool>::iterator``\s,
- The ``std::ranges::{copy, copy_n, copy_backward}`` algorithms have been optimized for ``std::vector<bool>::iterator``\s,
resulting in a performance improvement of up to 2000x.


Expand Down
131 changes: 131 additions & 0 deletions libcxx/include/__algorithm/copy_backward.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
#define _LIBCPP___ALGORITHM_COPY_BACKWARD_H

#include <__algorithm/copy_move_common.h>
#include <__algorithm/copy_n.h>
#include <__algorithm/iterator_operations.h>
#include <__algorithm/min.h>
#include <__config>
#include <__fwd/bit_reference.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#include <__memory/pointer_traits.h>
#include <__type_traits/common_type.h>
#include <__type_traits/enable_if.h>
#include <__type_traits/is_constructible.h>
Expand All @@ -34,6 +37,124 @@ template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter>
__copy_backward(_InIter __first, _Sent __last, _OutIter __result);

template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__last.__ctz_ != 0) {
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
__n -= __dn;
unsigned __clz = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
__storage_type __b = *__last.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
// __last.__ctz_ = 0
}
// __last.__ctz_ == 0 || __n == 0
// __result.__ctz_ == 0 || __n == 0
// do middle words
__storage_type __nw = __n / __bits_per_word;
__result.__seg_ -= __nw;
__last.__seg_ -= __nw;
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
__n -= __nw * __bits_per_word;
// do last word
if (__n > 0) {
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
__storage_type __b = *--__last.__seg_ & __m;
*--__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
}
}
return __result;
}

template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__last.__ctz_ != 0) {
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
__n -= __dn;
unsigned __clz_l = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
__storage_type __b = *__last.__seg_ & __m;
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
if (__ddn > 0) {
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
*__result.__seg_ &= ~__m;
if (__result.__ctz_ > __last.__ctz_)
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
else
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
__dn -= __ddn;
}
if (__dn > 0) {
// __result.__ctz_ == 0
--__result.__seg_;
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
__m = ~__storage_type(0) << __result.__ctz_;
*__result.__seg_ &= ~__m;
__last.__ctz_ -= __dn + __ddn;
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
}
// __last.__ctz_ = 0
}
// __last.__ctz_ == 0 || __n == 0
// __result.__ctz_ != 0 || __n == 0
// do middle words
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __m = ~__storage_type(0) >> __clz_r;
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
__storage_type __b = *--__last.__seg_;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> __clz_r;
*--__result.__seg_ &= __m;
*__result.__seg_ |= __b << __result.__ctz_;
}
// do last word
if (__n > 0) {
__m = ~__storage_type(0) << (__bits_per_word - __n);
__storage_type __b = *--__last.__seg_ & __m;
__clz_r = __bits_per_word - __result.__ctz_;
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
__n -= __dn;
if (__n > 0) {
// __result.__ctz_ == 0
--__result.__seg_;
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
__m = ~__storage_type(0) << __result.__ctz_;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
}
}
}
return __result;
}

template <class _AlgPolicy>
struct __copy_backward_impl {
template <class _InIter, class _Sent, class _OutIter>
Expand Down Expand Up @@ -107,6 +228,16 @@ struct __copy_backward_impl {
}
}

template <class _Cp, bool _IsConst>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
operator()(__bit_iterator<_Cp, _IsConst> __first,
__bit_iterator<_Cp, _IsConst> __last,
__bit_iterator<_Cp, false> __result) {
if (__last.__ctz_ == __result.__ctz_)
return std::make_pair(__last, std::__copy_backward_aligned(__first, __last, __result));
return std::make_pair(__last, std::__copy_backward_unaligned(__first, __last, __result));
}

// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
Expand Down
134 changes: 3 additions & 131 deletions libcxx/include/__bit_reference
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#define _LIBCPP___BIT_REFERENCE

#include <__algorithm/copy.h>
#include <__algorithm/copy_backward.h>
#include <__algorithm/copy_n.h>
#include <__algorithm/min.h>
#include <__bit/countr.h>
Expand Down Expand Up @@ -185,134 +186,6 @@ private:
__mask_(__m) {}
};

// copy_backward

template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__last.__ctz_ != 0) {
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
__n -= __dn;
unsigned __clz = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
__storage_type __b = *__last.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
// __last.__ctz_ = 0
}
// __last.__ctz_ == 0 || __n == 0
// __result.__ctz_ == 0 || __n == 0
// do middle words
__storage_type __nw = __n / __bits_per_word;
__result.__seg_ -= __nw;
__last.__seg_ -= __nw;
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
__n -= __nw * __bits_per_word;
// do last word
if (__n > 0) {
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n);
__storage_type __b = *--__last.__seg_ & __m;
*--__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
}
}
return __result;
}

template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__last.__ctz_ != 0) {
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
__n -= __dn;
unsigned __clz_l = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
__storage_type __b = *__last.__seg_ & __m;
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
if (__ddn > 0) {
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
*__result.__seg_ &= ~__m;
if (__result.__ctz_ > __last.__ctz_)
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
else
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
__dn -= __ddn;
}
if (__dn > 0) {
// __result.__ctz_ == 0
--__result.__seg_;
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1));
__m = ~__storage_type(0) << __result.__ctz_;
*__result.__seg_ &= ~__m;
__last.__ctz_ -= __dn + __ddn;
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
}
// __last.__ctz_ = 0
}
// __last.__ctz_ == 0 || __n == 0
// __result.__ctz_ != 0 || __n == 0
// do middle words
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __m = ~__storage_type(0) >> __clz_r;
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
__storage_type __b = *--__last.__seg_;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> __clz_r;
*--__result.__seg_ &= __m;
*__result.__seg_ |= __b << __result.__ctz_;
}
// do last word
if (__n > 0) {
__m = ~__storage_type(0) << (__bits_per_word - __n);
__storage_type __b = *--__last.__seg_ & __m;
__clz_r = __bits_per_word - __result.__ctz_;
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_);
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
__n -= __dn;
if (__n > 0) {
// __result.__ctz_ == 0
--__result.__seg_;
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
__m = ~__storage_type(0) << __result.__ctz_;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
}
}
}
return __result;
}

template <class _Cp, bool _IsConst>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> copy_backward(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
if (__last.__ctz_ == __result.__ctz_)
return std::__copy_backward_aligned(__first, __last, __result);
return std::__copy_backward_unaligned(__first, __last, __result);
}

// move

template <class _Cp, bool _IsConst>
Expand Down Expand Up @@ -876,9 +749,8 @@ private:
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_unaligned(
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
template <class _AlgPolicy>
friend struct __copy_backward_impl;
template <class _Cl, class _Cr>
friend __bit_iterator<_Cr, false>
__swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
Expand Down
1 change: 1 addition & 0 deletions libcxx/include/__vector/vector_bool.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#define _LIBCPP___VECTOR_VECTOR_BOOL_H

#include <__algorithm/copy.h>
#include <__algorithm/copy_backward.h>
#include <__algorithm/fill_n.h>
#include <__algorithm/iterator_operations.h>
#include <__algorithm/max.h>
Expand Down
1 change: 1 addition & 0 deletions libcxx/include/bitset
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ template <size_t N> struct hash<std::bitset<N>>;
# include <__cxx03/bitset>
#else
# include <__algorithm/copy.h>
# include <__algorithm/copy_backward.h>
# include <__algorithm/count.h>
# include <__algorithm/fill.h>
# include <__algorithm/fill_n.h>
Expand Down
55 changes: 55 additions & 0 deletions libcxx/test/benchmarks/algorithms/copy_backward.bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20

#include <algorithm>
#include <benchmark/benchmark.h>
#include <vector>

static void bm_ranges_copy_backward_vb(benchmark::State& state, bool aligned) {
auto n = state.range();
std::vector<bool> in(n, true);
std::vector<bool> out(aligned ? n : n + 8);
benchmark::DoNotOptimize(&in);
auto dst = aligned ? out.end() : out.end() - 4;
for (auto _ : state) {
benchmark::DoNotOptimize(std::ranges::copy_backward(in, dst));
benchmark::DoNotOptimize(&out);
}
}

static void bm_copy_backward_vb(benchmark::State& state, bool aligned) {
auto n = state.range();
std::vector<bool> in(n, true);
std::vector<bool> out(aligned ? n : n + 8);
benchmark::DoNotOptimize(&in);
auto beg = in.begin();
auto end = in.end();
auto dst = aligned ? out.end() : out.end() - 4;
for (auto _ : state) {
benchmark::DoNotOptimize(std::copy_backward(beg, end, dst));
benchmark::DoNotOptimize(&out);
}
}

static void bm_ranges_copy_backward_vb_aligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, true); }
static void bm_ranges_copy_backward_vb_unaligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, false); }

static void bm_copy_backward_vb_aligned(benchmark::State& state) { bm_copy_backward_vb(state, true); }
static void bm_copy_backward_vb_unaligned(benchmark::State& state) { bm_copy_backward_vb(state, false); }

// Test std::ranges::copy_backward for vector<bool>::iterator
BENCHMARK(bm_ranges_copy_backward_vb_aligned)->Range(8, 1 << 16)->DenseRange(102400, 204800, 4096);
BENCHMARK(bm_ranges_copy_backward_vb_unaligned)->Range(8, 1 << 20);

// Test std::copy_backward for vector<bool>::iterator
BENCHMARK(bm_copy_backward_vb_aligned)->Range(8, 1 << 20);
BENCHMARK(bm_copy_backward_vb_unaligned)->Range(8, 1 << 20);

BENCHMARK_MAIN();
Loading