Skip to content

[libc++] Optimize ranges::copy{, _n} for vector<bool>::iterator #121013

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion libcxx/docs/ReleaseNotes/21.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ Implemented Papers
Improvements and New Features
-----------------------------

- TODO
- The ``std::ranges::copy`` and ``std::ranges::copy_n`` algorithms have been optimized for ``std::vector<bool>::iterator``\s,
resulting in a performance improvement of up to 2000x.


Deprecations and Removals
Expand Down
134 changes: 133 additions & 1 deletion libcxx/include/__algorithm/copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
#include <__algorithm/for_each_segment.h>
#include <__algorithm/min.h>
#include <__config>
#include <__fwd/bit_reference.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#include <__memory/pointer_traits.h>
#include <__type_traits/common_type.h>
#include <__type_traits/enable_if.h>
#include <__utility/move.h>
Expand All @@ -29,9 +31,129 @@ _LIBCPP_PUSH_MACROS

_LIBCPP_BEGIN_NAMESPACE_STD

// Forward declaration of std::copy. __copy_aligned below calls std::copy on raw storage words before the
// definition of copy appears later in this header, so the name has to be declared up front.
template <class _InputIterator, class _OutputIterator>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result);

// Forward declaration of the internal __copy entry point that returns both the advanced input and output
// iterators as a pair.
// NOTE(review): presumably declared early for the same ordering reason as copy above -- confirm against the
// parts of the header that are not visible here.
template <class _InIter, class _Sent, class _OutIter>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter);

// Copies the bits in [__first, __last) to __result for the case where source and destination start at the
// same bit offset inside their storage words (the __copy_impl overload for __bit_iterator dispatches here
// when __first.__ctz_ == __result.__ctz_). Because the offsets match, no bit ever has to be shifted:
// partial words are merged through a mask and whole words are copied directly.
//
// Returns the bit iterator one past the last bit written. If the range is empty (or __last precedes
// __first) nothing is written and __result is returned unchanged.
template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
// __n tracks the number of bits that still have to be copied.
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
// Only needed when the source starts in the middle of a storage word.
if (__first.__ctz_ != 0) {
// __clz: number of bits from the starting offset up to the end of the first source word.
unsigned __clz = __bits_per_word - __first.__ctz_;
// __dn: how many of those bits actually belong to the range (the range may end inside this word).
difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
__n -= __dn;
// __m selects bit positions [__first.__ctz_, __first.__ctz_ + __dn) of a word.
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
__storage_type __b = *__first.__seg_ & __m;
// Replace exactly the selected bits in the destination word; all other destination bits are preserved.
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
// Advance the destination by __dn bits, carrying into the next word if the end of the word was reached.
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
++__first.__seg_;
// The source offset is intentionally not reset: from here on only __first.__seg_ is read.
// __first.__ctz_ = 0;
}
// __first.__ctz_ == 0;
// do middle words
// __nw whole storage words remain; they are copied word-by-word with std::copy on the addresses obtained
// from std::__to_address.
__storage_type __nw = __n / __bits_per_word;
std::copy(std::__to_address(__first.__seg_),
std::__to_address(__first.__seg_ + __nw),
std::__to_address(__result.__seg_));
__n -= __nw * __bits_per_word;
__result.__seg_ += __nw;
// do last word
// Fewer than __bits_per_word bits are left; merge the low __n bits of the next source word.
if (__n > 0) {
__first.__seg_ += __nw;
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
__storage_type __b = *__first.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
// Given the matching-offset precondition the destination sits on a word boundary here, so the new
// offset is simply the number of trailing bits written.
__result.__ctz_ = static_cast<unsigned>(__n);
}
}
return __result;
}

// Copies the bits in [__first, __last) to __result for the case where source and destination start at
// different bit offsets inside their storage words (the __copy_impl overload for __bit_iterator dispatches
// here when __first.__ctz_ != __result.__ctz_). Every chunk of source bits is shifted to the destination
// offset and may straddle two destination words.
//
// Returns the bit iterator one past the last bit written. If the range is empty (or __last precedes
// __first) nothing is written and __result is returned unchanged.
template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
// __n tracks the number of bits that still have to be copied.
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
// Only needed when the source starts in the middle of a storage word.
if (__first.__ctz_ != 0) {
// __clz_f: number of bits from the source offset up to the end of the first source word.
unsigned __clz_f = __bits_per_word - __first.__ctz_;
// __dn: how many of those bits belong to the range.
difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
__n -= __dn;
// __b holds the __dn source bits, still at their source positions [__first.__ctz_, __first.__ctz_ + __dn).
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
__storage_type __b = *__first.__seg_ & __m;
// __clz_r: room left in the current destination word; __ddn: how many of the __dn bits fit into it.
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
// Re-purpose __m as the destination mask: positions [__result.__ctz_, __result.__ctz_ + __ddn).
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
*__result.__seg_ &= ~__m;
// Shift the source bits so that the source offset lines up with the destination offset. Bits that do
// not fit into this destination word are written by the spill step below.
if (__result.__ctz_ > __first.__ctz_)
*__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
else
*__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_);
// Advance the destination by the __ddn bits just written.
__result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
__result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
__dn -= __ddn;
// Spill: the remaining __dn bits go to the low end of the next destination word. Shifting __b right by
// (__first.__ctz_ + __ddn) moves the not-yet-written bits down to bit 0.
if (__dn > 0) {
__m = ~__storage_type(0) >> (__bits_per_word - __dn);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
__result.__ctz_ = static_cast<unsigned>(__dn);
}
++__first.__seg_;
// The source offset is intentionally not reset: from here on only __first.__seg_ is read.
// __first.__ctz_ = 0;
}
// __first.__ctz_ == 0;
// do middle words
// Each full source word is split across two destination words: its low __clz_r bits fill the upper part
// of the current destination word and its remaining high bits fill the low part of the next one.
// __m covers the bit positions at and above __result.__ctz_.
// NOTE(review): the right shift by __clz_r assumes __result.__ctz_ != 0 at this point, which appears to
// follow from this function only being used when the two offsets differ -- confirm.
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __m = ~__storage_type(0) << __result.__ctz_;
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
__storage_type __b = *__first.__seg_;
// Upper part of the current destination word.
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << __result.__ctz_;
++__result.__seg_;
// Low part of the next destination word (bits below __result.__ctz_ are cleared, the rest kept).
*__result.__seg_ &= __m;
*__result.__seg_ |= __b >> __clz_r;
}
// do last word
// Fewer than __bits_per_word bits are left in the final source word.
if (__n > 0) {
// __b: the low __n bits of the final source word.
__m = ~__storage_type(0) >> (__bits_per_word - __n);
__storage_type __b = *__first.__seg_ & __m;
// __dn: how many of those bits fit into the current destination word.
__storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << __result.__ctz_;
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
__n -= __dn;
// Spill whatever did not fit into the low end of the next destination word.
if (__n > 0) {
__m = ~__storage_type(0) >> (__bits_per_word - __n);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> __dn;
__result.__ctz_ = static_cast<unsigned>(__n);
}
}
}
return __result;
}

struct __copy_impl {
template <class _InIter, class _Sent, class _OutIter>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
Expand Down Expand Up @@ -95,6 +217,16 @@ struct __copy_impl {
}
}

// Overload selected for __bit_iterator ranges. Picks the word-wise copy routine that matches the relative
// alignment of source and destination, then returns {__last, end of the written output}.
template <class _Cp, bool _IsConst>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
operator()(__bit_iterator<_Cp, _IsConst> __first,
           __bit_iterator<_Cp, _IsConst> __last,
           __bit_iterator<_Cp, false> __result) const {
  // Identical bit offsets allow the mask-only fast path; otherwise every word has to be shifted.
  const bool __same_offset = __first.__ctz_ == __result.__ctz_;
  __bit_iterator<_Cp, false> __out_end =
      __same_offset ? std::__copy_aligned(__first, __last, __result)
                    : std::__copy_unaligned(__first, __last, __result);
  return std::make_pair(__last, __out_end);
}

// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
Expand All @@ -110,7 +242,7 @@ __copy(_InIter __first, _Sent __last, _OutIter __result) {
}

// Public std::copy entry point: forwards to std::__copy, which returns a pair of the advanced input and
// output iterators, and hands back only the output iterator (.second).
// NOTE(review): the two declaration-specifier lines below are the removed (with `inline`) and added
// (without `inline`) sides of the same diff hunk; only one of them exists in the actual header.
template <class _InputIterator, class _OutputIterator>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result) {
return std::__copy(__first, __last, __result).second;
}
Expand Down
131 changes: 5 additions & 126 deletions libcxx/include/__bit_reference
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#ifndef _LIBCPP___BIT_REFERENCE
#define _LIBCPP___BIT_REFERENCE

#include <__algorithm/copy.h>
#include <__algorithm/copy_n.h>
#include <__algorithm/min.h>
#include <__bit/countr.h>
Expand All @@ -24,6 +25,7 @@
#include <__type_traits/conditional.h>
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/void_t.h>
#include <__utility/pair.h>
#include <__utility/swap.h>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
Expand Down Expand Up @@ -183,130 +185,6 @@ private:
__mask_(__m) {}
};

// copy

// Copies the bits in [__first, __last) to __result for the case where source and destination start at the
// same bit offset inside their storage words (the __bit_iterator overload of copy below dispatches here when
// __first.__ctz_ == __result.__ctz_). Because the offsets match, no bit ever has to be shifted: partial
// words are merged through a mask and whole words are copied directly.
//
// Returns the bit iterator one past the last bit written. If the range is empty (or __last precedes
// __first) nothing is written and __result is returned unchanged.
template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
// __n tracks the number of bits that still have to be copied.
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
// Only needed when the source starts in the middle of a storage word.
if (__first.__ctz_ != 0) {
// __clz: number of bits from the starting offset up to the end of the first source word.
unsigned __clz = __bits_per_word - __first.__ctz_;
// __dn: how many of those bits actually belong to the range (the range may end inside this word).
difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
__n -= __dn;
// __m selects bit positions [__first.__ctz_, __first.__ctz_ + __dn) of a word.
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
__storage_type __b = *__first.__seg_ & __m;
// Replace exactly the selected bits in the destination word; all other destination bits are preserved.
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
// Advance the destination by __dn bits, carrying into the next word if the end of the word was reached.
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
++__first.__seg_;
// The source offset is intentionally not reset: from here on only __first.__seg_ is read.
// __first.__ctz_ = 0;
}
// __first.__ctz_ == 0;
// do middle words
// __nw whole storage words remain; they are copied word-by-word with std::copy_n on the addresses obtained
// from std::__to_address.
__storage_type __nw = __n / __bits_per_word;
std::copy_n(std::__to_address(__first.__seg_), __nw, std::__to_address(__result.__seg_));
__n -= __nw * __bits_per_word;
__result.__seg_ += __nw;
// do last word
// Fewer than __bits_per_word bits are left; merge the low __n bits of the next source word.
if (__n > 0) {
__first.__seg_ += __nw;
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
__storage_type __b = *__first.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
// Given the matching-offset precondition the destination sits on a word boundary here, so the new
// offset is simply the number of trailing bits written.
__result.__ctz_ = static_cast<unsigned>(__n);
}
}
return __result;
}

// Copies the bits in [__first, __last) to __result for the case where source and destination start at
// different bit offsets inside their storage words (the __bit_iterator overload of copy below dispatches
// here when __first.__ctz_ != __result.__ctz_). Every chunk of source bits is shifted to the destination
// offset and may straddle two destination words.
//
// Returns the bit iterator one past the last bit written. If the range is empty (or __last precedes
// __first) nothing is written and __result is returned unchanged.
template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;

const int __bits_per_word = _In::__bits_per_word;
// __n tracks the number of bits that still have to be copied.
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
// Only needed when the source starts in the middle of a storage word.
if (__first.__ctz_ != 0) {
// __clz_f: number of bits from the source offset up to the end of the first source word.
unsigned __clz_f = __bits_per_word - __first.__ctz_;
// __dn: how many of those bits belong to the range.
difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
__n -= __dn;
// __b holds the __dn source bits, still at their source positions [__first.__ctz_, __first.__ctz_ + __dn).
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
__storage_type __b = *__first.__seg_ & __m;
// __clz_r: room left in the current destination word; __ddn: how many of the __dn bits fit into it.
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
// Re-purpose __m as the destination mask: positions [__result.__ctz_, __result.__ctz_ + __ddn).
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
*__result.__seg_ &= ~__m;
// Shift the source bits so that the source offset lines up with the destination offset. Bits that do
// not fit into this destination word are written by the spill step below.
if (__result.__ctz_ > __first.__ctz_)
*__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
else
*__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_);
// Advance the destination by the __ddn bits just written.
__result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
__result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
__dn -= __ddn;
// Spill: the remaining __dn bits go to the low end of the next destination word. Shifting __b right by
// (__first.__ctz_ + __ddn) moves the not-yet-written bits down to bit 0.
if (__dn > 0) {
__m = ~__storage_type(0) >> (__bits_per_word - __dn);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
__result.__ctz_ = static_cast<unsigned>(__dn);
}
++__first.__seg_;
// The source offset is intentionally not reset: from here on only __first.__seg_ is read.
// __first.__ctz_ = 0;
}
// __first.__ctz_ == 0;
// do middle words
// Each full source word is split across two destination words: its low __clz_r bits fill the upper part
// of the current destination word and its remaining high bits fill the low part of the next one.
// __m covers the bit positions at and above __result.__ctz_.
// NOTE(review): the right shift by __clz_r assumes __result.__ctz_ != 0 at this point, which appears to
// follow from this function only being used when the two offsets differ -- confirm.
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __m = ~__storage_type(0) << __result.__ctz_;
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
__storage_type __b = *__first.__seg_;
// Upper part of the current destination word.
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << __result.__ctz_;
++__result.__seg_;
// Low part of the next destination word (bits below __result.__ctz_ are cleared, the rest kept).
*__result.__seg_ &= __m;
*__result.__seg_ |= __b >> __clz_r;
}
// do last word
// Fewer than __bits_per_word bits are left in the final source word.
if (__n > 0) {
// __b: the low __n bits of the final source word.
__m = ~__storage_type(0) >> (__bits_per_word - __n);
__storage_type __b = *__first.__seg_ & __m;
// __dn: how many of those bits fit into the current destination word.
__storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << __result.__ctz_;
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
__n -= __dn;
// Spill whatever did not fit into the low end of the next destination word.
if (__n > 0) {
__m = ~__storage_type(0) >> (__bits_per_word - __n);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> __dn;
__result.__ctz_ = static_cast<unsigned>(__n);
}
}
}
return __result;
}

// std::copy overload for __bit_iterator ranges: chooses the aligned routine when source and destination
// share the same bit offset within their storage words, and the shifting (unaligned) routine otherwise.
// Returns the bit iterator one past the last bit written.
template <class _Cp, bool _IsConst>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
  return __first.__ctz_ == __result.__ctz_
           ? std::__copy_aligned(__first, __last, __result)
           : std::__copy_unaligned(__first, __last, __result);
}

// copy_backward

template <class _Cp, bool _IsConst>
Expand Down Expand Up @@ -989,8 +867,9 @@ private:
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_unaligned(
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
copy(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend pair<__bit_iterator<_Dp, _IC>, __bit_iterator<_Dp, false> >
__copy_impl::operator()(
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result) const;
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_aligned(
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
Expand Down
1 change: 1 addition & 0 deletions libcxx/include/bitset
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ template <size_t N> struct hash<std::bitset<N>>;
#if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
# include <__cxx03/bitset>
#else
# include <__algorithm/copy.h>
# include <__algorithm/count.h>
# include <__algorithm/fill.h>
# include <__algorithm/fill_n.h>
Expand Down
Loading