Skip to content

Commit 5251bb2

Browse files
committed
[libc++] Vectorize mismatch
1 parent 6a9f6de commit 5251bb2

File tree

14 files changed

+863
-67
lines changed

14 files changed

+863
-67
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <algorithm>
10+
#include <benchmark/benchmark.h>
11+
#include <random>
12+
13+
template <class T>
14+
static void bm_find(benchmark::State& state) {
15+
std::vector<T> vec1(state.range(), '1');
16+
std::vector<T> vec2(state.range(), '1');
17+
std::mt19937_64 rng(std::random_device{}());
18+
19+
for (auto _ : state) {
20+
auto idx = rng() % vec1.size();
21+
vec1[idx] = '2';
22+
benchmark::DoNotOptimize(vec1);
23+
benchmark::DoNotOptimize(std::mismatch(vec1.begin(), vec1.end(), vec2.begin()));
24+
vec1[idx] = '1';
25+
}
26+
}
27+
// Register bm_find for char, short and int elements. Every instantiation uses
// the same argument set: DenseRange(1, 8) covers each size from 1 through 8,
// and Range(16, 1 << 20) adds larger sizes from 16 up to 2^20.
BENCHMARK(bm_find<char>)->DenseRange(1, 8)->Range(16, 1 << 20);
28+
BENCHMARK(bm_find<short>)->DenseRange(1, 8)->Range(16, 1 << 20);
29+
BENCHMARK(bm_find<int>)->DenseRange(1, 8)->Range(16, 1 << 20);
30+
31+
// Google Benchmark macro that supplies the program entry point.
BENCHMARK_MAIN();

libcxx/include/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ set(files
229229
__algorithm/unwrap_iter.h
230230
__algorithm/unwrap_range.h
231231
__algorithm/upper_bound.h
232+
__algorithm/vectorization.h
232233
__assert
233234
__atomic/aliases.h
234235
__atomic/atomic.h

libcxx/include/__algorithm/mismatch.h

Lines changed: 153 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,171 @@
1111
#define _LIBCPP___ALGORITHM_MISMATCH_H
1212

1313
#include <__algorithm/comp.h>
14+
#include <__algorithm/unwrap_iter.h>
15+
#include <__algorithm/vectorization.h>
1416
#include <__config>
17+
#include <__functional/identity.h>
1518
#include <__iterator/iterator_traits.h>
19+
#include <__type_traits/invoke.h>
20+
#include <__type_traits/is_equality_comparable.h>
21+
#include <__utility/align_down.h>
22+
#include <__utility/move.h>
1623
#include <__utility/pair.h>
24+
#include <experimental/__simd/feature_traits.h>
25+
#include <experimental/__simd/simd.h>
26+
#include <experimental/__simd/simd_mask.h>
1727

1828
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
1929
# pragma GCC system_header
2030
#endif
2131

2232
_LIBCPP_BEGIN_NAMESPACE_STD
2333

34+
template <class _InIter1, class _Sent1, class _InIter2, class _Pred, class _Proj1, class _Proj2>
35+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter1, _InIter2>
36+
__mismatch_loop(_InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Pred __pred, _Proj1 __proj1, _Proj2 __proj2) {
37+
while (__first1 != __last1) {
38+
if (!std::__invoke(__pred, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2)))
39+
break;
40+
++__first1;
41+
++__first2;
42+
}
43+
return {std::move(__first1), std::move(__first2)};
44+
}
45+
46+
#if _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
47+
// SIMD building blocks for mismatch over two contiguous ranges of _Tp.
//
// The work is split into three static steps that all take
// (__first1, __last1, __first2):
//   * __prologue - compares a leading partial vector using masked loads,
//   * __loop     - compares __unroll_count full vectors per iteration,
//   * __epilogue - handles the elements that are left after __loop.
template <class _Tp>
48+
struct __mismatch_vector_impl {
49+
// True when ranges of _Tp may be compared with this implementation: either
// _Tp is trivially equality comparable, has a size accepted by
// __fits_in_vector and is at least as aligned as the arithmetic type it is
// loaded as, or _Tp is a floating-point type and the caller opted in through
// _VectorizeFloatingPoint.
template <bool _VectorizeFloatingPoint>
50+
static constexpr bool __can_vectorize =
51+
(__libcpp_is_trivially_equality_comparable<_Tp, _Tp>::value && __fits_in_vector<_Tp> &&
52+
alignof(_Tp) >= alignof(__get_arithmetic_type<_Tp>)) ||
53+
(_VectorizeFloatingPoint && is_floating_point_v<_Tp>);
54+
55+
using __vec = __arithmetic_vec<_Tp>;
56+
using __mask_traits = experimental::__mask_traits<typename __vec::value_type, typename __vec::abi_type>;
// Number of vectors loaded from each range per iteration of __loop.
57+
static constexpr size_t __unroll_count = 4;
58+
59+
// Positions reached in both ranges. __matched tells the caller whether these
// positions are already the final answer: __loop sets it both when it found a
// differing vector and when it consumed the first range completely.
struct __result {
60+
_Tp* __iter1;
61+
_Tp* __iter2;
62+
bool __matched;
63+
};
64+
65+
// Compares the elements between __first1 and the end of the vector-sized chunk
// that contains it, using masked loads. Without masked-load support this is a
// no-op that returns the inputs with __matched == false.
//
// NOTE(review): __align_down receives __vec::size(), which is a number of
// elements, as its alignment argument - for sizeof(_Tp) > 1 this is presumably
// meant to be a byte count; confirm.
// NOTE(review): the early return below triggers when the range is LONGER than
// the leading partial vector, which looks inverted; confirm the intended
// condition.
// NOTE(review): the reinterpret_casts to a non-const value_type* look like they
// would not compile for a const-qualified _Tp; confirm.
_LIBCPP_HIDE_FROM_ABI static __result __prologue(_Tp* __first1, _Tp* __last1, _Tp* __first2) {
66+
if constexpr (__mask_traits::__has_maskload) {
67+
auto __first_aligned = std::__align_down(__vec::size(), __first1);
68+
auto __offset = __first1 - __first_aligned;
69+
auto __checked_size = __vec::size() - __offset;
70+
if (__checked_size < __last1 - __first1)
71+
return {__first1, __first2, false};
// Shift the second range back by the same amount so both loads use the same
// lane layout.
72+
auto __second_aligned = __first2 - __offset;
73+
auto __mask = __mask_traits::__mask_with_last_enabled(__checked_size);
74+
__vec __lhs =
75+
__mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__first_aligned), __mask);
76+
__vec __rhs =
77+
__mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__second_aligned), __mask);
78+
auto __res = __mask_traits::__mask_cmp_eq(__mask, __lhs, __rhs);
// __inv_mask is not used in the rest of this function.
79+
auto __inv_mask = ~__mask.__get_data().__mask_;
// Some enabled lane did not compare equal.
80+
if ((__res.__get_data().__mask_ & __mask.__get_data().__mask_) != __mask.__get_data().__mask_) {
// Invert the equality bits so that find_first_set locates the first lane that
// differs.
81+
auto __match_offset = experimental::find_first_set(decltype(__mask){
82+
experimental::__from_storage, {decltype(__res.__get_data().__mask_)(~__res.__get_data().__mask_)}});
83+
return {__first_aligned + __match_offset, __second_aligned + __match_offset, true};
84+
}
85+
return {__first_aligned + __vec::size(), __second_aligned + __vec::size(), false};
86+
} else {
87+
return {__first1, __first2, false};
88+
}
89+
}
90+
91+
// Main loop: while at least __unroll_count full vectors remain in the first
// range, loads __unroll_count vectors from each range and compares them pair
// by pair. Returns with __matched == true at the first differing vector, or
// once the first range is exhausted; otherwise returns the positions where the
// loop stopped so that the remaining elements can be handled separately.
//
// NOTE(review): __res holds the result of operator== on the two vectors, and it
// is passed to find_first_set without being inverted (unlike in __prologue).
// That looks like it yields the first EQUAL lane rather than the first
// differing one; confirm.
_LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __result __loop(_Tp* __first1, _Tp* __last1, _Tp* __first2) {
92+
while (__last1 - __first1 >= __unroll_count * __vec::size()) {
93+
__vec __lhs[__unroll_count];
94+
__vec __rhs[__unroll_count];
95+
96+
// Issue all loads first, then do the comparisons.
for (size_t __i = 0; __i != __unroll_count; ++__i) {
97+
__lhs[__i] = std::__load_as_arithmetic(__first1 + __i * __vec::size());
98+
__rhs[__i] = std::__load_as_arithmetic(__first2 + __i * __vec::size());
99+
}
100+
101+
for (size_t __i = 0; __i != __unroll_count; ++__i) {
102+
if (auto __res = __lhs[__i] == __rhs[__i]; !experimental::all_of(__res)) {
103+
auto __offset = __i * __vec::size() + experimental::find_first_set(__res);
104+
return {__first1 + __offset, __first2 + __offset, true};
105+
}
106+
}
107+
108+
__first1 += __unroll_count * __vec::size();
109+
__first2 += __unroll_count * __vec::size();
110+
}
111+
return {__first1, __first2, __first1 == __last1};
112+
}
113+
114+
// Compares the elements of [__first1, __last1) that remain after the main loop
// and returns the final pair of positions. With masked-load support a single
// masked vector comparison is used; otherwise this falls back to the scalar
// __mismatch_loop.
//
// NOTE(review): __loop can stop with up to __unroll_count * __vec::size() - 1
// elements remaining, while the masked path performs one vector load with a
// mask built from the full remaining size; presumably
// __mask_with_first_enabled expects at most __vec::size() - confirm.
// NOTE(review): as in __loop, find_first_set is applied to the equality mask
// without inverting it; confirm.
_LIBCPP_HIDE_FROM_ABI static pair<_Tp*, _Tp*> __epilogue(_Tp* __first1, _Tp* __last1, _Tp* __first2) {
115+
if constexpr (__mask_traits::__has_maskload) {
116+
auto __size = __last1 - __first1;
117+
auto __mask = __mask_traits::__mask_with_first_enabled(__size);
118+
__vec __lhs =
119+
__mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__first1), __mask);
120+
__vec __rhs =
121+
__mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__first2), __mask);
122+
auto __res = __mask_traits::__mask_cmp_eq(__mask, __lhs, __rhs);
123+
auto __inv_mask = ~__mask.__get_data().__mask_;
// Treat disabled lanes as equal; anything short of "all lanes set" means an
// enabled lane differs.
124+
if ((__res.__get_data().__mask_ | __inv_mask) != decltype(__mask){true}.__get_data().__mask_) {
125+
auto __offset = experimental::find_first_set(__res);
126+
return {__first1 + __offset, __first2 + __offset};
127+
}
128+
return {__first1 + __size, __first2 + __size};
129+
} else {
130+
return std::__mismatch_loop(__first1, __last1, __first2, __equal_to(), __identity(), __identity());
131+
}
132+
}
133+
};
134+
#endif // _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
135+
136+
// Generic __mismatch overload: accepts any iterator/sentinel, predicate and
// projections, and simply forwards everything to the scalar __mismatch_loop.
template <class _InIter1, class _Sent1, class _InIter2, class _Pred, class _Proj1, class _Proj2>
137+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter1, _InIter2>
138+
__mismatch(_InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Pred __pred, _Proj1 __proj1, _Proj2 __proj2) {
139+
return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
140+
}
141+
142+
#if _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS
143+
template <
144+
class _Tp,
145+
class _Pred,
146+
class _Proj1,
147+
class _Proj2,
148+
enable_if_t<
149+
__desugars_to<__equal_tag, _Pred, _Tp, _Tp>::value && __is_identity<_Proj1>::value &&
150+
__is_identity<_Proj2>::value &&
151+
__mismatch_vector_impl<_Tp>::template __can_vectorize<_LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS>,
152+
int> = 0>
153+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI inline constexpr pair<_Tp*, _Tp*>
154+
__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred __pred, _Proj1 __proj1, _Proj2 __proj2) {
155+
if (__libcpp_is_constant_evaluated())
156+
return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
157+
158+
using __impl = __mismatch_vector_impl<_Tp>;
159+
160+
// auto [__piter1, __piter2, __pmatch] = __impl::__prologue(__first1, __last1, __first2);
161+
// if (__pmatch)
162+
// return {__piter1, __piter2};
163+
164+
auto [__iter1, __iter2, __matched] = __impl::__loop(__first1, __last1, __first2);
165+
if (__matched)
166+
return {__iter1, __iter2};
167+
168+
return __impl::__epilogue(__first1, __last1, __first2);
169+
}
170+
#endif // _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS
171+
24172
template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
25-
_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2>
173+
_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2>
26174
mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate __pred) {
27-
for (; __first1 != __last1; ++__first1, (void)++__first2)
28-
if (!__pred(*__first1, *__first2))
29-
break;
30-
return pair<_InputIterator1, _InputIterator2>(__first1, __first2);
175+
__identity __proj;
176+
auto __res = std::__mismatch(
177+
std::__unwrap_iter(__first1), std::__unwrap_iter(__last1), std::__unwrap_iter(__first2), __pred, __proj, __proj);
178+
return std::make_pair(std::__rewrap_iter(__first1, __res.first), std::__rewrap_iter(__first2, __res.second));
31179
}
32180

33181
template <class _InputIterator1, class _InputIterator2>
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef _LIBCPP___ALGORITHM_VECTORIZATION_H
10+
#define _LIBCPP___ALGORITHM_VECTORIZATION_H
11+
12+
#include <__config>
13+
#include <__type_traits/is_floating_point.h>
14+
#include <__utility/integer_sequence.h>
15+
#include <experimental/__simd/simd.h>
16+
17+
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
18+
# pragma GCC system_header
19+
#endif
20+
21+
// _LIBCPP_CAN_VECTORIZE_ALGORIHTMS: 1 when the SIMD helpers declared below are
// available, i.e. in C++17 or later with _LIBCPP_ENABLE_EXPERIMENTAL defined.
// NOTE(review): "ALGORIHTMS" is misspelled; the same spelling is used in
// <__algorithm/mismatch.h>, so any rename has to touch every use at once.
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
22+
#  define _LIBCPP_CAN_VECTORIZE_ALGORIHTMS 1
23+
#else
24+
#  define _LIBCPP_CAN_VECTORIZE_ALGORIHTMS 0
25+
#endif
26+
27+
// _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS: 1 when vectorization is possible and the
// translation unit is not being optimized for size (__OPTIMIZE_SIZE__).
#if _LIBCPP_CAN_VECTORIZE_ALGORIHTMS && !defined(__OPTIMIZE_SIZE__)
28+
#  define _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS 1
29+
#else
30+
#  define _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS 0
31+
#endif
32+
33+
// _LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS: 1 only when the classic
// algorithms are vectorized and __FAST_MATH__ is defined.
// NOTE(review): presumably restricted to fast-math because vector comparisons of
// floating-point values are not trusted to match the scalar results otherwise -
// confirm the exact reason.
#if _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS && defined(__FAST_MATH__)
34+
#  define _LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS 1
35+
#else
36+
#  define _LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS 0
37+
#endif
38+
39+
#if _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
40+
41+
_LIBCPP_BEGIN_NAMESPACE_STD
42+
43+
// True when sizeof(_Tp) is 1, 2, 4 or 8 bytes, i.e. one of the sizes for which
// __get_arithmetic_type_impl provides an unsigned integer type.
//
// Declared `inline constexpr` rather than `inline static const`: the value is a
// compile-time trait used in constant expressions, and `static` would give each
// translation unit its own internal-linkage copy.
template <class _Tp>
inline constexpr bool __fits_in_vector =
    sizeof(_Tp) == 1 || sizeof(_Tp) == 2 || sizeof(_Tp) == 4 || sizeof(_Tp) == 8;
46+
47+
template <class _Tp>
48+
_LIBCPP_HIDE_FROM_ABI constexpr auto __get_arithmetic_type_impl() {
49+
if constexpr (is_floating_point_v<_Tp>)
50+
return _Tp{};
51+
else if constexpr (constexpr auto __sz = sizeof(_Tp); __sz == 1)
52+
return uint8_t{};
53+
else if constexpr (__sz == 2)
54+
return uint16_t{};
55+
else if constexpr (__sz == 4)
56+
return uint32_t{};
57+
else if constexpr (__sz == 8)
58+
return uint64_t{};
59+
else
60+
static_assert(false, "unexpected sizeof type");
61+
}
62+
63+
// The arithmetic type used to load values of type _Tp into a SIMD vector: the
// return type of __get_arithmetic_type_impl<_Tp>().
template <class _Tp>
64+
using __get_arithmetic_type = decltype(__get_arithmetic_type_impl<_Tp>());
65+
66+
// experimental::native_simd vector whose element type is the arithmetic type
// corresponding to _Tp.
template <class _Tp>
67+
using __arithmetic_vec = experimental::native_simd<__get_arithmetic_type<_Tp>>;
68+
69+
template <class _Tp>
70+
_LIBCPP_HIDE_FROM_ABI __arithmetic_vec<_Tp> __load_as_arithmetic(_Tp* __values) {
71+
return {reinterpret_cast<__get_arithmetic_type<_Tp>*>(__values), 0};
72+
}
73+
74+
_LIBCPP_END_NAMESPACE_STD
75+
76+
#endif // _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
77+
78+
#endif // _LIBCPP___ALGORITHM_VECTORIZATION_H

libcxx/include/__bit/has_single_bit.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,24 @@
1919
_LIBCPP_PUSH_MACROS
2020
#include <__undef_macros>
2121

22-
#if _LIBCPP_STD_VER >= 20
23-
2422
_LIBCPP_BEGIN_NAMESPACE_STD
2523

26-
template <__libcpp_unsigned_integer _Tp>
27-
_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr bool has_single_bit(_Tp __t) noexcept {
24+
template <class _Tp>
25+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI constexpr bool __has_single_bit(_Tp __t) noexcept {
2826
return __t != 0 && (((__t & (__t - 1)) == 0));
2927
}
3028

31-
_LIBCPP_END_NAMESPACE_STD
29+
#if _LIBCPP_STD_VER >= 20
30+
31+
template <__libcpp_unsigned_integer _Tp>
32+
_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr bool has_single_bit(_Tp __t) noexcept {
33+
return std::__has_single_bit(__t);
34+
}
3235

3336
#endif // _LIBCPP_STD_VER >= 20
3437

38+
_LIBCPP_END_NAMESPACE_STD
39+
3540
_LIBCPP_POP_MACROS
3641

3742
#endif // _LIBCPP___BIT_HAS_SINGLE_BIT_H

libcxx/include/__utility/align_down.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef _LIBCPP___UTILITY_ALIGN_DOWN_H
10+
#define _LIBCPP___UTILITY_ALIGN_DOWN_H
11+
12+
#include <__config>
13+
#include <cstddef>
14+
#include <cstdint>
15+
16+
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
17+
# pragma GCC system_header
18+
#endif
19+
20+
_LIBCPP_BEGIN_NAMESPACE_STD
21+
22+
template <class _Tp>
23+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI inline _Tp* __align_down(size_t __align, _Tp* __ptr) {
24+
_LIBCPP_ASSERT_UNCATEGORIZED(
25+
__align >= alignof(_Tp), "Alignment has to be at least as large as the required alignment");
26+
return reinterpret_cast<_Tp*>(reinterpret_cast<uintptr_t>(__ptr) & ~(__align - 1));
27+
}
28+
29+
_LIBCPP_END_NAMESPACE_STD
30+
31+
#endif // _LIBCPP___UTILITY_ALIGN_DOWN_H

0 commit comments

Comments
 (0)