-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[libc++] Vectorize mismatch #73255
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc++] Vectorize mismatch #73255
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include <algorithm> | ||
#include <benchmark/benchmark.h> | ||
#include <random> | ||
|
||
// TODO: Look into benchmarking aligned and unaligned memory explicitly | ||
// (currently things happen to be aligned because they are malloced that way) | ||
template <class T> | ||
static void bm_mismatch(benchmark::State& state) { | ||
std::vector<T> vec1(state.range(), '1'); | ||
std::vector<T> vec2(state.range(), '1'); | ||
std::mt19937_64 rng(std::random_device{}()); | ||
|
||
vec1.back() = '2'; | ||
for (auto _ : state) { | ||
benchmark::DoNotOptimize(vec1); | ||
benchmark::DoNotOptimize(std::mismatch(vec1.begin(), vec1.end(), vec2.begin())); | ||
} | ||
} | ||
// Register the benchmark for char, short and int elements. Each instantiation is run with the | ||
// dense sizes 1 through 8 and with a sparser set of sizes between 16 and 1 << 20. | ||
BENCHMARK(bm_mismatch<char>)->DenseRange(1, 8)->Range(16, 1 << 20); | ||
BENCHMARK(bm_mismatch<short>)->DenseRange(1, 8)->Range(16, 1 << 20); | ||
BENCHMARK(bm_mismatch<int>)->DenseRange(1, 8)->Range(16, 1 << 20); | ||
|
||
// Provided by <benchmark/benchmark.h>; presumably expands to the program's main() -- confirm against the library docs. | ||
BENCHMARK_MAIN(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You want to compare against |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,23 +11,93 @@ | |
#define _LIBCPP___ALGORITHM_MISMATCH_H | ||
|
||
#include <__algorithm/comp.h> | ||
#include <__algorithm/simd_utils.h> | ||
#include <__algorithm/unwrap_iter.h> | ||
#include <__config> | ||
#include <__iterator/iterator_traits.h> | ||
#include <__functional/identity.h> | ||
#include <__type_traits/invoke.h> | ||
#include <__type_traits/is_constant_evaluated.h> | ||
#include <__type_traits/is_equality_comparable.h> | ||
#include <__type_traits/operation_traits.h> | ||
#include <__utility/move.h> | ||
#include <__utility/pair.h> | ||
#include <__utility/unreachable.h> | ||
|
||
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) | ||
# pragma GCC system_header | ||
#endif | ||
|
||
_LIBCPP_PUSH_MACROS | ||
#include <__undef_macros> | ||
|
||
_LIBCPP_BEGIN_NAMESPACE_STD | ||
|
||
template <class _Iter1, class _Sent1, class _Iter2, class _Pred, class _Proj1, class _Proj2> | ||
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter2> | ||
__mismatch_loop(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { | ||
while (__first1 != __last1) { | ||
if (!std::__invoke(__pred, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2))) | ||
break; | ||
++__first1; | ||
++__first2; | ||
} | ||
return std::make_pair(std::move(__first1), std::move(__first2)); | ||
} | ||
|
||
// Generic overload of __mismatch: forwards every argument unchanged to the element-by-element | ||
// __mismatch_loop and returns its result. | ||
template <class _Iter1, class _Sent1, class _Iter2, class _Pred, class _Proj1, class _Proj2> | ||
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter2> | ||
__mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { | ||
return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2); | ||
} | ||
|
||
#if _LIBCPP_VECTORIZE_ALGORITHMS | ||
|
||
// Vectorized overload of __mismatch. It is only enabled for raw pointers to an integral _Tp when | ||
// the predicate desugars to equality and both projections are identity. Outside of constant | ||
// evaluation it compares __unroll_count vectors of __vec_size elements per iteration and returns | ||
// as soon as a vector pair differs; whatever is left over is handled by __mismatch_loop. | ||
template <class _Tp, | ||
class _Pred, | ||
class _Proj1, | ||
class _Proj2, | ||
__enable_if_t<is_integral<_Tp>::value && __desugars_to<__equal_tag, _Pred, _Tp, _Tp>::value && | ||
__is_identity<_Proj1>::value && __is_identity<_Proj2>::value, | ||
int> = 0> | ||
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*> | ||
__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One thing we could do here is implement this function as |
||
constexpr size_t __unroll_count = 4; | ||
constexpr size_t __vec_size = __native_vector_size<_Tp>; | ||
using __vec = __simd_vector<_Tp, __vec_size>; | ||
philnik777 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// The vector path is skipped entirely during constant evaluation. | ||
if (!__libcpp_is_constant_evaluated()) { | ||
// Only iterate while a full group of __unroll_count vectors remains in the first range. | ||
while (static_cast<size_t>(__last1 - __first1) >= __unroll_count * __vec_size) [[__unlikely__]] { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @fhahn @jroelofs According to @philnik777, if we manually unroll the loop with a constant number of iterations, Clang isn't "smart" enough to vectorize the code. So we end up having to use explicit constructs like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you show the version w/o manual unrolling? Have you tried There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, Why this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do you think the loop is likely? |
||
__vec __lhs[__unroll_count]; | ||
__vec __rhs[__unroll_count]; | ||
|
||
// Load every vector of the group from both ranges before doing any comparison. | ||
for (size_t __i = 0; __i != __unroll_count; ++__i) { | ||
__lhs[__i] = std::__load_vector<__vec>(__first1 + __i * __vec_size); | ||
__rhs[__i] = std::__load_vector<__vec>(__first2 + __i * __vec_size); | ||
} | ||
|
||
// Check the vectors in order; the offset of the mismatch is the number of elements in the | ||
// preceding vectors plus the index of the first lane that did not compare equal. | ||
for (size_t __i = 0; __i != __unroll_count; ++__i) { | ||
if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) { | ||
auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res); | ||
return {__first1 + __offset, __first2 + __offset}; | ||
} | ||
} | ||
|
||
// The whole group compared equal: advance both ranges past it. | ||
__first1 += __unroll_count * __vec_size; | ||
__first2 += __unroll_count * __vec_size; | ||
} | ||
} | ||
// TODO: Consider vectorizing the tail | ||
return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2); | ||
philnik777 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
#endif // _LIBCPP_VECTORIZE_ALGORITHMS | ||
|
||
template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate> | ||
_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2> | ||
mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate __pred) { | ||
for (; __first1 != __last1; ++__first1, (void)++__first2) | ||
if (!__pred(*__first1, *__first2)) | ||
break; | ||
return pair<_InputIterator1, _InputIterator2>(__first1, __first2); | ||
__identity __proj; | ||
auto __res = std::__mismatch( | ||
std::__unwrap_iter(__first1), std::__unwrap_iter(__last1), std::__unwrap_iter(__first2), __pred, __proj, __proj); | ||
return std::make_pair(std::__rewrap_iter(__first1, __res.first), std::__rewrap_iter(__first2, __res.second)); | ||
} | ||
|
||
template <class _InputIterator1, class _InputIterator2> | ||
|
@@ -59,4 +129,6 @@ mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __fi | |
|
||
_LIBCPP_END_NAMESPACE_STD | ||
|
||
_LIBCPP_POP_MACROS | ||
|
||
#endif // _LIBCPP___ALGORITHM_MISMATCH_H |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef _LIBCPP___ALGORITHM_SIMD_UTILS_H | ||
#define _LIBCPP___ALGORITHM_SIMD_UTILS_H | ||
|
||
#include <__bit/bit_cast.h> | ||
#include <__bit/countr.h> | ||
#include <__config> | ||
#include <__type_traits/is_arithmetic.h> | ||
#include <__type_traits/is_same.h> | ||
#include <__utility/integer_sequence.h> | ||
#include <cstddef> | ||
#include <cstdint> | ||
|
||
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) | ||
# pragma GCC system_header | ||
#endif | ||
|
||
// TODO: Find out how altivec changes things and allow vectorizations there too. | ||
// _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS: 1 when the SIMD helpers declared in this header are available, | ||
// i.e. C++14 or later, _LIBCPP_CLANG_VER of at least 1700, and a target without AltiVec. | ||
#if _LIBCPP_STD_VER >= 14 && defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 1700 && !defined(__ALTIVEC__) | ||
# define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 1 | ||
#else | ||
# define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 0 | ||
#endif | ||
|
||
// _LIBCPP_VECTORIZE_ALGORITHMS: 1 when the helpers are available and __OPTIMIZE_SIZE__ is not defined. | ||
#if _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS && !defined(__OPTIMIZE_SIZE__) | ||
# define _LIBCPP_VECTORIZE_ALGORITHMS 1 | ||
#else | ||
# define _LIBCPP_VECTORIZE_ALGORITHMS 0 | ||
#endif | ||
|
||
#if _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS | ||
|
||
_LIBCPP_BEGIN_NAMESPACE_STD | ||
philnik777 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// This isn't specialized for 64 byte vectors on purpose. They have the potential to significantly reduce performance | ||
// in mixed simd/non-simd workloads and don't provide any performance improvement for currently vectorized algorithms | ||
// as far as benchmarks are concerned. | ||
// __native_vector_size<_Tp> is the number of _Tp elements used per vector: as many as fit into | ||
// 32, 16 or 8 bytes depending on which instruction set macro is defined, or 1 if none is. | ||
# if defined(__AVX__) | ||
template <class _Tp> | ||
inline constexpr size_t __native_vector_size = 32 / sizeof(_Tp); | ||
philnik777 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# elif defined(__SSE__) || defined(__ARM_NEON__) | ||
template <class _Tp> | ||
inline constexpr size_t __native_vector_size = 16 / sizeof(_Tp); | ||
# elif defined(__MMX__) | ||
template <class _Tp> | ||
inline constexpr size_t __native_vector_size = 8 / sizeof(_Tp); | ||
# else | ||
// No known vector extension: fall back to a single element per "vector". | ||
template <class _Tp> | ||
inline constexpr size_t __native_vector_size = 1; | ||
# endif | ||
|
||
// Vector of _Np elements of type _ArithmeticT, declared with the ext_vector_type attribute. | ||
template <class _ArithmeticT, size_t _Np> | ||
using __simd_vector __attribute__((__ext_vector_type__(_Np))) = _ArithmeticT; | ||
|
||
// Number of elements in a __simd_vector type. The primary template is only reached for types that | ||
// are not a __simd_vector; its initializer is a lambda whose static_assert reports "Not a vector!". | ||
template <class _VecT> | ||
inline constexpr size_t __simd_vector_size_v = []<bool _False = false>() -> size_t { | ||
static_assert(_False, "Not a vector!"); | ||
}(); | ||
|
||
// Specialization for __simd_vector<_Tp, _Np>: the size is the _Np the vector was declared with. | ||
template <class _Tp, size_t _Np> | ||
inline constexpr size_t __simd_vector_size_v<__simd_vector<_Tp, _Np>> = _Np; | ||
|
||
// Helper whose return type is the element type _Tp deduced from a __simd_vector<_Tp, _Np> argument. | ||
// It is used inside decltype by the alias below. | ||
template <class _Tp, size_t _Np> | ||
_LIBCPP_HIDE_FROM_ABI _Tp __simd_vector_underlying_type_impl(__simd_vector<_Tp, _Np>) { | ||
return _Tp{}; | ||
} | ||
|
||
// Element type of the vector type _VecT, obtained from the return type of the helper above. | ||
template <class _VecT> | ||
using __simd_vector_underlying_type_t = decltype(std::__simd_vector_underlying_type_impl(_VecT{})); | ||
|
||
// Builds a _VecT from the __simd_vector_size_v<_VecT> consecutive elements starting at __ptr. | ||
// An index_sequence is expanded so that each lane is initialized from __ptr[index]. | ||
// This isn't inlined without always_inline when loading chars. | ||
template <class _VecT, class _Tp> | ||
_LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(const _Tp* __ptr) noexcept { | ||
return [=]<size_t... _Indices>(index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept { | ||
return _VecT{__ptr[_Indices]...}; | ||
}(make_index_sequence<__simd_vector_size_v<_VecT>>{}); | ||
} | ||
|
||
// Converts __vec to a vector of bool and and-reduces it, i.e. returns whether every lane of | ||
// __vec converts to true. | ||
template <class _Tp, size_t _Np> | ||
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept { | ||
return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>)); | ||
philnik777 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
// Converts __vec to a vector of bool, reinterprets that mask as an unsigned integer of the same | ||
// size and returns the result of __countr_zero on it, which is used by callers as the index of the | ||
// first set lane. | ||
// NOTE(review): the result when no lane is set is not established here -- confirm before relying on it. | ||
template <class _Tp, size_t _Np> | ||
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept { | ||
using __mask_vec = __simd_vector<bool, _Np>; | ||
|
||
// This has MSan disabled due to https://github.com/llvm/llvm-project/issues/85876 | ||
// The argument is only used as a tag to select the integer type _MaskT the mask is bit-cast to. | ||
auto __impl = [&]<class _MaskT>(_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept { | ||
return std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))); | ||
}; | ||
|
||
// Pick the unsigned integer type whose size matches the bool vector exactly. | ||
if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) { | ||
return __impl(uint8_t{}); | ||
} else if constexpr (sizeof(__mask_vec) == sizeof(uint16_t)) { | ||
return __impl(uint16_t{}); | ||
} else if constexpr (sizeof(__mask_vec) == sizeof(uint32_t)) { | ||
return __impl(uint32_t{}); | ||
} else if constexpr (sizeof(__mask_vec) == sizeof(uint64_t)) { | ||
return __impl(uint64_t{}); | ||
} else { | ||
// Any other mask size is rejected at compile time; the condition is dependent and always false here. | ||
static_assert(sizeof(__mask_vec) == 0, "unexpected required size for mask integer type"); | ||
return 0; | ||
} | ||
} | ||
|
||
// Counterpart of __find_first_set: applies it to the bitwise complement of __vec, so the lanes | ||
// that were not set in __vec are the ones being searched for. | ||
template <class _Tp, size_t _Np> | ||
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_not_set(__simd_vector<_Tp, _Np> __vec) noexcept { | ||
return std::__find_first_set(~__vec); | ||
philnik777 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
_LIBCPP_END_NAMESPACE_STD | ||
|
||
#endif // _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS | ||
|
||
#endif // _LIBCPP___ALGORITHM_SIMD_UTILS_H |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know how much you care for it but you can try to mess with data alignment.
vec1.data() and vec2.data() will be aligned to 16 bytes, which can lead to loads being aligned.
the difference can be quite huge.
There are some things you can do about that, I don't know if they are worth it for 2 range algorithms.
At the very least, maybe align your .data() pointers to 64 bytes:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added a TODO for now. Though even if it makes a difference I'm not sure we can do much about it, can we?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can align one of the arrays. Also, this makes for a better benchmark.