llvm
diff --git a/‎libcxx/benchmarks/algorithms/mismatch.bench.cpp
Lines changed: 31 additions & 0 deletions b/‎libcxx/benchmarks/algorithms/mismatch.bench.cpp
Lines changed: 31 additions & 0 deletions
diff --git a/‎libcxx/include/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎libcxx/include/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎libcxx/include/__algorithm/mismatch.h
Lines changed: 153 additions & 5 deletions b/‎libcxx/include/__algorithm/mismatch.h
Lines changed: 153 additions & 5 deletions
diff --git a/‎libcxx/include/__algorithm/vectorization.h
Lines changed: 78 additions & 0 deletions b/‎libcxx/include/__algorithm/vectorization.h
Lines changed: 78 additions & 0 deletions
diff --git a/‎libcxx/include/__bit/has_single_bit.h
Lines changed: 10 additions & 5 deletions b/‎libcxx/include/__bit/has_single_bit.h
Lines changed: 10 additions & 5 deletions
diff --git a/‎libcxx/include/__utility/align_down.h
Lines changed: 31 additions & 0 deletions b/‎libcxx/include/__utility/align_down.h
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <random>
+
+// Benchmarks std::mismatch on two vectors of state.range() elements that all hold '1'. In every iteration one
+// randomly chosen element of the first vector is set to '2' before the call and reset to '1' afterwards.
+template <class T>
+static void bm_find(benchmark::State& state) {
+  const auto size = state.range();
+  std::vector<T> haystack(size, '1');
+  std::vector<T> reference(size, '1');
+  std::mt19937_64 engine(std::random_device{}());
+
+  for (auto _ : state) {
+    // Pick the element that differs between the two ranges in this iteration.
+    T& slot = haystack[engine() % haystack.size()];
+    slot    = '2';
+    benchmark::DoNotOptimize(haystack);
+    benchmark::DoNotOptimize(std::mismatch(haystack.begin(), haystack.end(), reference.begin()));
+    // Undo the change so both vectors are equal again.
+    slot = '1';
+  }
+}
+// Register the benchmark for char, short and int elements. Each registration uses DenseRange(1, 8) followed by
+// Range(16, 1 << 20) as the values passed to state.range().
+BENCHMARK(bm_find<char>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_find<short>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_find<int>)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+// Let Google Benchmark provide main().
+BENCHMARK_MAIN();
@@ -229,6 +229,7 @@ set(files
   __algorithm/unwrap_iter.h
   __algorithm/unwrap_range.h
   __algorithm/upper_bound.h
+  __algorithm/vectorization.h
   __assert
   __atomic/aliases.h
   __atomic/atomic.h
 
@@ -11,23 +11,171 @@
 #define _LIBCPP___ALGORITHM_MISMATCH_H
 
 #include <__algorithm/comp.h>
+#include <__algorithm/unwrap_iter.h>
+#include <__algorithm/vectorization.h>
 #include <__config>
+#include <__functional/identity.h>
 #include <__iterator/iterator_traits.h>
+#include <__type_traits/invoke.h>
+#include <__type_traits/is_equality_comparable.h>
+#include <__utility/align_down.h>
+#include <__utility/move.h>
 #include <__utility/pair.h>
+#include <experimental/__simd/feature_traits.h>
+#include <experimental/__simd/simd.h>
+#include <experimental/__simd/simd_mask.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+// Scalar mismatch: advances both iterators in lock-step for as long as the projected elements satisfy `__pred` and
+// `__first1` has not reached `__last1`, then returns the two iterators at the position where it stopped.
+template <class _InIter1, class _Sent1, class _InIter2, class _Pred, class _Proj1, class _Proj2>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter1, _InIter2>
+__mismatch_loop(_InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Pred __pred, _Proj1 __proj1, _Proj2 __proj2) {
+  for (; __first1 != __last1; ++__first1, (void)++__first2) {
+    if (!std::__invoke(__pred, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2)))
+      break;
+  }
+  return pair<_InIter1, _InIter2>(std::move(__first1), std::move(__first2));
+}
+
+#if _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
+// Helpers used by the vectorized `__mismatch` overload for elements of type `_Tp`: a mask-load based `__prologue`,
+// an unrolled `__loop` over whole vectors, and an `__epilogue` for the elements the loop did not handle.
+template <class _Tp>
+struct __mismatch_vector_impl {
+  // True when `_Tp` is trivially equality comparable, has a size accepted by `__fits_in_vector` and is at least as
+  // aligned as the arithmetic type it gets loaded as -- or when `_Tp` is a floating point type and the caller opted
+  // in through `_VectorizeFloatingPoint`.
+  template <bool _VectorizeFloatingPoint>
+  static constexpr bool __can_vectorize =
+      (__libcpp_is_trivially_equality_comparable<_Tp, _Tp>::value && __fits_in_vector<_Tp> &&
+       alignof(_Tp) >= alignof(__get_arithmetic_type<_Tp>)) ||
+      (_VectorizeFloatingPoint && is_floating_point_v<_Tp>);
+
+  using __vec         = __arithmetic_vec<_Tp>;
+  using __mask_traits = experimental::__mask_traits<typename __vec::value_type, typename __vec::abi_type>;
+  // Number of vectors compared per iteration of `__loop`.
+  static constexpr size_t __unroll_count = 4;
+
+  // Result of `__prologue` and `__loop`: the positions reached in both ranges, plus a flag telling the caller that
+  // these positions can be returned as-is. The flag is set when a mismatch was found; `__loop` also sets it when it
+  // consumed the whole first range.
+  struct __result {
+    _Tp* __iter1;
+    _Tp* __iter2;
+    bool __matched;
+  };
+
+  // Masked comparison of the elements from `__first1` up to the end of the vector-sized chunk that starts at the
+  // aligned-down `__first1`. Without mask-load support this returns the inputs unchanged.
+  // NOTE(review): the only caller visible in this header has its call to `__prologue` commented out.
+  _LIBCPP_HIDE_FROM_ABI static __result __prologue(_Tp* __first1, _Tp* __last1, _Tp* __first2) {
+    if constexpr (__mask_traits::__has_maskload) {
+      // NOTE(review): `__align_down` masks the pointer's address and checks its first argument against `alignof`,
+      // i.e. it looks like a byte alignment, while `__vec::size()` is used as an element count elsewhere in this
+      // struct -- confirm whether this should be scaled by `sizeof(_Tp)`.
+      auto __first_aligned = std::__align_down(__vec::size(), __first1);
+      auto __offset        = __first1 - __first_aligned;
+      auto __checked_size  = __vec::size() - __offset;
+      // NOTE(review): this bails out whenever the range is longer than the part of the vector being checked, so the
+      // masked load below only runs for ranges that are at most `__checked_size` long -- confirm the comparison is
+      // not inverted.
+      if (__checked_size < __last1 - __first1)
+        return {__first1, __first2, false};
+      // `__first2` is not aligned separately; it is shifted back by the same number of elements as `__first1`.
+      auto __second_aligned = __first2 - __offset;
+      auto __mask           = __mask_traits::__mask_with_last_enabled(__checked_size);
+      __vec __lhs =
+          __mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__first_aligned), __mask);
+      __vec __rhs =
+          __mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__second_aligned), __mask);
+      auto __res      = __mask_traits::__mask_cmp_eq(__mask, __lhs, __rhs);
+      // NOTE(review): `__inv_mask` is not used in this function.
+      auto __inv_mask = ~__mask.__get_data().__mask_;
+      // Some enabled lane did not compare equal.
+      if ((__res.__get_data().__mask_ & __mask.__get_data().__mask_) != __mask.__get_data().__mask_) {
+        // Search the inverted equality mask for its first set lane.
+        // NOTE(review): the inversion is not combined with `__mask` here -- confirm that disabled lanes cannot be
+        // reported as the first set lane.
+        auto __match_offset = experimental::find_first_set(decltype(__mask){
+            experimental::__from_storage, {decltype(__res.__get_data().__mask_)(~__res.__get_data().__mask_)}});
+        return {__first_aligned + __match_offset, __second_aligned + __match_offset, true};
+      }
+      return {__first_aligned + __vec::size(), __second_aligned + __vec::size(), false};
+    } else {
+      return {__first1, __first2, false};
+    }
+  }
+
+  // Compares `__unroll_count` vectors per iteration for as long as at least that many elements remain in the first
+  // range. Returns the position of the first mismatch with the flag set, or the position where the loop stopped;
+  // in the latter case the flag tells whether the first range was fully consumed.
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __result __loop(_Tp* __first1, _Tp* __last1, _Tp* __first2) {
+    while (__last1 - __first1 >= __unroll_count * __vec::size()) {
+      __vec __lhs[__unroll_count];
+      __vec __rhs[__unroll_count];
+
+      // Issue all loads first, then do the comparisons.
+      for (size_t __i = 0; __i != __unroll_count; ++__i) {
+        __lhs[__i] = std::__load_as_arithmetic(__first1 + __i * __vec::size());
+        __rhs[__i] = std::__load_as_arithmetic(__first2 + __i * __vec::size());
+      }
+
+      for (size_t __i = 0; __i != __unroll_count; ++__i) {
+        if (auto __res = __lhs[__i] == __rhs[__i]; !experimental::all_of(__res)) {
+          // NOTE(review): `__res` is the equality mask, so `find_first_set` looks for the first lane that compared
+          // equal -- confirm this yields the first mismatching lane.
+          auto __offset = __i * __vec::size() + experimental::find_first_set(__res);
+          return {__first1 + __offset, __first2 + __offset, true};
+        }
+      }
+
+      __first1 += __unroll_count * __vec::size();
+      __first2 += __unroll_count * __vec::size();
+    }
+    return {__first1, __first2, __first1 == __last1};
+  }
+
+  // Handles the elements in [`__first1`, `__last1`): with a single masked comparison when mask loads are
+  // available, otherwise with the scalar `__mismatch_loop`.
+  _LIBCPP_HIDE_FROM_ABI static pair<_Tp*, _Tp*> __epilogue(_Tp* __first1, _Tp* __last1, _Tp* __first2) {
+    if constexpr (__mask_traits::__has_maskload) {
+      // NOTE(review): only one vector is loaded here, while `__loop` stops as soon as fewer than
+      // `__unroll_count * __vec::size()` elements remain -- confirm that `__size` cannot exceed `__vec::size()`.
+      auto __size = __last1 - __first1;
+      auto __mask = __mask_traits::__mask_with_first_enabled(__size);
+      __vec __lhs =
+          __mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__first1), __mask);
+      __vec __rhs =
+          __mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__first2), __mask);
+      auto __res      = __mask_traits::__mask_cmp_eq(__mask, __lhs, __rhs);
+      auto __inv_mask = ~__mask.__get_data().__mask_;
+      // Treat disabled lanes as equal; anything short of an all-true mask means an enabled lane differed.
+      if ((__res.__get_data().__mask_ | __inv_mask) != decltype(__mask){true}.__get_data().__mask_) {
+        // NOTE(review): unlike `__prologue`, this searches the equality mask itself rather than its inverse --
+        // confirm this yields the first mismatching lane.
+        auto __offset = experimental::find_first_set(__res);
+        return {__first1 + __offset, __first2 + __offset};
+      }
+      return {__first1 + __size, __first2 + __size};
+    } else {
+      return std::__mismatch_loop(__first1, __last1, __first2, __equal_to(), __identity(), __identity());
+    }
+  }
+};
+#endif // _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
+
+// Generic overload of `__mismatch`: forwards to the scalar `__mismatch_loop` and returns its result.
+template <class _InIter1, class _Sent1, class _InIter2, class _Pred, class _Proj1, class _Proj2>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter1, _InIter2>
+__mismatch(_InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Pred __pred, _Proj1 __proj1, _Proj2 __proj2) {
+  // Every argument was taken by value and is not used again, so hand it over instead of copying it once more.
+  return std::__mismatch_loop(
+      std::move(__first1),
+      std::move(__last1),
+      std::move(__first2),
+      std::move(__pred),
+      std::move(__proj1),
+      std::move(__proj2));
+}
+
+#if _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS
+// Vectorized overload of `__mismatch`. It is only viable for two ranges of `_Tp*` when the predicate desugars to
+// equality, both projections are the identity and `__mismatch_vector_impl<_Tp>` reports that `_Tp` can be
+// vectorized. Returns the pointers to the first pair of elements that differ, or `__last1` and the corresponding
+// position in the second range if there is none.
+template <
+    class _Tp,
+    class _Pred,
+    class _Proj1,
+    class _Proj2,
+    enable_if_t<
+        __desugars_to<__equal_tag, _Pred, _Tp, _Tp>::value && __is_identity<_Proj1>::value &&
+            __is_identity<_Proj2>::value &&
+            __mismatch_vector_impl<_Tp>::template __can_vectorize<_LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS>,
+        int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI inline constexpr pair<_Tp*, _Tp*>
+__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred __pred, _Proj1 __proj1, _Proj2 __proj2) {
+  // The vector helpers below are not constexpr, so constant evaluation has to take the scalar loop.
+  if (__libcpp_is_constant_evaluated())
+    return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
+
+  using __impl = __mismatch_vector_impl<_Tp>;
+
+  // TODO: the prologue is currently disabled.
+  // auto [__piter1, __piter2, __pmatch] = __impl::__prologue(__first1, __last1, __first2);
+  // if (__pmatch)
+  //   return {__piter1, __piter2};
+
+  // `__done` is set when the loop either found the mismatch or consumed the whole first range.
+  auto [__iter1, __iter2, __done] = __impl::__loop(__first1, __last1, __first2);
+  if (__done)
+    return {__iter1, __iter2};
+
+  // Only the elements the loop did not look at are left: continue from where it stopped instead of starting over
+  // at the beginning of the ranges.
+  return __impl::__epilogue(__iter1, __last1, __iter2);
+}
+#endif // _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS
+
 template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2>
 mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate __pred) {
-  for (; __first1 != __last1; ++__first1, (void)++__first2)
-    if (!__pred(*__first1, *__first2))
-      break;
-  return pair<_InputIterator1, _InputIterator2>(__first1, __first2);
+  // Run the search on the unwrapped iterators, using the identity for both projections.
+  auto __ufirst1 = std::__unwrap_iter(__first1);
+  auto __ulast1  = std::__unwrap_iter(__last1);
+  auto __ufirst2 = std::__unwrap_iter(__first2);
+  __identity __proj;
+  auto __found = std::__mismatch(__ufirst1, __ulast1, __ufirst2, __pred, __proj, __proj);
+  // Convert the result back to the iterator types the caller passed in.
+  return std::make_pair(std::__rewrap_iter(__first1, __found.first), std::__rewrap_iter(__first2, __found.second));
 }
 
 template <class _InputIterator1, class _InputIterator2>
 
@@ -0,0 +1,78 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_VECTORIZATION_H
+#define _LIBCPP___ALGORITHM_VECTORIZATION_H
+
+#include <__config>
+#include <__type_traits/is_floating_point.h>
+#include <__utility/integer_sequence.h>
+#include <experimental/__simd/simd.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+// The vectorization helpers are only available in C++17 and later, and only when _LIBCPP_ENABLE_EXPERIMENTAL is
+// defined.
+// NOTE(review): "ALGORIHTMS" is misspelled. The same spelling is used at every use site in this patch, so a rename
+// has to change all of them together.
+#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
+#  define _LIBCPP_CAN_VECTORIZE_ALGORIHTMS 1
+#else
+#  define _LIBCPP_CAN_VECTORIZE_ALGORIHTMS 0
+#endif
+
+// The classic algorithms are vectorized unless the compiler defines __OPTIMIZE_SIZE__.
+#if _LIBCPP_CAN_VECTORIZE_ALGORIHTMS && !defined(__OPTIMIZE_SIZE__)
+#  define _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS 1
+#else
+#  define _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS 0
+#endif
+
+// Floating point types additionally require __FAST_MATH__ to be defined.
+#if _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS && defined(__FAST_MATH__)
+#  define _LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS 1
+#else
+#  define _LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS 0
+#endif
+
+#if _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// True when `sizeof(_Tp)` is 1, 2, 4 or 8, i.e. one of the sizes `__get_arithmetic_type_impl` has an unsigned
+// integer type for.
+template <class _Tp>
+inline constexpr bool __fits_in_vector =
+    sizeof(_Tp) == 1 || sizeof(_Tp) == 2 || sizeof(_Tp) == 4 || sizeof(_Tp) == 8;
+
+// Returns a value-initialized object of the arithmetic type that a `_Tp` is loaded as: `_Tp` itself for floating
+// point types, otherwise the unsigned integer type with the same size as `_Tp`. Only the return type is of
+// interest; any other size is a compile-time error.
+template <class _Tp>
+_LIBCPP_HIDE_FROM_ABI constexpr auto __get_arithmetic_type_impl() {
+  if constexpr (is_floating_point_v<_Tp>)
+    return _Tp{};
+  else if constexpr (constexpr auto __sz = sizeof(_Tp); __sz == 1)
+    return uint8_t{};
+  else if constexpr (__sz == 2)
+    return uint16_t{};
+  else if constexpr (__sz == 4)
+    return uint32_t{};
+  else if constexpr (__sz == 8)
+    return uint64_t{};
+  else
+    // The condition has to depend on `_Tp`: compilers that do not implement CWG2518 reject a plain
+    // `static_assert(false)` even when it only appears in a discarded branch.
+    static_assert(sizeof(_Tp) == 0, "unexpected sizeof type");
+}
+
+// The type returned by `__get_arithmetic_type_impl<_Tp>()`.
+template <class _Tp>
+using __get_arithmetic_type = decltype(__get_arithmetic_type_impl<_Tp>());
+
+// `experimental::native_simd` over the arithmetic type chosen for `_Tp`.
+template <class _Tp>
+using __arithmetic_vec = experimental::native_simd<__get_arithmetic_type<_Tp>>;
+
+// Builds an `__arithmetic_vec<_Tp>` from `__values`, reinterpreted as a pointer to the arithmetic type chosen for
+// `_Tp`.
+// NOTE(review): the second constructor argument is a literal `0`; presumably this is the slot for the load flags --
+// confirm that this selects the intended (unaligned?) load.
+// NOTE(review): the objects are accessed through a pointer to a different type -- confirm this is fine with respect
+// to aliasing for every `_Tp` this is used with.
+template <class _Tp>
+_LIBCPP_HIDE_FROM_ABI __arithmetic_vec<_Tp> __load_as_arithmetic(_Tp* __values) {
+  return {reinterpret_cast<__get_arithmetic_type<_Tp>*>(__values), 0};
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
+
+#endif // _LIBCPP___ALGORITHM_VECTORIZATION_H
@@ -19,19 +19,24 @@
 _LIBCPP_PUSH_MACROS
 #include <__undef_macros>
 
-#if _LIBCPP_STD_VER >= 20
-
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <__libcpp_unsigned_integer _Tp>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr bool has_single_bit(_Tp __t) noexcept {
+// Returns true if exactly one bit of `__t` is set, i.e. `__t` is non-zero and clearing its lowest set bit leaves
+// zero. This template is declared outside of the `_LIBCPP_STD_VER >= 20` block and is not constrained.
+template <class _Tp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI constexpr bool __has_single_bit(_Tp __t) noexcept {
   return __t != 0 && (((__t & (__t - 1)) == 0));
 }
 
-_LIBCPP_END_NAMESPACE_STD
+#if _LIBCPP_STD_VER >= 20
+
+// Public C++20 entry point, restricted to unsigned integer types; forwards to `__has_single_bit`.
+template <__libcpp_unsigned_integer _Tp>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr bool has_single_bit(_Tp __t) noexcept {
+  return std::__has_single_bit(__t);
+}
 
 #endif // _LIBCPP_STD_VER >= 20
 
+_LIBCPP_END_NAMESPACE_STD
+
 _LIBCPP_POP_MACROS
 
 #endif // _LIBCPP___BIT_HAS_SINGLE_BIT_H
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___UTILITY_ALIGN_DOWN_H
+#define _LIBCPP___UTILITY_ALIGN_DOWN_H
+
+#include <__assert>
+#include <__bit/has_single_bit.h>
+#include <__config>
+#include <cstddef>
+#include <cstdint>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// Rounds the address held by `__ptr` down to the closest multiple of `__align` and returns it as a `_Tp*`.
+// `__align` has to be a power of two that is at least `alignof(_Tp)`; both requirements are asserted.
+template <class _Tp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI inline _Tp* __align_down(size_t __align, _Tp* __ptr) {
+  // Clearing the low bits with a mask only rounds down correctly for powers of two.
+  _LIBCPP_ASSERT_UNCATEGORIZED(std::__has_single_bit(__align), "Alignment has to be a power of two");
+  _LIBCPP_ASSERT_UNCATEGORIZED(
+      __align >= alignof(_Tp), "Alignment has to be at least as large as the required alignment");
+  // Build the mask in uintptr_t so that it is as wide as the address it is applied to.
+  const uintptr_t __mask = ~(static_cast<uintptr_t>(__align) - 1);
+  return reinterpret_cast<_Tp*>(reinterpret_cast<uintptr_t>(__ptr) & __mask);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___UTILITY_ALIGN_DOWN_H