-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[X86][AVX10.2] Support AVX10.2-MINMAX new instructions. #101598
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
You can test this locally with the following command:git-clang-format --diff 22c06aa5e94e30fb1333ecaf46ce33c65d148634 822611314c4cf8e985628765cc806a0b718ebdd3 --extensions h,cpp,c,inc -- clang/lib/Headers/avx10_2_512minmaxintrin.h clang/lib/Headers/avx10_2minmaxintrin.h clang/test/CodeGen/X86/avx10_2_512minmax-builtins.c clang/test/CodeGen/X86/avx10_2_512minmax-error.c clang/test/CodeGen/X86/avx10_2minmax-builtins.c clang/lib/Headers/immintrin.h clang/lib/Sema/SemaX86.cpp llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86ISelLowering.h llvm/lib/Target/X86/X86IntrinsicsInfo.h llvm/test/TableGen/x86-fold-tables.inc View the diff from clang-format here.diff --git a/clang/lib/Headers/avx10_2_512minmaxintrin.h b/clang/lib/Headers/avx10_2_512minmaxintrin.h
index e175365d11..877faf37f5 100644
--- a/clang/lib/Headers/avx10_2_512minmaxintrin.h
+++ b/clang/lib/Headers/avx10_2_512minmaxintrin.h
@@ -35,7 +35,7 @@
#define _mm512_minmax_pd(A, B, C) \
((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_undefined_pd(), (__mmask8)-1, \
+ (__v8df)_mm512_undefined_pd(), (__mmask8) - 1, \
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_minmax_pd(W, U, A, B, C) \
@@ -51,7 +51,7 @@
#define _mm512_minmax_round_pd(A, B, C, R) \
((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)))
+ (__v8df)_mm512_undefined_pd(), (__mmask8) - 1, (int)(R)))
#define _mm512_mask_minmax_round_pd(W, U, A, B, C, R) \
((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
@@ -66,7 +66,7 @@
#define _mm512_minmax_ph(A, B, C) \
((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
- (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32) - 1, \
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_minmax_ph(W, U, A, B, C) \
@@ -82,7 +82,7 @@
#define _mm512_minmax_round_ph(A, B, C, R) \
((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
- (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
+ (__v32hf)_mm512_undefined_ph(), (__mmask32) - 1, (int)(R)))
#define _mm512_mask_minmax_round_ph(W, U, A, B, C, R) \
((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
@@ -97,7 +97,7 @@
#define _mm512_minmax_ps(A, B, C) \
((__m512)__builtin_ia32_vminmaxps512_round_mask( \
(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \
+ (__v16sf)_mm512_undefined_ps(), (__mmask16) - 1, \
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_minmax_ps(W, U, A, B, C) \
@@ -113,7 +113,7 @@
#define _mm512_minmax_round_ps(A, B, C, R) \
((__m512)__builtin_ia32_vminmaxps512_round_mask( \
(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)))
+ (__v16sf)_mm512_undefined_ps(), (__mmask16) - 1, (int)(R)))
#define _mm512_mask_minmax_round_ps(W, U, A, B, C, R) \
((__m512)__builtin_ia32_vminmaxps512_round_mask( \
diff --git a/clang/lib/Headers/avx10_2minmaxintrin.h b/clang/lib/Headers/avx10_2minmaxintrin.h
index a9367e7424..c350a9e669 100644
--- a/clang/lib/Headers/avx10_2minmaxintrin.h
+++ b/clang/lib/Headers/avx10_2minmaxintrin.h
@@ -53,7 +53,7 @@
#define _mm_minmax_pd(A, B, C) \
((__m128d)__builtin_ia32_vminmaxpd128_mask( \
(__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_setzero_pd(), (__mmask8)-1))
+ (__v2df)_mm_setzero_pd(), (__mmask8) - 1))
#define _mm_mask_minmax_pd(W, U, A, B, C) \
((__m128d)__builtin_ia32_vminmaxpd128_mask( \
@@ -68,7 +68,7 @@
#define _mm256_minmax_pd(A, B, C) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
- (__v4df)_mm256_setzero_pd(), (__mmask8)-1, _MM_FROUND_NO_EXC))
+ (__v4df)_mm256_setzero_pd(), (__mmask8) - 1, _MM_FROUND_NO_EXC))
#define _mm256_mask_minmax_pd(W, U, A, B, C) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
@@ -83,7 +83,7 @@
#define _mm256_minmax_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
- (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
+ (__v4df)_mm256_undefined_pd(), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_minmax_round_pd(W, U, A, B, C, R) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
@@ -98,12 +98,12 @@
#define _mm_minmax_ph(A, B, C) \
((__m128h)__builtin_ia32_vminmaxph128_mask( \
(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
- (__v8hf)_mm_setzero_ph(), (__mmask8)-1))
+ (__v8hf)_mm_setzero_ph(), (__mmask8) - 1))
#define _mm_mask_minmax_ph(W, U, A, B, C) \
((__m128h)__builtin_ia32_vminmaxph128_mask( \
(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
- (__v8hf)(__m128h)(W), (__mmask16)-1))
+ (__v8hf)(__m128h)(W), (__mmask16) - 1))
#define _mm_maskz_minmax_ph(U, A, B, C) \
((__m128h)__builtin_ia32_vminmaxph128_mask( \
@@ -113,7 +113,7 @@
#define _mm256_minmax_ph(A, B, C) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
- (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, _MM_FROUND_NO_EXC))
+ (__v16hf)_mm256_setzero_ph(), (__mmask16) - 1, _MM_FROUND_NO_EXC))
#define _mm256_mask_minmax_ph(W, U, A, B, C) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
@@ -128,7 +128,7 @@
#define _mm256_minmax_round_ph(A, B, C, R) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
- (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
+ (__v16hf)_mm256_undefined_ph(), (__mmask16) - 1, (int)(R)))
#define _mm256_mask_minmax_round_ph(W, U, A, B, C, R) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
@@ -143,7 +143,7 @@
#define _mm_minmax_ps(A, B, C) \
((__m128)__builtin_ia32_vminmaxps128_mask( \
(__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_setzero_ps(), (__mmask8)-1))
+ (__v4sf)_mm_setzero_ps(), (__mmask8) - 1))
#define _mm_mask_minmax_ps(W, U, A, B, C) \
((__m128)__builtin_ia32_vminmaxps128_mask( \
@@ -158,7 +158,7 @@
#define _mm256_minmax_ps(A, B, C) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
- (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, _MM_FROUND_NO_EXC))
+ (__v8sf)_mm256_setzero_ps(), (__mmask8) - 1, _MM_FROUND_NO_EXC))
#define _mm256_mask_minmax_ps(W, U, A, B, C) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
@@ -173,7 +173,7 @@
#define _mm256_minmax_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
- (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))
+ (__v8sf)_mm256_undefined_ps(), (__mmask8) - 1, (int)(R)))
#define _mm256_mask_minmax_round_ps(W, U, A, B, C, R) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
@@ -188,7 +188,7 @@
#define _mm_minmax_sd(A, B, C) \
((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
(__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+ (__v2df)_mm_undefined_pd(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
#define _mm_mask_minmax_sd(W, U, A, B, C) \
((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
@@ -203,7 +203,7 @@
#define _mm_minmax_round_sd(A, B, C, R) \
((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
(__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_undefined_pd(), (__mmask8)-1, (int)(R)))
+ (__v2df)_mm_undefined_pd(), (__mmask8) - 1, (int)(R)))
#define _mm_mask_minmax_round_sd(W, U, A, B, C, R) \
((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
@@ -218,7 +218,7 @@
#define _mm_minmax_sh(A, B, C) \
((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
- (__v8hf)_mm_undefined_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+ (__v8hf)_mm_undefined_ph(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
#define _mm_mask_minmax_sh(W, U, A, B, C) \
((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
@@ -233,7 +233,7 @@
#define _mm_minmax_round_sh(A, B, C, R) \
((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
- (__v8hf)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
+ (__v8hf)_mm_undefined_ph(), (__mmask8) - 1, (int)(R)))
#define _mm_mask_minmax_round_sh(W, U, A, B, C, R) \
((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
@@ -248,7 +248,7 @@
#define _mm_minmax_ss(A, B, C) \
((__m128)__builtin_ia32_vminmaxss_round_mask( \
(__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_undefined_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+ (__v4sf)_mm_undefined_ps(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
#define _mm_mask_minmax_ss(W, U, A, B, C) \
((__m128)__builtin_ia32_vminmaxss_round_mask( \
@@ -263,7 +263,7 @@
#define _mm_minmax_round_ss(A, B, C, R) \
((__m128)__builtin_ia32_vminmaxss_round_mask( \
(__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_undefined_ps(), (__mmask8)-1, (int)(R)))
+ (__v4sf)_mm_undefined_ps(), (__mmask8) - 1, (int)(R)))
#define _mm_mask_minmax_round_ss(W, U, A, B, C, R) \
((__m128)__builtin_ia32_vminmaxss_round_mask( \
|
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-clang Author: Freddy Ye (FreddyLeaf) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Patch is 307.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101598.diff 25 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4c7bd099420ab..c54627586a240 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -220,6 +220,8 @@ X86 Support
found in the file ``clang/www/builtins.py``.
- Support ISA of ``AVX10.2``.
+ * Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and
+ ``*_(mask(z)))_minmax_s[s|d|h]``.
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index f028711a807c0..3200e0112adce 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2022,6 +2022,22 @@ TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
+// AVX10-MINMAX
+TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16128, "V8yV8yV8yIi", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16256, "V16yV16yV16yIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16512, "V32yV32yV32yIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxpd128_mask, "V2dV2dV2dIiV2dUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxpd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxpd512_round_mask, "V8dV8dV8dIiV8dUcIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxph128_mask, "V8xV8xV8xIiV8xUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxph256_round_mask, "V16xV16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxph512_round_mask, "V32xV32xV32xIiV32xUiIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxps128_mask, "V4fV4fV4fIiV4fUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxps512_round_mask, "V16fV16fV16fIiV16fUsIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxsd_round_mask, "V2dV2dV2dIiV2dUcIi", "nV:128:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxsh_round_mask, "V8xV8xV8xIiV8xUcIi", "nV:128:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxss_round_mask, "V4fV4fV4fIiV4fUcIi", "nV:128:", "avx10.2-512")
#undef BUILTIN
#undef TARGET_BUILTIN
#undef TARGET_HEADER_BUILTIN
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index b17ab24d625a0..f3d19e38f8f2b 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,7 +147,9 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ avx10_2_512minmaxintrin.h
avx10_2_512niintrin.h
+ avx10_2minmaxintrin.h
avx10_2niintrin.h
avx2intrin.h
avx512bf16intrin.h
diff --git a/clang/lib/Headers/avx10_2_512minmaxintrin.h b/clang/lib/Headers/avx10_2_512minmaxintrin.h
new file mode 100644
index 0000000000000..ee486cb24f3d9
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512minmaxintrin.h
@@ -0,0 +1,219 @@
+/*===--------------- avx10_2_512minmaxintrin.h - AVX10_2_512MINMAX intrinsics
+ *-----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512minmaxintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVX10_2_512MINMAXINTRIN_H
+#define __AVX10_2_512MINMAXINTRIN_H
+
+#define _mm512_minmaxne_pbh(A, B, C) \
+ ((__m512bh)__builtin_ia32_vminmaxnepbf16512( \
+ (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(A), (int)(C)))
+
+#define _mm512_mask_minmaxne_pbh(W, U, A, B, C) \
+ ((__m512bh)__builtin_ia32_selectpbf_512( \
+ (__mmask32)(U), \
+ (__v32bf)__builtin_ia32_vminmaxnepbf16512( \
+ (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(B), (int)(C)), \
+ (__v32bf)(__m512bh)(W)))
+
+#define _mm512_maskz_minmaxne_pbh(U, A, B, C) \
+ ((__m512bh)__builtin_ia32_selectpbf_512( \
+ (__mmask32)(U), \
+ (__v32bf)__builtin_ia32_vminmaxnepbf16512( \
+ (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(B), (int)(C)), \
+ (__v32bf) __builtin_bit_cast(__m512bh, _mm512_setzero_ps())))
+
+#define _mm512_minmax_pd(A, B, C) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_undefined_pd(), (__mmask8) - 1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_minmax_pd(W, U, A, B, C) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_minmax_pd(U, A, B, C) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_minmax_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_undefined_pd(), (__mmask8) - 1, (int)(R)))
+
+#define _mm512_mask_minmax_round_pd(W, U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_minmax_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
+
+#define _mm512_minmax_ph(A, B, C) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32) - 1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_minmax_ph(W, U, A, B, C) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)(__m512h)(W), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_minmax_ph(U, A, B, C) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_minmax_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32) - 1, (int)(R)))
+
+#define _mm512_mask_minmax_round_ph(W, U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_minmax_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
+
+#define _mm512_minmax_ps(A, B, C) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_undefined_ps(), (__mmask16) - 1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_minmax_ps(W, U, A, B, C) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
+ (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_minmax_ps(U, A, B, C) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_minmax_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_undefined_ps(), (__mmask16) - 1, (int)(R)))
+
+#define _mm512_mask_minmax_round_ps(W, U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_minmax_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
+
+#define _mm_minmax_sd(A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_undefined_pd(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_minmax_sd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_minmax_sd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_minmax_round_sd(A, B, C, R) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_undefined_pd(), (__mmask8) - 1, (int)(R)))
+
+#define _mm_mask_minmax_round_sd(W, U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_minmax_round_sd(U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
+
+#define _mm_minmax_sh(A, B, C) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_undefined_ph(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_minmax_sh(W, U, A, B, C) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_minmax_sh(U, A, B, C) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_minmax_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_undefined_ph(), (__mmask8) - 1, (int)(R)))
+
+#define _mm_mask_minmax_round_sh(W, U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_minmax_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+#define _mm_minmax_ss(A, B, C) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_undefined_ps(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_minmax_ss(W, U, A, B, C) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_minmax_ss(U, A, B, C) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_minmax_round_ss(A, B, C, R) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_undefined_ps(), (__mmask8) - 1, (int)(R)))
+
+#define _mm_mask_minmax_round_ss(W, U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_minmax_round_ss(U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
+
+#endif // __AVX10_2_512MINMAXINTRIN_H
diff --git a/clang/lib/Headers/avx10_2minmaxintrin.h b/clang/lib/Headers/avx10_2minmaxintrin.h
new file mode 100644
index 0000000000000..48539dd65b5b9
--- /dev/null
+++ b/clang/lib/Headers/avx10_2minmaxintrin.h
@@ -0,0 +1,188 @@
+/*===--------------- avx10_2minmaxintrin.h - AVX10_2MINMAX intrinsics
+ *-----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2minmaxintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVX10_2MINMAXINTRIN_H
+#define __AVX10_2MINMAXINTRIN_H
+
+#define _mm_minmaxne_pbh(A, B, C) \
+ ((__m128bh)__builtin_ia32_vminmaxnepbf16128( \
+ (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C)))
+
+#define _mm_mask_minmaxne_pbh(W, U, A, B, C) \
+ ((__m128bh)__builtin_ia32_selectpbf_128( \
+ (__mmask8)(U), \
+ (__v8bf)__builtin_ia32_vminmaxnepbf16128( \
+ (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C)), \
+ (__v8bf)(W)))
+
+#define _mm_maskz_minmaxne_pbh(U, A, B, C) \
+ ((__m128bh)__builtin_ia32_selectpbf_128( \
+ (__mmask8)(U), \
+ (__v8bf)__builtin_ia32_vminmaxnepbf16128( \
+ (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C)), \
+ (__v8bf) __builtin_bit_cast(__m128bh, _mm_setzero_ps())))
+
+#define _mm256_minmaxne_pbh(A, B, C) \
+ ((__m256bh)__builtin_ia32_vminmaxnepbf16256( \
+ (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C)))
+
+#define _mm256_mask_minmaxne_pbh(W, U, A, B, C) \
+ ((__m256bh)__builtin_ia32_selectpbf_256( \
+ (__mmask16)(U), \
+ (__v16bf)__builtin_ia32_vminmaxnepbf16256( \
+ (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C)), \
+ (__v16bf)(W)))
+
+#define _mm256_maskz_minmaxne_pbh(U, A, B, C) \
+ ((__m256bh)__builtin_ia32_selectpbf_256( \
+ (__mmask16)(U), \
+ (__v16bf)__builtin_ia32_vminmaxnepbf16256( \
+ (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C)), \
+ (__v16bf) __builtin_bit_cast(__m256bh, _mm256_setzero_ps())))
+
+#define _mm_minmax_pd(A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(-1)))
+
+#define _mm_mask_minmax_pd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_minmax_pd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
+ (__v2df)(__m1...
[truncated]
|
@llvm/pr-subscribers-mc Author: Freddy Ye (FreddyLeaf) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Patch is 307.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101598.diff 25 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4c7bd099420ab..c54627586a240 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -220,6 +220,8 @@ X86 Support
found in the file ``clang/www/builtins.py``.
- Support ISA of ``AVX10.2``.
+ * Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and
+ ``*_(mask(z)))_minmax_s[s|d|h]``.
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index f028711a807c0..3200e0112adce 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2022,6 +2022,22 @@ TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
+// AVX10-MINMAX
+TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16128, "V8yV8yV8yIi", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16256, "V16yV16yV16yIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16512, "V32yV32yV32yIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxpd128_mask, "V2dV2dV2dIiV2dUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxpd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxpd512_round_mask, "V8dV8dV8dIiV8dUcIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxph128_mask, "V8xV8xV8xIiV8xUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxph256_round_mask, "V16xV16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxph512_round_mask, "V32xV32xV32xIiV32xUiIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxps128_mask, "V4fV4fV4fIiV4fUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxps512_round_mask, "V16fV16fV16fIiV16fUsIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxsd_round_mask, "V2dV2dV2dIiV2dUcIi", "nV:128:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxsh_round_mask, "V8xV8xV8xIiV8xUcIi", "nV:128:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxss_round_mask, "V4fV4fV4fIiV4fUcIi", "nV:128:", "avx10.2-512")
#undef BUILTIN
#undef TARGET_BUILTIN
#undef TARGET_HEADER_BUILTIN
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index b17ab24d625a0..f3d19e38f8f2b 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,7 +147,9 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ avx10_2_512minmaxintrin.h
avx10_2_512niintrin.h
+ avx10_2minmaxintrin.h
avx10_2niintrin.h
avx2intrin.h
avx512bf16intrin.h
diff --git a/clang/lib/Headers/avx10_2_512minmaxintrin.h b/clang/lib/Headers/avx10_2_512minmaxintrin.h
new file mode 100644
index 0000000000000..ee486cb24f3d9
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512minmaxintrin.h
@@ -0,0 +1,219 @@
+/*===--------------- avx10_2_512minmaxintrin.h - AVX10_2_512MINMAX intrinsics
+ *-----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512minmaxintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVX10_2_512MINMAXINTRIN_H
+#define __AVX10_2_512MINMAXINTRIN_H
+
+#define _mm512_minmaxne_pbh(A, B, C) \
+ ((__m512bh)__builtin_ia32_vminmaxnepbf16512( \
+ (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(A), (int)(C)))
+
+#define _mm512_mask_minmaxne_pbh(W, U, A, B, C) \
+ ((__m512bh)__builtin_ia32_selectpbf_512( \
+ (__mmask32)(U), \
+ (__v32bf)__builtin_ia32_vminmaxnepbf16512( \
+ (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(B), (int)(C)), \
+ (__v32bf)(__m512bh)(W)))
+
+#define _mm512_maskz_minmaxne_pbh(U, A, B, C) \
+ ((__m512bh)__builtin_ia32_selectpbf_512( \
+ (__mmask32)(U), \
+ (__v32bf)__builtin_ia32_vminmaxnepbf16512( \
+ (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(B), (int)(C)), \
+ (__v32bf) __builtin_bit_cast(__m512bh, _mm512_setzero_ps())))
+
+#define _mm512_minmax_pd(A, B, C) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_undefined_pd(), (__mmask8) - 1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_minmax_pd(W, U, A, B, C) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_minmax_pd(U, A, B, C) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_minmax_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_undefined_pd(), (__mmask8) - 1, (int)(R)))
+
+#define _mm512_mask_minmax_round_pd(W, U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_minmax_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
+
+#define _mm512_minmax_ph(A, B, C) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32) - 1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_minmax_ph(W, U, A, B, C) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)(__m512h)(W), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_minmax_ph(U, A, B, C) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_minmax_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32) - 1, (int)(R)))
+
+#define _mm512_mask_minmax_round_ph(W, U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_minmax_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
+
+#define _mm512_minmax_ps(A, B, C) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_undefined_ps(), (__mmask16) - 1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_minmax_ps(W, U, A, B, C) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
+ (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_minmax_ps(U, A, B, C) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_minmax_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_undefined_ps(), (__mmask16) - 1, (int)(R)))
+
+#define _mm512_mask_minmax_round_ps(W, U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_minmax_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
+
+#define _mm_minmax_sd(A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_undefined_pd(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_minmax_sd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_minmax_sd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_minmax_round_sd(A, B, C, R) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_undefined_pd(), (__mmask8) - 1, (int)(R)))
+
+#define _mm_mask_minmax_round_sd(W, U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_minmax_round_sd(U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
+
+#define _mm_minmax_sh(A, B, C) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_undefined_ph(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_minmax_sh(W, U, A, B, C) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_minmax_sh(U, A, B, C) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_minmax_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_undefined_ph(), (__mmask8) - 1, (int)(R)))
+
+#define _mm_mask_minmax_round_sh(W, U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_minmax_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+#define _mm_minmax_ss(A, B, C) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_undefined_ps(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_minmax_ss(W, U, A, B, C) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_minmax_ss(U, A, B, C) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_minmax_round_ss(A, B, C, R) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_undefined_ps(), (__mmask8) - 1, (int)(R)))
+
+#define _mm_mask_minmax_round_ss(W, U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_minmax_round_ss(U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
+
+#endif // __AVX10_2_512MINMAXINTRIN_H
diff --git a/clang/lib/Headers/avx10_2minmaxintrin.h b/clang/lib/Headers/avx10_2minmaxintrin.h
new file mode 100644
index 0000000000000..48539dd65b5b9
--- /dev/null
+++ b/clang/lib/Headers/avx10_2minmaxintrin.h
@@ -0,0 +1,188 @@
+/*===--------------- avx10_2minmaxintrin.h - AVX10_2MINMAX intrinsics
+ *-----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2minmaxintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVX10_2MINMAXINTRIN_H
+#define __AVX10_2MINMAXINTRIN_H
+
+#define _mm_minmaxne_pbh(A, B, C) \
+ ((__m128bh)__builtin_ia32_vminmaxnepbf16128( \
+ (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C)))
+
+#define _mm_mask_minmaxne_pbh(W, U, A, B, C) \
+ ((__m128bh)__builtin_ia32_selectpbf_128( \
+ (__mmask8)(U), \
+ (__v8bf)__builtin_ia32_vminmaxnepbf16128( \
+ (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C)), \
+ (__v8bf)(W)))
+
+#define _mm_maskz_minmaxne_pbh(U, A, B, C) \
+ ((__m128bh)__builtin_ia32_selectpbf_128( \
+ (__mmask8)(U), \
+ (__v8bf)__builtin_ia32_vminmaxnepbf16128( \
+ (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C)), \
+ (__v8bf) __builtin_bit_cast(__m128bh, _mm_setzero_ps())))
+
+#define _mm256_minmaxne_pbh(A, B, C) \
+ ((__m256bh)__builtin_ia32_vminmaxnepbf16256( \
+ (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C)))
+
+#define _mm256_mask_minmaxne_pbh(W, U, A, B, C) \
+ ((__m256bh)__builtin_ia32_selectpbf_256( \
+ (__mmask16)(U), \
+ (__v16bf)__builtin_ia32_vminmaxnepbf16256( \
+ (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C)), \
+ (__v16bf)(W)))
+
+#define _mm256_maskz_minmaxne_pbh(U, A, B, C) \
+ ((__m256bh)__builtin_ia32_selectpbf_256( \
+ (__mmask16)(U), \
+ (__v16bf)__builtin_ia32_vminmaxnepbf16256( \
+ (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C)), \
+ (__v16bf) __builtin_bit_cast(__m256bh, _mm256_setzero_ps())))
+
+#define _mm_minmax_pd(A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(-1)))
+
+#define _mm_mask_minmax_pd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_minmax_pd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
+ (__v2df)(__m1...
[truncated]
|
@llvm/pr-subscribers-llvm-ir Author: Freddy Ye (FreddyLeaf) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Patch is 307.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101598.diff 25 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4c7bd099420ab..c54627586a240 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -220,6 +220,8 @@ X86 Support
found in the file ``clang/www/builtins.py``.
- Support ISA of ``AVX10.2``.
+ * Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and
+ ``*_(mask(z)))_minmax_s[s|d|h]``.
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index f028711a807c0..3200e0112adce 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2022,6 +2022,22 @@ TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
+// AVX10-MINMAX
+TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16128, "V8yV8yV8yIi", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16256, "V16yV16yV16yIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16512, "V32yV32yV32yIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxpd128_mask, "V2dV2dV2dIiV2dUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxpd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxpd512_round_mask, "V8dV8dV8dIiV8dUcIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxph128_mask, "V8xV8xV8xIiV8xUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxph256_round_mask, "V16xV16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxph512_round_mask, "V32xV32xV32xIiV32xUiIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxps128_mask, "V4fV4fV4fIiV4fUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminmaxps512_round_mask, "V16fV16fV16fIiV16fUsIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxsd_round_mask, "V2dV2dV2dIiV2dUcIi", "nV:128:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxsh_round_mask, "V8xV8xV8xIiV8xUcIi", "nV:128:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminmaxss_round_mask, "V4fV4fV4fIiV4fUcIi", "nV:128:", "avx10.2-512")
#undef BUILTIN
#undef TARGET_BUILTIN
#undef TARGET_HEADER_BUILTIN
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index b17ab24d625a0..f3d19e38f8f2b 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,7 +147,9 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ avx10_2_512minmaxintrin.h
avx10_2_512niintrin.h
+ avx10_2minmaxintrin.h
avx10_2niintrin.h
avx2intrin.h
avx512bf16intrin.h
diff --git a/clang/lib/Headers/avx10_2_512minmaxintrin.h b/clang/lib/Headers/avx10_2_512minmaxintrin.h
new file mode 100644
index 0000000000000..ee486cb24f3d9
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512minmaxintrin.h
@@ -0,0 +1,219 @@
+/*===--------------- avx10_2_512minmaxintrin.h - AVX10_2_512MINMAX intrinsics
+ *-----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512minmaxintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVX10_2_512MINMAXINTRIN_H
+#define __AVX10_2_512MINMAXINTRIN_H
+
+#define _mm512_minmaxne_pbh(A, B, C) \
+ ((__m512bh)__builtin_ia32_vminmaxnepbf16512( \
+ (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(A), (int)(C)))
+
+#define _mm512_mask_minmaxne_pbh(W, U, A, B, C) \
+ ((__m512bh)__builtin_ia32_selectpbf_512( \
+ (__mmask32)(U), \
+ (__v32bf)__builtin_ia32_vminmaxnepbf16512( \
+ (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(B), (int)(C)), \
+ (__v32bf)(__m512bh)(W)))
+
+#define _mm512_maskz_minmaxne_pbh(U, A, B, C) \
+ ((__m512bh)__builtin_ia32_selectpbf_512( \
+ (__mmask32)(U), \
+ (__v32bf)__builtin_ia32_vminmaxnepbf16512( \
+ (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(B), (int)(C)), \
+ (__v32bf) __builtin_bit_cast(__m512bh, _mm512_setzero_ps())))
+
+#define _mm512_minmax_pd(A, B, C) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_undefined_pd(), (__mmask8) - 1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_minmax_pd(W, U, A, B, C) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_minmax_pd(U, A, B, C) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_minmax_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_undefined_pd(), (__mmask8) - 1, (int)(R)))
+
+#define _mm512_mask_minmax_round_pd(W, U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_minmax_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
+
+#define _mm512_minmax_ph(A, B, C) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32) - 1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_minmax_ph(W, U, A, B, C) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)(__m512h)(W), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_minmax_ph(U, A, B, C) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_minmax_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32) - 1, (int)(R)))
+
+#define _mm512_mask_minmax_round_ph(W, U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_minmax_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
+
+#define _mm512_minmax_ps(A, B, C) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_undefined_ps(), (__mmask16) - 1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_minmax_ps(W, U, A, B, C) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
+ (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_minmax_ps(U, A, B, C) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_minmax_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_undefined_ps(), (__mmask16) - 1, (int)(R)))
+
+#define _mm512_mask_minmax_round_ps(W, U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_minmax_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
+
+#define _mm_minmax_sd(A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_undefined_pd(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_minmax_sd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_minmax_sd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_minmax_round_sd(A, B, C, R) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_undefined_pd(), (__mmask8) - 1, (int)(R)))
+
+#define _mm_mask_minmax_round_sd(W, U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_minmax_round_sd(U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
+
+#define _mm_minmax_sh(A, B, C) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_undefined_ph(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_minmax_sh(W, U, A, B, C) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_minmax_sh(U, A, B, C) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_minmax_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_undefined_ph(), (__mmask8) - 1, (int)(R)))
+
+#define _mm_mask_minmax_round_sh(W, U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_minmax_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+#define _mm_minmax_ss(A, B, C) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_undefined_ps(), (__mmask8) - 1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_minmax_ss(W, U, A, B, C) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_minmax_ss(U, A, B, C) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_minmax_round_ss(A, B, C, R) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_undefined_ps(), (__mmask8) - 1, (int)(R)))
+
+#define _mm_mask_minmax_round_ss(W, U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_minmax_round_ss(U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vminmaxss_round_mask( \
+ (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
+
+#endif // __AVX10_2_512MINMAXINTRIN_H
diff --git a/clang/lib/Headers/avx10_2minmaxintrin.h b/clang/lib/Headers/avx10_2minmaxintrin.h
new file mode 100644
index 0000000000000..48539dd65b5b9
--- /dev/null
+++ b/clang/lib/Headers/avx10_2minmaxintrin.h
@@ -0,0 +1,188 @@
+/*===--------------- avx10_2minmaxintrin.h - AVX10_2MINMAX intrinsics
+ *-----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2minmaxintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVX10_2MINMAXINTRIN_H
+#define __AVX10_2MINMAXINTRIN_H
+
+#define _mm_minmaxne_pbh(A, B, C) \
+ ((__m128bh)__builtin_ia32_vminmaxnepbf16128( \
+ (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C)))
+
+#define _mm_mask_minmaxne_pbh(W, U, A, B, C) \
+ ((__m128bh)__builtin_ia32_selectpbf_128( \
+ (__mmask8)(U), \
+ (__v8bf)__builtin_ia32_vminmaxnepbf16128( \
+ (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C)), \
+ (__v8bf)(W)))
+
+#define _mm_maskz_minmaxne_pbh(U, A, B, C) \
+ ((__m128bh)__builtin_ia32_selectpbf_128( \
+ (__mmask8)(U), \
+ (__v8bf)__builtin_ia32_vminmaxnepbf16128( \
+ (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C)), \
+ (__v8bf) __builtin_bit_cast(__m128bh, _mm_setzero_ps())))
+
+#define _mm256_minmaxne_pbh(A, B, C) \
+ ((__m256bh)__builtin_ia32_vminmaxnepbf16256( \
+ (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C)))
+
+#define _mm256_mask_minmaxne_pbh(W, U, A, B, C) \
+ ((__m256bh)__builtin_ia32_selectpbf_256( \
+ (__mmask16)(U), \
+ (__v16bf)__builtin_ia32_vminmaxnepbf16256( \
+ (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C)), \
+ (__v16bf)(W)))
+
+#define _mm256_maskz_minmaxne_pbh(U, A, B, C) \
+ ((__m256bh)__builtin_ia32_selectpbf_256( \
+ (__mmask16)(U), \
+ (__v16bf)__builtin_ia32_vminmaxnepbf16256( \
+ (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C)), \
+ (__v16bf) __builtin_bit_cast(__m256bh, _mm256_setzero_ps())))
+
+#define _mm_minmax_pd(A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(-1)))
+
+#define _mm_mask_minmax_pd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
+ (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_minmax_pd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
+ (__v2df)(__m1...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965