Skip to content

[X86][AVX10.2] Support AVX10.2-MINMAX new instructions. #101598

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ X86 Support
found in the file ``clang/www/builtins.py``.

- Support ISA of ``AVX10.2``.
* Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and
``*_(mask(z)))_minmax_s[s|d|h]``.

Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
16 changes: 16 additions & 0 deletions clang/include/clang/Basic/BuiltinsX86.def
Original file line number Diff line number Diff line change
Expand Up @@ -2142,6 +2142,22 @@ TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")

// AVX10 MINMAX
TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16128, "V8yV8yV8yIi", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16256, "V16yV16yV16yIi", "nV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16512, "V32yV32yV32yIi", "nV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vminmaxpd128_mask, "V2dV2dV2dIiV2dUc", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxpd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxpd512_round_mask, "V8dV8dV8dIiV8dUcIi", "nV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vminmaxph128_mask, "V8xV8xV8xIiV8xUc", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxph256_round_mask, "V16xV16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxph512_round_mask, "V32xV32xV32xIiV32xUiIi", "nV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vminmaxps128_mask, "V4fV4fV4fIiV4fUc", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxps512_round_mask, "V16fV16fV16fIiV16fUsIi", "nV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vminmaxsd_round_mask, "V2dV2dV2dIiV2dUcIi", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxsh_round_mask, "V8xV8xV8xIiV8xUcIi", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxss_round_mask, "V4fV4fV4fIiV4fUcIi", "nV:128:", "avx10.2-256")
#undef BUILTIN
#undef TARGET_BUILTIN
#undef TARGET_HEADER_BUILTIN
2 changes: 2 additions & 0 deletions clang/lib/Headers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,9 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
avx10_2_512minmaxintrin.h
avx10_2_512niintrin.h
avx10_2minmaxintrin.h
avx10_2niintrin.h
avx2intrin.h
avx512bf16intrin.h
Expand Down
127 changes: 127 additions & 0 deletions clang/lib/Headers/avx10_2_512minmaxintrin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/*===---- avx10_2_512minmaxintrin.h - AVX10_2_512MINMAX intrinsics ---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx10_2_512minmaxintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2_512MINMAXINTRIN_H
#define __AVX10_2_512MINMAXINTRIN_H

#define _mm512_minmaxne_pbh(A, B, C) \
((__m512bh)__builtin_ia32_vminmaxnepbf16512( \
(__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(A), (int)(C)))

#define _mm512_mask_minmaxne_pbh(W, U, A, B, C) \
((__m512bh)__builtin_ia32_selectpbf_512( \
(__mmask32)(U), \
(__v32bf)_mm512_minmaxne_pbh((__v32bf)(__m512bh)(A), \
(__v32bf)(__m512bh)(B), (int)(C)), \
(__v32bf)(__m512bh)(W)))

#define _mm512_maskz_minmaxne_pbh(U, A, B, C) \
((__m512bh)__builtin_ia32_selectpbf_512( \
(__mmask32)(U), \
(__v32bf)_mm512_minmaxne_pbh((__v32bf)(__m512bh)(A), \
(__v32bf)(__m512bh)(B), (int)(C)), \
(__v32bf) __builtin_bit_cast(__m512bh, _mm512_setzero_ps())))

#define _mm512_minmax_pd(A, B, C) \
((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_undefined_pd(), (__mmask8)-1, \
_MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_pd(W, U, A, B, C) \
((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
(__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_pd(U, A, B, C) \
((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_pd(A, B, C, R) \
((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm512_mask_minmax_round_pd(W, U, A, B, C, R) \
((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
(__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_minmax_round_pd(U, A, B, C, R) \
((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm512_minmax_ph(A, B, C) \
((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
(__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
_MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_ph(W, U, A, B, C) \
((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
(__v32hf)(__m512h)(W), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_ph(U, A, B, C) \
((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
(__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_ph(A, B, C, R) \
((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
(__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_minmax_round_ph(W, U, A, B, C, R) \
((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
(__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_minmax_round_ph(U, A, B, C, R) \
((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
(__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

#define _mm512_minmax_ps(A, B, C) \
((__m512)__builtin_ia32_vminmaxps512_round_mask( \
(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \
_MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_ps(W, U, A, B, C) \
((__m512)__builtin_ia32_vminmaxps512_round_mask( \
(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
(__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_ps(U, A, B, C) \
((__m512)__builtin_ia32_vminmaxps512_round_mask( \
(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_ps(A, B, C, R) \
((__m512)__builtin_ia32_vminmaxps512_round_mask( \
(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_minmax_round_ps(W, U, A, B, C, R) \
((__m512)__builtin_ia32_vminmaxps512_round_mask( \
(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
(__mmask16)(U), (int)(R)))

#define _mm512_maskz_minmax_round_ps(U, A, B, C, R) \
((__m512)__builtin_ia32_vminmaxps512_round_mask( \
(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
#endif // __AVX10_2_512MINMAXINTRIN_H
Loading
Loading