Skip to content

[X86][AVX10.2] Remove YMM rounding from VMINMAXP[H,S,D] #132405

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 commits merged on Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -4823,7 +4823,7 @@ let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] i
}

let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
def vminmaxpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
def vminmaxpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
}

let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
Expand All @@ -4835,7 +4835,7 @@ let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] i
}

let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
def vminmaxph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
def vminmaxph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
}

let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
Expand All @@ -4847,7 +4847,7 @@ let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] i
}

let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
def vminmaxps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
def vminmaxps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
}

let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
Expand Down
81 changes: 18 additions & 63 deletions clang/lib/Headers/avx10_2minmaxintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,34 +66,19 @@
(__v2df)_mm_setzero_pd(), (__mmask8)(U)))

#define _mm256_minmax_pd(A, B, C) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
((__m256d)__builtin_ia32_vminmaxpd256_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_setzero_pd(), (__mmask8)-1, _MM_FROUND_NO_EXC))
(__v4df)_mm256_setzero_pd(), (__mmask8)-1))

#define _mm256_mask_minmax_pd(W, U, A, B, C) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
((__m256d)__builtin_ia32_vminmaxpd256_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
(__v4df)(__m256d)(W), (__mmask8)(U), _MM_FROUND_NO_EXC))
(__v4df)(__m256d)(W), (__mmask8)(U)))

#define _mm256_maskz_minmax_pd(U, A, B, C) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
((__m256d)__builtin_ia32_vminmaxpd256_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_setzero_pd(), (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_minmax_round_pd(A, B, C, R) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_minmax_round_pd(W, U, A, B, C, R) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
(__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_minmax_round_pd(U, A, B, C, R) \
((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
(__v4df)_mm256_setzero_pd(), (__mmask8)(U)))

#define _mm_minmax_ph(A, B, C) \
((__m128h)__builtin_ia32_vminmaxph128_mask( \
Expand All @@ -111,34 +96,19 @@
(__v8hf)_mm_setzero_ph(), (__mmask8)(U)))

#define _mm256_minmax_ph(A, B, C) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
((__m256h)__builtin_ia32_vminmaxph256_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
(__v16hf)_mm256_setzero_ph(), (__mmask16)-1, _MM_FROUND_NO_EXC))
(__v16hf)_mm256_setzero_ph(), (__mmask16)-1))

#define _mm256_mask_minmax_ph(W, U, A, B, C) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
((__m256h)__builtin_ia32_vminmaxph256_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
(__v16hf)(__m256h)(W), (__mmask16)(U), _MM_FROUND_NO_EXC))
(__v16hf)(__m256h)(W), (__mmask16)(U)))

#define _mm256_maskz_minmax_ph(U, A, B, C) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
(__v16hf)_mm256_setzero_ph(), (__mmask16)(U), _MM_FROUND_NO_EXC))

#define _mm256_minmax_round_ph(A, B, C, R) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
((__m256h)__builtin_ia32_vminmaxph256_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
(__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm256_mask_minmax_round_ph(W, U, A, B, C, R) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (C), \
(__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_minmax_round_ph(U, A, B, C, R) \
((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
(__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
(__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))

#define _mm_minmax_ps(A, B, C) \
((__m128)__builtin_ia32_vminmaxps128_mask( \
Expand All @@ -156,34 +126,19 @@
(__v4sf)_mm_setzero_ps(), (__mmask8)(U)))

#define _mm256_minmax_ps(A, B, C) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
((__m256)__builtin_ia32_vminmaxps256_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_setzero_ps(), (__mmask8)-1, _MM_FROUND_NO_EXC))
(__v8sf)_mm256_setzero_ps(), (__mmask8)-1))

#define _mm256_mask_minmax_ps(W, U, A, B, C) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
((__m256)__builtin_ia32_vminmaxps256_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
(__mmask8)(U), _MM_FROUND_NO_EXC))
(__mmask8)(U)))

#define _mm256_maskz_minmax_ps(U, A, B, C) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_setzero_ps(), (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_minmax_round_ps(A, B, C, R) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_minmax_round_ps(W, U, A, B, C, R) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
(__mmask8)(U), (int)(R)))

#define _mm256_maskz_minmax_round_ps(U, A, B, C, R) \
((__m256)__builtin_ia32_vminmaxps256_round_mask( \
((__m256)__builtin_ia32_vminmaxps256_mask( \
(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
(__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))

#define _mm_minmax_sd(A, B, C) \
((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
Expand Down
9 changes: 3 additions & 6 deletions clang/lib/Sema/SemaX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,6 @@ bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_rndscalesd_round_mask:
case X86::BI__builtin_ia32_rndscaless_round_mask:
case X86::BI__builtin_ia32_rndscalesh_round_mask:
case X86::BI__builtin_ia32_vminmaxpd256_round_mask:
case X86::BI__builtin_ia32_vminmaxps256_round_mask:
case X86::BI__builtin_ia32_vminmaxph256_round_mask:
case X86::BI__builtin_ia32_vminmaxpd512_round_mask:
case X86::BI__builtin_ia32_vminmaxps512_round_mask:
case X86::BI__builtin_ia32_vminmaxph512_round_mask:
Expand Down Expand Up @@ -910,11 +907,11 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
case X86::BI__builtin_ia32_vminmaxbf16256:
case X86::BI__builtin_ia32_vminmaxbf16512:
case X86::BI__builtin_ia32_vminmaxpd128_mask:
case X86::BI__builtin_ia32_vminmaxpd256_round_mask:
case X86::BI__builtin_ia32_vminmaxpd256_mask:
case X86::BI__builtin_ia32_vminmaxph128_mask:
case X86::BI__builtin_ia32_vminmaxph256_round_mask:
case X86::BI__builtin_ia32_vminmaxph256_mask:
case X86::BI__builtin_ia32_vminmaxps128_mask:
case X86::BI__builtin_ia32_vminmaxps256_round_mask:
case X86::BI__builtin_ia32_vminmaxps256_mask:
case X86::BI__builtin_ia32_vminmaxpd512_round_mask:
case X86::BI__builtin_ia32_vminmaxps512_round_mask:
case X86::BI__builtin_ia32_vminmaxph512_round_mask:
Expand Down
11 changes: 0 additions & 11 deletions clang/test/CodeGen/X86/avx10_2_512minmax-error.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,17 +113,6 @@ __m512 test_mm512_minmax_round_ps(__m512 __A, __m512 __B) {
return _mm512_minmax_round_ps(__A, __B, 127, 11); // expected-error {{invalid rounding argument}}
}

__m256d test_mm256_minmax_round_pd(__m256d __A, __m256d __B) {
return _mm256_minmax_round_pd(__A, __B, 127, 11); // expected-error {{invalid rounding argument}}
}

__m256h test_mm256_minmax_round_ph(__m256h __A, __m256h __B) {
return _mm256_minmax_round_ph(__A, __B, 127, 11); // expected-error {{invalid rounding argument}}
}

__m256 test_mm256_minmax_round_ps(__m256 __A, __m256 __B) {
return _mm256_minmax_round_ps(__A, __B, 127, 11); // expected-error {{invalid rounding argument}}
}
__m128d test_mm_minmax_round_sd(__m128d __A, __m128d __B) {
return _mm_minmax_round_sd(__A, __B, 127, 11); // expected-error {{invalid rounding argument}}
}
Expand Down
72 changes: 9 additions & 63 deletions clang/test/CodeGen/X86/avx10_2minmax-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,40 +67,22 @@ __m128d test_mm_maskz_minmax_pd(__mmask8 __A, __m128d __B, __m128d __C) {

__m256d test_mm256_minmax_pd(__m256d __A, __m256d __B) {
// CHECK-LABEL: @test_mm256_minmax_pd(
// CHECK: call <4 x double> @llvm.x86.avx10.mask.vminmaxpd256.round(
// CHECK: call <4 x double> @llvm.x86.avx10.mask.vminmaxpd256(
return _mm256_minmax_pd(__A, __B, 127);
}

__m256d test_mm256_mask_minmax_pd(__m256d __A, __mmask8 __B, __m256d __C, __m256d __D) {
// CHECK-LABEL: @test_mm256_mask_minmax_pd(
// CHECK: call <4 x double> @llvm.x86.avx10.mask.vminmaxpd256.round(
// CHECK: call <4 x double> @llvm.x86.avx10.mask.vminmaxpd256(
return _mm256_mask_minmax_pd(__A, __B, __C, __D, 127);
}

__m256d test_mm256_maskz_minmax_pd(__mmask8 __A, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_maskz_minmax_pd(
// CHECK: call <4 x double> @llvm.x86.avx10.mask.vminmaxpd256.round(
// CHECK: call <4 x double> @llvm.x86.avx10.mask.vminmaxpd256(
return _mm256_maskz_minmax_pd(__A, __B, __C, 127);
}

__m256d test_mm256_minmax_round_pd(__m256d __A, __m256d __B) {
// CHECK-LABEL: @test_mm256_minmax_round_pd(
// CHECK: call <4 x double> @llvm.x86.avx10.mask.vminmaxpd256.round(
return _mm256_minmax_round_pd(__A, __B, 127, _MM_FROUND_NO_EXC);
}

__m256d test_mm256_mask_minmax_round_pd(__m256d __A, __mmask8 __B, __m256d __C, __m256d __D) {
// CHECK-LABEL: @test_mm256_mask_minmax_round_pd(
// CHECK: call <4 x double> @llvm.x86.avx10.mask.vminmaxpd256.round(
return _mm256_mask_minmax_round_pd(__A, __B, __C, __D, 127, _MM_FROUND_NO_EXC);
}

__m256d test_mm256_maskz_minmax_round_pd(__mmask8 __A, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_maskz_minmax_round_pd(
// CHECK: call <4 x double> @llvm.x86.avx10.mask.vminmaxpd256.round(
return _mm256_maskz_minmax_round_pd(__A, __B, __C, 127, _MM_FROUND_NO_EXC);
}

__m128h test_mm_minmax_ph(__m128h __A, __m128h __B) {
// CHECK-LABEL: @test_mm_minmax_ph(
// CHECK: call <8 x half> @llvm.x86.avx10.mask.vminmaxph128(
Expand All @@ -121,40 +103,22 @@ __m128h test_mm_maskz_minmax_ph(__mmask8 __A, __m128h __B, __m128h __C) {

__m256h test_mm256_minmax_ph(__m256h __A, __m256h __B) {
// CHECK-LABEL: @test_mm256_minmax_ph(
// CHECK: call <16 x half> @llvm.x86.avx10.mask.vminmaxph256.round(
// CHECK: call <16 x half> @llvm.x86.avx10.mask.vminmaxph256(
return _mm256_minmax_ph(__A, __B, 127);
}

__m256h test_mm256_mask_minmax_ph(__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) {
// CHECK-LABEL: @test_mm256_mask_minmax_ph(
// CHECK: call <16 x half> @llvm.x86.avx10.mask.vminmaxph256.round(
// CHECK: call <16 x half> @llvm.x86.avx10.mask.vminmaxph256(
return _mm256_mask_minmax_ph(__A, __B, __C, __D, 127);
}

__m256h test_mm256_maskz_minmax_ph(__mmask16 __A, __m256h __B, __m256h __C) {
// CHECK-LABEL: @test_mm256_maskz_minmax_ph(
// CHECK: call <16 x half> @llvm.x86.avx10.mask.vminmaxph256.round(
// CHECK: call <16 x half> @llvm.x86.avx10.mask.vminmaxph256(
return _mm256_maskz_minmax_ph(__A, __B, __C, 127);
}

__m256h test_mm256_minmax_round_ph(__m256h __A, __m256h __B) {
// CHECK-LABEL: @test_mm256_minmax_round_ph(
// CHECK: call <16 x half> @llvm.x86.avx10.mask.vminmaxph256.round(
return _mm256_minmax_round_ph(__A, __B, 127, _MM_FROUND_NO_EXC);
}

__m256h test_mm256_mask_minmax_round_ph(__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) {
// CHECK-LABEL: @test_mm256_mask_minmax_round_ph(
// CHECK: call <16 x half> @llvm.x86.avx10.mask.vminmaxph256.round(
return _mm256_mask_minmax_round_ph(__A, __B, __C, __D, 127, _MM_FROUND_NO_EXC);
}

__m256h test_mm256_maskz_minmax_round_ph(__mmask16 __A, __m256h __B, __m256h __C) {
// CHECK-LABEL: @test_mm256_maskz_minmax_round_ph(
// CHECK: call <16 x half> @llvm.x86.avx10.mask.vminmaxph256.round(
return _mm256_maskz_minmax_round_ph(__A, __B, __C, 127, _MM_FROUND_NO_EXC);
}

__m128 test_mm_minmax_ps(__m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_minmax_ps(
// CHECK: call <4 x float> @llvm.x86.avx10.mask.vminmaxps128(
Expand All @@ -175,40 +139,22 @@ __m128 test_mm_maskz_minmax_ps(__mmask8 __A, __m128 __B, __m128 __C) {

__m256 test_mm256_minmax_ps(__m256 __A, __m256 __B) {
// CHECK-LABEL: @test_mm256_minmax_ps(
// CHECK: call <8 x float> @llvm.x86.avx10.mask.vminmaxps256.round(
// CHECK: call <8 x float> @llvm.x86.avx10.mask.vminmaxps256(
return _mm256_minmax_ps(__A, __B, 127);
}

__m256 test_mm256_mask_minmax_ps(__m256 __A, __mmask8 __B, __m256 __C, __m256 __D) {
// CHECK-LABEL: @test_mm256_mask_minmax_ps(
// CHECK: call <8 x float> @llvm.x86.avx10.mask.vminmaxps256.round(
// CHECK: call <8 x float> @llvm.x86.avx10.mask.vminmaxps256(
return _mm256_mask_minmax_ps(__A, __B, __C, __D, 127);
}

__m256 test_mm256_maskz_minmax_ps(__mmask8 __A, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_maskz_minmax_ps(
// CHECK: call <8 x float> @llvm.x86.avx10.mask.vminmaxps256.round(
// CHECK: call <8 x float> @llvm.x86.avx10.mask.vminmaxps256(
return _mm256_maskz_minmax_ps(__A, __B, __C, 127);
}

__m256 test_mm256_minmax_round_ps(__m256 __A, __m256 __B) {
// CHECK-LABEL: @test_mm256_minmax_round_ps(
// CHECK: call <8 x float> @llvm.x86.avx10.mask.vminmaxps256.round(
return _mm256_minmax_round_ps(__A, __B, 127, _MM_FROUND_NO_EXC);
}

__m256 test_mm256_mask_minmax_round_ps(__m256 __A, __mmask8 __B, __m256 __C, __m256 __D) {
// CHECK-LABEL: @test_mm256_mask_minmax_round_ps(
// CHECK: call <8 x float> @llvm.x86.avx10.mask.vminmaxps256.round(
return _mm256_mask_minmax_round_ps(__A, __B, __C, __D, 127, _MM_FROUND_NO_EXC);
}

__m256 test_mm256_maskz_minmax_round_ps(__mmask8 __A, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_maskz_minmax_round_ps(
// CHECK: call <8 x float> @llvm.x86.avx10.mask.vminmaxps256.round(
return _mm256_maskz_minmax_round_ps(__A, __B, __C, 127, _MM_FROUND_NO_EXC);
}

__m128d test_mm_minmax_sd(__m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_minmax_sd(
// CHECK: call <2 x double> @llvm.x86.avx10.mask.vminmaxsd.round(
Expand Down
Loading
Loading