Skip to content

Commit 7226b39

Browse files
[X86] Support vectorized llvm.maximum/llvm.minimum.vXf16 lowering (#120988)
Support the lowering of vectorized FMINIMUM and FMAXIMUM to vminph and vmaxph on types v8f16 and v16f16 when the AVX512FP16 and AVX512VL features are present, and on type v32f16 when AVX512FP16 is present.
1 parent 676b48d commit 7226b39

File tree

2 files changed

+66
-25
lines changed

2 files changed

+66
-25
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2333,6 +2333,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
23332333

23342334
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
23352335
setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2336+
2337+
setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2338+
setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
23362339
}
23372340

23382341
if (Subtarget.hasVLX()) {
@@ -2377,6 +2380,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
23772380
// Need to custom widen these to prevent scalarization.
23782381
setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
23792382
setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2383+
2384+
setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2385+
setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2386+
2387+
setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2388+
setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
23802389
}
23812390
}
23822391

llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll

Lines changed: 57 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ declare half @llvm.minimum.f16(half, half)
55
declare half @llvm.maximum.f16(half, half)
66
declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
77
declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
8+
declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
9+
declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
10+
declare <32 x half> @llvm.minimum.v32f16(<32 x half>, <32 x half>)
11+
declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
812

913
define half @test_fminimum(half %x, half %y) {
1014
; CHECK-LABEL: test_fminimum:
@@ -25,20 +29,10 @@ define half @test_fminimum(half %x, half %y) {
2529
ret half %z
2630
}
2731

28-
define <8 x half> @test_fminimum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
29-
; CHECK-LABEL: test_fminimum_scalarize:
32+
define <8 x half> @test_fminimum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
33+
; CHECK-LABEL: test_fminimum_v8f16:
3034
; CHECK: # %bb.0:
31-
; CHECK-NEXT: vcmpltph %xmm1, %xmm0, %k1
32-
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1}
33-
; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
34-
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm0, %k1
35-
; CHECK-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1}
36-
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
37-
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
38-
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
39-
; CHECK-NEXT: vcmpeqph %xmm1, %xmm2, %k1
40-
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1}
41-
; CHECK-NEXT: vmovdqa %xmm2, %xmm0
35+
; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0
4236
; CHECK-NEXT: retq
4337
%r = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y)
4438
ret <8 x half> %r
@@ -113,19 +107,10 @@ define half @test_fmaximum(half %x, half %y) {
113107
ret half %r
114108
}
115109

116-
define <8 x half> @test_fmaximum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
117-
; CHECK-LABEL: test_fmaximum_scalarize:
110+
define <8 x half> @test_fmaximum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
111+
; CHECK-LABEL: test_fmaximum_v8f16:
118112
; CHECK: # %bb.0:
119-
; CHECK-NEXT: vcmpltph %xmm0, %xmm1, %k1
120-
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1}
121-
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
122-
; CHECK-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1}
123-
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
124-
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
125-
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
126-
; CHECK-NEXT: vcmpeqph %xmm1, %xmm2, %k1
127-
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1}
128-
; CHECK-NEXT: vmovdqa %xmm2, %xmm0
113+
; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0
129114
; CHECK-NEXT: retq
130115
%r = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y)
131116
ret <8 x half> %r
@@ -186,3 +171,50 @@ define half @test_fmaximum_combine_cmps(half %x, half %y) {
186171
%2 = tail call half @llvm.maximum.f16(half %x, half %1)
187172
ret half %2
188173
}
174+
175+
define <16 x half> @test_fminimum_v16f16(<16 x half> %x, <16 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
176+
; CHECK-LABEL: test_fminimum_v16f16:
177+
; CHECK: # %bb.0:
178+
; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0
179+
; CHECK-NEXT: retq
180+
%r = call <16 x half> @llvm.minimum.v16f16(<16 x half> %x, <16 x half> %y)
181+
ret <16 x half> %r
182+
}
183+
184+
define <16 x half> @test_fmaximum_v16f16_nans(<16 x half> %x, <16 x half> %y) "no-signed-zeros-fp-math"="true" {
185+
; CHECK-LABEL: test_fmaximum_v16f16_nans:
186+
; CHECK: # %bb.0:
187+
; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm1
188+
; CHECK-NEXT: vcmpunordph %ymm0, %ymm0, %k1
189+
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
190+
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
191+
; CHECK-NEXT: retq
192+
%r = call <16 x half> @llvm.maximum.v16f16(<16 x half> %x, <16 x half> %y)
193+
ret <16 x half> %r
194+
}
195+
196+
define <32 x half> @test_fminimum_v32f16_szero(<32 x half> %x, <32 x half> %y) "no-nans-fp-math"="true" {
197+
; CHECK-LABEL: test_fminimum_v32f16_szero:
198+
; CHECK: # %bb.0:
199+
; CHECK-NEXT: vpmovw2m %zmm0, %k1
200+
; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm2 {%k1}
201+
; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
202+
; CHECK-NEXT: vminph %zmm2, %zmm0, %zmm0
203+
; CHECK-NEXT: retq
204+
%r = call <32 x half> @llvm.minimum.v32f16(<32 x half> %x, <32 x half> %y)
205+
ret <32 x half> %r
206+
}
207+
208+
define <32 x half> @test_fmaximum_v32f16_nans_szero(<32 x half> %x, <32 x half> %y) {
209+
; CHECK-LABEL: test_fmaximum_v32f16_nans_szero:
210+
; CHECK: # %bb.0:
211+
; CHECK-NEXT: vpmovw2m %zmm0, %k1
212+
; CHECK-NEXT: vpblendmw %zmm1, %zmm0, %zmm2 {%k1}
213+
; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
214+
; CHECK-NEXT: vmaxph %zmm2, %zmm1, %zmm0
215+
; CHECK-NEXT: vcmpunordph %zmm1, %zmm1, %k1
216+
; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
217+
; CHECK-NEXT: retq
218+
%r = call <32 x half> @llvm.maximum.v32f16(<32 x half> %x, <32 x half> %y)
219+
ret <32 x half> %r
220+
}

0 commit comments

Comments
 (0)