
Commit 34deb76

[X86] IsElementEquivalent - add handling for ISD::BITCASTS from smaller vector elements (#139741)
Check if all smaller aliased source elements are equivalent
1 parent 6bb05ea commit 34deb76
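The existing ISD::BITCAST handling only covered bitcasts from wider-or-equal source elements; this change adds the opposite direction. When the shuffled element is assembled from several narrower source elements, element Idx aliases the Scale consecutive source elements starting at Idx * Scale, and two shuffle elements may only be treated as interchangeable if every aliased pair of source elements is itself equivalent. Below is a minimal standalone sketch of that index arithmetic, not the LLVM code itself; the function name and the SrcElemEquivalent callback are made up for illustration.

#include <functional>

// Sketch of the aliasing check: a wide element Idx, bitcast from narrower
// source elements, covers the source indices [Idx * Scale, Idx * Scale + Scale).
// Wide elements Idx and ExpectedIdx are interchangeable only if every aliased
// pair of source elements is equivalent (SrcElemEquivalent is a stand-in for
// the recursive per-element check).
static bool wideElementsEquivalent(
    unsigned Scale, unsigned Idx, unsigned ExpectedIdx,
    const std::function<bool(unsigned, unsigned)> &SrcElemEquivalent) {
  for (unsigned I = 0; I != Scale; ++I)
    if (!SrcElemEquivalent(Idx * Scale + I, ExpectedIdx * Scale + I))
      return false;
  return true;
}

For example, under these assumptions a v2i64 shuffle operand bitcast from a v4i32 source has Scale = 2, so deciding whether i64 lanes 0 and 1 are interchangeable means checking the i32 pairs (0, 2) and (1, 3).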

File tree

6 files changed (+64, -49 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 6 deletions

@@ -9987,19 +9987,29 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
         MaskSize == (int)ExpectedOp.getNumOperands())
       return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
     break;
-  case ISD::BITCAST:
-    if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
-      SDValue Src = peekThroughBitcasts(Op);
-      EVT SrcVT = Src.getValueType();
-      if (SrcVT.isVector() &&
-          (SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
+  case ISD::BITCAST: {
+    SDValue Src = peekThroughBitcasts(Op);
+    EVT SrcVT = Src.getValueType();
+    if (Op == ExpectedOp && SrcVT.isVector() &&
+        (int)VT.getVectorNumElements() == MaskSize) {
+      if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
         unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
         return (Idx % Scale) == (ExpectedIdx % Scale) &&
                IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
                                    Idx / Scale, ExpectedIdx / Scale);
       }
+      if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
+        unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+        for (unsigned I = 0; I != Scale; ++I)
+          if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
+                                   (Idx * Scale) + I,
+                                   (ExpectedIdx * Scale) + I))
+            return false;
+        return true;
+      }
     }
     break;
+  }
   case ISD::VECTOR_SHUFFLE: {
     auto *SVN = cast<ShuffleVectorSDNode>(Op);
     return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize &&

llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll

Lines changed: 3 additions & 3 deletions

@@ -532,10 +532,10 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT:    vpblendd $13, (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm1 = mem[0],xmm0[1],mem[2,3]
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $40, %rsp
 ; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq

llvm/test/CodeGen/X86/horizontal-sum.ll

Lines changed: 7 additions & 8 deletions

@@ -179,8 +179,8 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-SLOW-NEXT:    haddps %xmm7, %xmm6
 ; SSSE3-SLOW-NEXT:    haddps %xmm6, %xmm6
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
-; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm6 = xmm2[1],xmm6[1]
+; SSSE3-SLOW-NEXT:    movaps %xmm6, %xmm1
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:

@@ -345,8 +345,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSSE3-SLOW-NEXT:    phaddd %xmm7, %xmm6
 ; SSSE3-SLOW-NEXT:    phaddd %xmm6, %xmm6
-; SSSE3-SLOW-NEXT:    palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; SSSE3-SLOW-NEXT:    movdqa %xmm6, %xmm1
+; SSSE3-SLOW-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:

@@ -374,7 +373,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-SLOW-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
 ; AVX1-SLOW-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
 ; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]

@@ -397,7 +396,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
 ; AVX1-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
 ; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]

@@ -422,7 +421,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
 ; AVX2-SLOW-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-SLOW-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]

@@ -445,7 +444,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
 ; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]

llvm/test/CodeGen/X86/vector-half-conversions.ll

Lines changed: 9 additions & 9 deletions

@@ -3138,10 +3138,10 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT:    vpblendd $13, (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm1 = mem[0],xmm0[1],mem[2,3]
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $40, %rsp
 ; AVX512-NEXT:    retq
   %1 = fptrunc <2 x double> %a0 to <2 x half>

@@ -3272,8 +3272,8 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $72, %rsp
 ; AVX512-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>

@@ -3404,8 +3404,8 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $72, %rsp
 ; AVX512-NEXT:    retq
   %1 = fptrunc <4 x double> %a0 to <4 x half>

@@ -4107,8 +4107,8 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2@PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rbx)
 ; AVX512-NEXT:    addq $64, %rsp
 ; AVX512-NEXT:    popq %rbx

llvm/test/CodeGen/X86/vector-mul.ll

Lines changed: 6 additions & 12 deletions

@@ -1569,25 +1569,19 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
 }

 define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
-; X86-SSE2-LABEL: mul_v2i64_0_1:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    xorpd %xmm1, %xmm1
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE2-NEXT:    retl
+; SSE2-LABEL: mul_v2i64_0_1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; SSE4-LABEL: mul_v2i64_0_1:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    xorps %xmm1, %xmm1
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; SSE4-NEXT:    ret{{[l|q]}}
 ;
-; X64-SSE2-LABEL: mul_v2i64_0_1:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    xorps %xmm1, %xmm1
-; X64-SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; X64-SSE2-NEXT:    movaps %xmm1, %xmm0
-; X64-SSE2-NEXT:    retq
-;
 ; X64-AVX-LABEL: mul_v2i64_0_1:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1

llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll

Lines changed: 23 additions & 11 deletions

@@ -7065,17 +7065,29 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512BW-SLOW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
+; AVX512BW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
