
Commit 0e1f9fb

[X86] matchBinaryPermuteShuffle - match AVX512 "cross lane" SHLDQ/SRLDQ patterns using VALIGN
Very similar to what we already do in lowerShuffleAsVALIGN.

I've updated isTargetShuffleEquivalent to correctly handle SM_SentinelZero in the expected shuffle mask, but it only allows an exact match (or the mask element under test is undef) - it can't be used to match zero elements with MaskedVectorIsZero.

Noticed while working on llvm#140516
1 parent 52e1995 commit 0e1f9fb
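To illustrate the matching rule described in the message, here is a minimal standalone sketch, not the LLVM implementation: the sentinel values mirror LLVM's conventions but are assumed here, and matchesExpectedMask is a made-up name. The point is that an expected SM_SentinelZero entry is only satisfied by an exact sentinel in the mask under test, never by proving the element zero via known bits.

#include <cstddef>
#include <cstdio>
#include <vector>

// Sentinel values mirroring LLVM's shuffle-mask conventions (assumed here).
constexpr int SM_SentinelUndef = -1; // "don't care" element
constexpr int SM_SentinelZero = -2;  // element must be zero

// Simplified model of the comparison: an undef mask element matches anything,
// everything else needs an exact match, and a missed expected zero fails
// outright - there is no fallback to MaskedVectorIsZero-style known-bits
// reasoning. (The real isTargetShuffleEquivalent also handles zeroable mask
// elements and commuted operands, which this sketch omits.)
static bool matchesExpectedMask(const std::vector<int> &Mask,
                                const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (std::size_t I = 0, E = Mask.size(); I != E; ++I) {
    if (Mask[I] == SM_SentinelUndef || Mask[I] == Expected[I])
      continue;
    // Covers Expected[I] == SM_SentinelZero with a non-sentinel mask element.
    return false;
  }
  return true;
}

int main() {
  // Expected mask for "shift one zero into the low element": [Z, 0, 1, 2].
  const std::vector<int> Expected{SM_SentinelZero, 0, 1, 2};
  std::printf("%d\n", matchesExpectedMask({SM_SentinelZero, 0, 1, 2}, Expected));  // 1
  std::printf("%d\n", matchesExpectedMask({SM_SentinelUndef, 0, 1, 2}, Expected)); // 1
  std::printf("%d\n", matchesExpectedMask({3, 0, 1, 2}, Expected));                // 0
  return 0;
}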

File tree

2 files changed: +40 -10 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 38 additions & 5 deletions
@@ -10096,7 +10096,9 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
   if (Size != (int)ExpectedMask.size())
     return false;
   assert(llvm::all_of(ExpectedMask,
-                      [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
+                      [Size](int M) {
+                        return M == SM_SentinelZero || isInRange(M, 0, 2 * Size);
+                      }) &&
          "Illegal target shuffle mask");
 
   // Check for out-of-range target shuffle mask indices.
@@ -10119,6 +10121,9 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
     int ExpectedIdx = ExpectedMask[i];
     if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
       continue;
+    // If we failed to match an expected SM_SentinelZero then early out.
+    if (ExpectedIdx < 0)
+      return false;
     if (MaskIdx == SM_SentinelZero) {
       // If we need this expected index to be a zero element, then update the
       // relevant zero mask and perform the known bits at the end to minimize
@@ -39594,18 +39599,46 @@ static bool matchBinaryPermuteShuffle(
       ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
        (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+    MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
+                                   MaskVT.getSizeInBits() / EltSizeInBits);
     if (!isAnyZero(Mask)) {
       int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
       if (0 < Rotation) {
         Shuffle = X86ISD::VALIGN;
-        if (EltSizeInBits == 64)
-          ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
-        else
-          ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
+        ShuffleVT = AlignVT;
         PermuteImm = Rotation;
         return true;
       }
     }
+    // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
+    unsigned ZeroLo = Zeroable.countr_one();
+    unsigned ZeroHi = Zeroable.countl_one();
+    assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
+    if (ZeroLo) {
+      SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
+      std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
+      if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
+        V1 = V1;
+        V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
+        Shuffle = X86ISD::VALIGN;
+        ShuffleVT = AlignVT;
+        PermuteImm = NumMaskElts - ZeroLo;
+        return true;
+      }
+    }
+    if (ZeroHi) {
+      SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
+      std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
+                ZeroHi);
+      if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
+        V2 = V1;
+        V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
+        Shuffle = X86ISD::VALIGN;
+        ShuffleVT = AlignVT;
+        PermuteImm = ZeroHi;
+        return true;
+      }
+    }
   }
 
   // Attempt to match against PALIGNR byte rotate.
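For context on the immediates chosen above: VALIGN selects NumElts consecutive elements from the concatenation of its two sources, starting at the immediate. The sketch below is a hedged standalone model of that selection, not LLVM code; the names are invented, and the operand-to-half mapping (second source as the low half of the concatenation, first source as the high half) is an assumption chosen to match how the patch pairs the original vector with a zero vector. Under that assumption, Imm = NumMaskElts - ZeroLo reproduces the low-zero fill and Imm = ZeroHi reproduces the high-zero fill, i.e. cross-lane versions of VSHLDQ/VSRLDQ.

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <vector>

// Model: result[i] = concat(Lo, Hi)[i + Imm], with Lo occupying indices
// [0, N) and Hi occupying [N, 2N). The Lo/Hi assignment is an assumption of
// this sketch.
static std::vector<int> valignModel(const std::vector<int> &Hi,
                                    const std::vector<int> &Lo, unsigned Imm) {
  const std::size_t N = Lo.size();
  std::vector<int> Concat(Lo);
  Concat.insert(Concat.end(), Hi.begin(), Hi.end());
  std::vector<int> Res(N);
  for (std::size_t I = 0; I != N; ++I)
    Res[I] = Concat[I + Imm];
  return Res;
}

int main() {
  const unsigned N = 8;
  const std::vector<int> V1{10, 11, 12, 13, 14, 15, 16, 17};
  const std::vector<int> Zero(N, 0);

  // ZeroLo case (one leading zero): keep V1, pair it with a zero low half,
  // Imm = N - ZeroLo. Expected result: [0, V1[0..6]].
  const unsigned ZeroLo = 1;
  std::vector<int> ShiftLo = valignModel(/*Hi=*/V1, /*Lo=*/Zero, N - ZeroLo);
  assert(ShiftLo[0] == 0 && ShiftLo[1] == 10 && ShiftLo[7] == 16);

  // ZeroHi case (seven trailing zeros): move V1 to the low half, zero the
  // high half, Imm = ZeroHi. Expected result: [V1[7], 0, 0, 0, 0, 0, 0, 0].
  const unsigned ZeroHi = 7;
  std::vector<int> ShiftHi = valignModel(/*Hi=*/Zero, /*Lo=*/V1, ZeroHi);
  assert(ShiftHi[0] == 17 && ShiftHi[1] == 0 && ShiftHi[7] == 0);

  std::puts("both zero-fill shift patterns reproduced");
  return 0;
}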

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll

Lines changed: 2 additions & 5 deletions
@@ -812,10 +812,8 @@ define <8 x i64> @combine_vpermt2var_8i64_as_valignq(<8 x i64> %x0, <8 x i64> %x
 define <8 x i64> @combine_vpermt2var_8i64_as_valignq_zero(<8 x i64> %x0) {
 ; CHECK-LABEL: combine_vpermt2var_8i64_as_valignq_zero:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [15,0,1,2,3,4,5,6]
 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    valignq {{.*#+}} zmm0 = zmm0[7],zmm1[0,1,2,3,4,5,6]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 15, i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6>, <8 x i64> zeroinitializer, <8 x i64> %x0, i8 -1)
   ret <8 x i64> %res0
@@ -825,8 +823,7 @@ define <8 x i64> @combine_vpermt2var_8i64_as_zero_valignq(<8 x i64> %x0) {
 ; CHECK-LABEL: combine_vpermt2var_8i64_as_zero_valignq:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [15,0,1,2,3,4,5,6]
-; CHECK-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    valignq {{.*#+}} zmm0 = zmm1[7],zmm0[0,1,2,3,4,5,6]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 15, i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6>, <8 x i64> %x0, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res0
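To tie the test IR back to the mask the matcher sees: each vpermt2var index selects an element from one of the two data operands, so with one data operand being all zeros the shuffle collapses to a single-input mask with a run of zeroable elements, which is exactly what the new ZeroLo/ZeroHi VALIGN paths recognise. The sketch below is a hedged illustration of that collapse; the selection rule (bit 3 of each index picks the data operand, the low three bits pick the element) is restated here from the CHECK lines above rather than from the intrinsic definition, and the function name is invented.

#include <cstdio>
#include <vector>

constexpr int SM_SentinelZero = -2;

// Effective single-input shuffle mask for vpermt2var(idx, a, b) when exactly
// one data operand is known to be all zeros.
static std::vector<int> effectiveMask(const std::vector<int> &Idx,
                                      bool ZeroIsFirstOperand) {
  std::vector<int> Mask;
  for (int I : Idx) {
    const bool FromSecond = (I & 8) != 0; // bit 3: clear = a, set = b
    const bool FromZero = ZeroIsFirstOperand ? !FromSecond : FromSecond;
    Mask.push_back(FromZero ? SM_SentinelZero : (I & 7));
  }
  return Mask;
}

int main() {
  const std::vector<int> Idx{15, 0, 1, 2, 3, 4, 5, 6};

  // First test: a = zeroinitializer, b = %x0 -> [7, Z, Z, Z, Z, Z, Z, Z],
  // i.e. ZeroHi = 7, matched by the new ZeroHi VALIGN path.
  for (int M : effectiveMask(Idx, /*ZeroIsFirstOperand=*/true))
    std::printf("%d ", M);
  std::printf("\n");

  // Second test: a = %x0, b = zeroinitializer -> [Z, 0, 1, 2, 3, 4, 5, 6],
  // i.e. ZeroLo = 1, matched by the new ZeroLo VALIGN path.
  for (int M : effectiveMask(Idx, /*ZeroIsFirstOperand=*/false))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}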
