Commit 09fd8f0

[X86] matchBinaryPermuteShuffle - match AVX512 "cross lane" SHLDQ/SRLDQ style patterns using VALIGN (#140538)
Very similar to what we do in lowerShuffleAsVALIGN. I've updated isTargetShuffleEquivalent to correctly handle SM_SentinelZero in the expected shuffle mask, but it only allows an exact match (or an undef element in the test mask) - it can't be used to match zero elements via MaskedVectorIsZero. Noticed while working on #140516
1 parent 621a5a9 commit 09fd8f0
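
As a side note, here is a minimal standalone sketch of the ZeroLo case this patch matches (plain C++, not LLVM code; the SM_SentinelZero value and the 8-lane v8i64 shuffle are illustrative assumptions): a shuffle whose low ZeroLo lanes are known zero behaves like VALIGN of V1 against an all-zero vector with immediate NumElts - ZeroLo.

// Standalone illustration only - assumed values, not LLVM's actual definitions.
#include <cassert>
#include <cstdio>
#include <numeric>
#include <vector>

constexpr int SM_SentinelZero = -2; // lane must be zero

int main() {
  // A v8i64 shuffle whose two low lanes are zeroable:
  //   result = <0, 0, V1[0], V1[1], V1[2], V1[3], V1[4], V1[5]>
  const unsigned NumElts = 8, ZeroLo = 2;

  // Expected "shift" mask, built the same way as in the patch:
  // fill with SM_SentinelZero, then iota the upper lanes from 0.
  std::vector<int> ShiftMask(NumElts, SM_SentinelZero);
  std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);

  // Model VALIGN(V1, ZeroVec, Imm = NumElts - ZeroLo): concatenate
  // {V1 : ZeroVec} (V1 in the upper half) and shift right by Imm lanes.
  const unsigned Imm = NumElts - ZeroLo;
  std::vector<int> Concat(2 * NumElts, SM_SentinelZero); // low half = zeros
  std::iota(Concat.begin() + NumElts, Concat.end(), 0);  // high half = V1[j]

  for (unsigned i = 0; i != NumElts; ++i)
    assert(Concat[i + Imm] == ShiftMask[i] && "VALIGN lane mismatch");
  std::puts("valignq with a zero operand reproduces the zeroable shift mask");
  return 0;
}

The ZeroHi case in the patch is symmetric: the zero vector becomes the first source and the immediate is ZeroHi.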

2 files changed: +41 -10 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 39 additions & 5 deletions
@@ -10096,7 +10096,10 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
   if (Size != (int)ExpectedMask.size())
     return false;
   assert(llvm::all_of(ExpectedMask,
-                      [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
+                      [Size](int M) {
+                        return M == SM_SentinelZero ||
+                               isInRange(M, 0, 2 * Size);
+                      }) &&
          "Illegal target shuffle mask");

   // Check for out-of-range target shuffle mask indices.
@@ -10119,6 +10122,9 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
     int ExpectedIdx = ExpectedMask[i];
     if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
       continue;
+    // If we failed to match an expected SM_SentinelZero then early out.
+    if (ExpectedIdx < 0)
+      return false;
     if (MaskIdx == SM_SentinelZero) {
       // If we need this expected index to be a zero element, then update the
       // relevant zero mask and perform the known bits at the end to minimize
@@ -39594,18 +39600,46 @@ static bool matchBinaryPermuteShuffle(
       ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
        (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+    MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
+                                   MaskVT.getSizeInBits() / EltSizeInBits);
     if (!isAnyZero(Mask)) {
       int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
       if (0 < Rotation) {
         Shuffle = X86ISD::VALIGN;
-        if (EltSizeInBits == 64)
-          ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
-        else
-          ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
+        ShuffleVT = AlignVT;
         PermuteImm = Rotation;
         return true;
       }
     }
+    // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
+    unsigned ZeroLo = Zeroable.countr_one();
+    unsigned ZeroHi = Zeroable.countl_one();
+    assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
+    if (ZeroLo) {
+      SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
+      std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
+      if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
+        V1 = V1;
+        V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
+        Shuffle = X86ISD::VALIGN;
+        ShuffleVT = AlignVT;
+        PermuteImm = NumMaskElts - ZeroLo;
+        return true;
+      }
+    }
+    if (ZeroHi) {
+      SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
+      std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
+                ZeroHi);
+      if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
+        V2 = V1;
+        V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
+        Shuffle = X86ISD::VALIGN;
+        ShuffleVT = AlignVT;
+        PermuteImm = ZeroHi;
+        return true;
+      }
+    }
   }

   // Attempt to match against PALIGNR byte rotate.

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll

Lines changed: 2 additions & 5 deletions
@@ -812,10 +812,8 @@ define <8 x i64> @combine_vpermt2var_8i64_as_valignq(<8 x i64> %x0, <8 x i64> %x
 define <8 x i64> @combine_vpermt2var_8i64_as_valignq_zero(<8 x i64> %x0) {
 ; CHECK-LABEL: combine_vpermt2var_8i64_as_valignq_zero:
 ; CHECK: # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [15,0,1,2,3,4,5,6]
 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    valignq {{.*#+}} zmm0 = zmm0[7],zmm1[0,1,2,3,4,5,6]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 15, i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6>, <8 x i64> zeroinitializer, <8 x i64> %x0, i8 -1)
   ret <8 x i64> %res0
@@ -825,8 +823,7 @@ define <8 x i64> @combine_vpermt2var_8i64_as_zero_valignq(<8 x i64> %x0) {
 ; CHECK-LABEL: combine_vpermt2var_8i64_as_zero_valignq:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [15,0,1,2,3,4,5,6]
-; CHECK-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    valignq {{.*#+}} zmm0 = zmm1[7],zmm0[0,1,2,3,4,5,6]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 15, i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6>, <8 x i64> %x0, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res0
