Skip to content

Commit 5f41b07

Browse files
committed
[VectorCombine] foldShuffleOfShuffles - fold "shuffle (shuffle x, undef), (shuffle y, undef)" -> "shuffle x, y"
Another step towards cleaning up shuffles that have been split, often across bitcasts between SSE intrinsic. Strip shuffles entirely if we fold to an identity shuffle.
1 parent ac79188 commit 5f41b07

File tree

4 files changed

+77
-22
lines changed

4 files changed

+77
-22
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ class VectorCombine {
113113
bool scalarizeLoadExtract(Instruction &I);
114114
bool foldShuffleOfBinops(Instruction &I);
115115
bool foldShuffleOfCastops(Instruction &I);
116+
bool foldShuffleOfShuffles(Instruction &I);
116117
bool foldShuffleFromReductions(Instruction &I);
117118
bool foldTruncFromReductions(Instruction &I);
118119
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -1547,6 +1548,73 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
15471548
return true;
15481549
}
15491550

1551+
/// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
1552+
/// into "shuffle x, y".
1553+
bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
1554+
Value *V0, *V1;
1555+
ArrayRef<int> OuterMask, InnerMask0, InnerMask1;
1556+
if (!match(&I, m_Shuffle(m_OneUse(m_Shuffle(m_Value(V0), m_Undef(),
1557+
m_Mask(InnerMask0))),
1558+
m_OneUse(m_Shuffle(m_Value(V1), m_Undef(),
1559+
m_Mask(InnerMask1))),
1560+
m_Mask(OuterMask))))
1561+
return false;
1562+
1563+
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
1564+
auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType());
1565+
auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType());
1566+
if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
1567+
V0->getType() != V1->getType())
1568+
return false;
1569+
1570+
unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
1571+
unsigned NumImmElts = ShuffleImmTy->getNumElements();
1572+
1573+
SmallVector<int, 16> NewMask(OuterMask.begin(), OuterMask.end());
1574+
for (int &M : NewMask) {
1575+
if (0 <= M && M < (int)NumImmElts)
1576+
M = InnerMask0[M];
1577+
else if ((int)NumImmElts <= M)
1578+
M = InnerMask1[M - NumImmElts] + (V0 == V1 ? 0 : NumSrcElts);
1579+
}
1580+
1581+
// Have we folded to an Identity shuffle?
1582+
if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
1583+
replaceValue(I, *V0);
1584+
return true;
1585+
}
1586+
1587+
// Try to merge the shuffles if the new shuffle is not costly.
1588+
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1589+
1590+
InstructionCost OldCost =
1591+
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
1592+
InnerMask0, CostKind) +
1593+
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
1594+
InnerMask1, CostKind) +
1595+
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy,
1596+
OuterMask, CostKind, 0, nullptr, std::nullopt, &I);
1597+
1598+
InstructionCost NewCost = TTI.getShuffleCost(
1599+
TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy, NewMask, CostKind);
1600+
1601+
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
1602+
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1603+
<< "\n");
1604+
if (NewCost > OldCost)
1605+
return false;
1606+
1607+
// Clear unused sources to undef.
1608+
if (none_of(NewMask, [&](int M) { return 0 <= M && M < (int)NumSrcElts; }))
1609+
V0 = UndefValue::get(ShuffleSrcTy);
1610+
if (none_of(NewMask, [&](int M) { return (int)NumSrcElts <= M; }))
1611+
V1 = UndefValue::get(ShuffleSrcTy);
1612+
1613+
Value *Shuf = Builder.CreateShuffleVector(V0, V1, NewMask);
1614+
replaceValue(I, *Shuf);
1615+
return true;
1616+
}
1617+
15501618
/// Given a commutative reduction, the order of the input lanes does not alter
15511619
/// the results. We can use this to remove certain shuffles feeding the
15521620
/// reduction, removing the need to shuffle at all.
@@ -2102,6 +2170,7 @@ bool VectorCombine::run() {
21022170
case Instruction::ShuffleVector:
21032171
MadeChange |= foldShuffleOfBinops(I);
21042172
MadeChange |= foldShuffleOfCastops(I);
2173+
MadeChange |= foldShuffleOfShuffles(I);
21052174
MadeChange |= foldSelectShuffle(I);
21062175
break;
21072176
case Instruction::BitCast:

llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt -passes='vector-combine' -S %s | FileCheck %s
2+
; RUN: opt -passes=vector-combine -S %s | FileCheck %s
33

44
target triple = "aarch64"
55

@@ -741,37 +741,29 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi
741741
; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
742742
; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
743743
; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
744-
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
745-
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
746-
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
744+
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
747745
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
748746
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
749747
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
750748
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
751749
; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
752750
; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
753-
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
754-
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
755-
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
751+
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
756752
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
757753
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
758754
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
759755
; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
760756
; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
761757
; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]]
762758
; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
763-
; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
764-
; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
765-
; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
759+
; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
766760
; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
767761
; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
768762
; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
769763
; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
770764
; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
771765
; CHECK-NEXT: [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
772-
; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
773-
; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
774-
; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <16 x i8> [[TMP41]], <16 x i8> [[TMP42]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
766+
; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
775767
; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
776768
; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
777769
; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>

llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt -passes='vector-combine' -S %s | FileCheck %s
2+
; RUN: opt -passes=vector-combine -S %s | FileCheck %s
33

44
target triple = "aarch64"
55

66
define <8 x i8> @trivial(<8 x i8> %a) {
77
; CHECK-LABEL: @trivial(
8-
; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
9-
; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
10-
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[AT]], <4 x i8> [[AB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
11-
; CHECK-NEXT: ret <8 x i8> [[R]]
8+
; CHECK-NEXT: ret <8 x i8> [[R:%.*]]
129
;
1310
%ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1411
%at = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>

llvm/test/Transforms/VectorCombine/X86/pr67803.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,7 @@ define <4 x i64> @PR67803(<8 x i32> %x, <8 x i32> %y, <8 x float> %a, <8 x float
66
; CHECK-LABEL: @PR67803(
77
; CHECK-NEXT: entry:
88
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <8 x i32> [[X:%.*]], [[Y:%.*]]
9-
; CHECK-NEXT: [[CMP_LO:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
10-
; CHECK-NEXT: [[CMP_HI:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
11-
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i1> [[CMP_LO]], <4 x i1> [[CMP_HI]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
12-
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i32>
9+
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32>
1310
; CHECK-NEXT: [[CONCAT:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
1411
; CHECK-NEXT: [[MASK:%.*]] = bitcast <4 x i64> [[CONCAT]] to <8 x float>
1512
; CHECK-NEXT: [[SEL:%.*]] = tail call noundef <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[MASK]])

0 commit comments

Comments
 (0)