Skip to content

Commit 282b56f

Browse files
authored
[VectorCombine] foldShuffleOfBinops - add support for length changing shuffles (#88899)
Refactor to be closer to foldShuffleOfCastops - sibling patch to #88743 that can be used to address some of the issues identified in #88693
1 parent 4c3b0a6 commit 282b56f

File tree

2 files changed

+97
-55
lines changed

2 files changed

+97
-55
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 58 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1395,60 +1395,91 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
13951395
return true;
13961396
}
13971397

1398-
/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
1399-
/// "binop (shuffle), (shuffle)".
1398+
/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
14001399
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
1401-
auto *VecTy = cast<FixedVectorType>(I.getType());
14021400
BinaryOperator *B0, *B1;
1403-
ArrayRef<int> Mask;
1401+
ArrayRef<int> OldMask;
14041402
if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
1405-
m_Mask(Mask))) ||
1406-
B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy)
1403+
m_Mask(OldMask))))
14071404
return false;
14081405

14091406
// Don't introduce poison into div/rem.
1410-
if (any_of(Mask, [](int M) { return M == PoisonMaskElem; }) &&
1407+
if (any_of(OldMask, [](int M) { return M == PoisonMaskElem; }) &&
14111408
B0->isIntDivRem())
14121409
return false;
14131410

1414-
// Try to replace a binop with a shuffle if the shuffle is not costly.
1415-
// The new shuffle will choose from a single, common operand, so it may be
1416-
// cheaper than the existing two-operand shuffle.
1417-
SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size());
1411+
// TODO: Add support for addlike etc.
14181412
Instruction::BinaryOps Opcode = B0->getOpcode();
1419-
InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
1420-
InstructionCost ShufCost = TTI.getShuffleCost(
1421-
TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask);
1422-
if (ShufCost > BinopCost)
1413+
if (Opcode != B1->getOpcode())
1414+
return false;
1415+
1416+
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
1417+
auto *BinOpTy = dyn_cast<FixedVectorType>(B0->getType());
1418+
if (!ShuffleDstTy || !BinOpTy)
14231419
return false;
14241420

1421+
unsigned NumSrcElts = BinOpTy->getNumElements();
1422+
14251423
// If we have something like "add X, Y" and "add Z, X", swap ops to match.
14261424
Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
14271425
Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
1428-
if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W)
1426+
if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
1427+
(X == W || Y == Z))
14291428
std::swap(X, Y);
14301429

1431-
Value *Shuf0, *Shuf1;
1430+
auto ConvertToUnary = [NumSrcElts](int &M) {
1431+
if (M >= (int)NumSrcElts)
1432+
M -= NumSrcElts;
1433+
};
1434+
1435+
SmallVector<int> NewMask0(OldMask.begin(), OldMask.end());
1436+
TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
14321437
if (X == Z) {
1433-
// shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
1434-
Shuf0 = Builder.CreateShuffleVector(X, UnaryMask);
1435-
Shuf1 = Builder.CreateShuffleVector(Y, W, Mask);
1436-
} else if (Y == W) {
1437-
// shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y)
1438-
Shuf0 = Builder.CreateShuffleVector(X, Z, Mask);
1439-
Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask);
1440-
} else {
1441-
return false;
1438+
llvm::for_each(NewMask0, ConvertToUnary);
1439+
SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
1440+
Z = PoisonValue::get(BinOpTy);
14421441
}
14431442

1443+
SmallVector<int> NewMask1(OldMask.begin(), OldMask.end());
1444+
TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
1445+
if (Y == W) {
1446+
llvm::for_each(NewMask1, ConvertToUnary);
1447+
SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
1448+
W = PoisonValue::get(BinOpTy);
1449+
}
1450+
1451+
// Try to replace a binop with a shuffle if the shuffle is not costly.
1452+
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1453+
1454+
InstructionCost OldCost =
1455+
TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy, CostKind) +
1456+
TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy, CostKind) +
1457+
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
1458+
OldMask, CostKind, 0, nullptr, {B0, B1}, &I);
1459+
1460+
InstructionCost NewCost =
1461+
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
1462+
TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) +
1463+
TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
1464+
1465+
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
1466+
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1467+
<< "\n");
1468+
if (NewCost >= OldCost)
1469+
return false;
1470+
1471+
Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
1472+
Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
14441473
Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
1474+
14451475
// Intersect flags from the old binops.
14461476
if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
14471477
NewInst->copyIRFlags(B0);
14481478
NewInst->andIRFlags(B1);
14491479
}
14501480

1451-
// TODO: Add Shuf0/Shuf1 to WorkList?
1481+
Worklist.pushValue(Shuf0);
1482+
Worklist.pushValue(Shuf1);
14521483
replaceValue(I, *NewBO);
14531484
return true;
14541485
}

llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll

Lines changed: 39 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -25,9 +25,9 @@ define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x floa
2525
define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
2626
; CHECK-LABEL: define <4 x i32> @shuf_add_v4i32_xx(
2727
; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
28-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 0>
29-
; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
30-
; CHECK-NEXT: [[R2:%.*]] = add <4 x i32> [[TMP1]], [[R1]]
28+
; CHECK-NEXT: [[B0:%.*]] = add <4 x i32> [[X]], [[Y]]
29+
; CHECK-NEXT: [[B1:%.*]] = add <4 x i32> [[X]], [[Z]]
30+
; CHECK-NEXT: [[R2:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
3131
; CHECK-NEXT: ret <4 x i32> [[R2]]
3232
;
3333
%b0 = add <4 x i32> %x, %y
@@ -36,15 +36,22 @@ define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
3636
ret <4 x i32> %r
3737
}
3838

39-
; For commutative instructions, common operand may be swapped.
39+
; For commutative instructions, common operand may be swapped (SSE - expensive fmul vs AVX - cheap fmul)
4040

4141
define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
42-
; CHECK-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap(
43-
; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
44-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
45-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
46-
; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
47-
; CHECK-NEXT: ret <4 x float> [[R]]
42+
; SSE-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap(
43+
; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
44+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
45+
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
46+
; SSE-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
47+
; SSE-NEXT: ret <4 x float> [[R]]
48+
;
49+
; AVX-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap(
50+
; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
51+
; AVX-NEXT: [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]]
52+
; AVX-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]]
53+
; AVX-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
54+
; AVX-NEXT: ret <4 x float> [[R]]
4855
;
4956
%b0 = fmul <4 x float> %x, %y
5057
%b1 = fmul <4 x float> %z, %x
@@ -57,9 +64,9 @@ define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x
5764
define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
5865
; CHECK-LABEL: define <2 x i64> @shuf_and_v2i64_yy_swap(
5966
; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] {
60-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
61-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 3, i32 0>
62-
; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]]
67+
; CHECK-NEXT: [[B0:%.*]] = and <2 x i64> [[X]], [[Y]]
68+
; CHECK-NEXT: [[B1:%.*]] = and <2 x i64> [[Y]], [[Z]]
69+
; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> <i32 3, i32 0>
6370
; CHECK-NEXT: ret <2 x i64> [[R]]
6471
;
6572
%b0 = and <2 x i64> %x, %y
@@ -84,15 +91,22 @@ define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
8491
ret <4 x i32> %r
8592
}
8693

87-
; negative test - common operand, but not commutable
94+
; common operand, but not commutable (SSE - expensive vector shift vs AVX2 - cheap vector shift)
8895

8996
define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
90-
; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap(
91-
; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
92-
; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]]
93-
; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]]
94-
; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
95-
; CHECK-NEXT: ret <4 x i32> [[R1]]
97+
; SSE-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap(
98+
; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
99+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
100+
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
101+
; SSE-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
102+
; SSE-NEXT: ret <4 x i32> [[R]]
103+
;
104+
; AVX-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap(
105+
; AVX-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
106+
; AVX-NEXT: [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]]
107+
; AVX-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]]
108+
; AVX-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
109+
; AVX-NEXT: ret <4 x i32> [[R]]
96110
;
97111
%b0 = shl <4 x i32> %x, %y
98112
%b1 = shl <4 x i32> %z, %x
@@ -116,7 +130,7 @@ define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z
116130
ret <2 x i64> %r
117131
}
118132

119-
; negative test - type change via shuffle
133+
; type change via shuffle
120134

121135
define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
122136
; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
@@ -168,14 +182,14 @@ define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
168182
ret <4 x i32> %r
169183
}
170184

171-
; negative test - must have matching operand
185+
; non-matching operands (not commutable)
172186

173187
define <4 x float> @shuf_fdiv_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
174188
; CHECK-LABEL: define <4 x float> @shuf_fdiv_v4f32_no_common_op(
175189
; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] {
176-
; CHECK-NEXT: [[B0:%.*]] = fdiv <4 x float> [[X]], [[Y]]
177-
; CHECK-NEXT: [[B1:%.*]] = fdiv <4 x float> [[Z]], [[W]]
178-
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
190+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
191+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
192+
; CHECK-NEXT: [[R:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]]
179193
; CHECK-NEXT: ret <4 x float> [[R]]
180194
;
181195
%b0 = fdiv <4 x float> %x, %y
@@ -216,6 +230,3 @@ define <4 x i32> @shuf_srem_v4i32_poison(<4 x i32> %a0, <4 x i32> %a1) {
216230
ret <4 x i32> %r
217231
}
218232

219-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
220-
; AVX: {{.*}}
221-
; SSE: {{.*}}

0 commit comments

Comments
 (0)