-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[SLP] Make getSameOpcode support different instructions if they have same semantics. #112181
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP] Make getSameOpcode support different instructions if they have same semantics. #112181
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers Author: Han-Kuan Chen (HanKuanChen) ChangesPatch is 41.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112181.diff 14 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 401597af35bdac..fdda87e541ca74 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -818,6 +818,105 @@ struct InstructionsState {
} // end anonymous namespace
+/// One candidate form of an instruction: the opcode it could be emitted as,
+/// paired with the operands it would take under that opcode.
+struct InterchangeableInstruction {
+  unsigned Opcode;
+  SmallVector<Value *> Ops;
+
+  /// Records \p Opcode and brace-initializes the operand list from \p Args.
+  template <typename... ArgTypes>
+  InterchangeableInstruction(unsigned Opcode, ArgTypes &&...Args)
+      : Opcode(Opcode), Ops{std::forward<ArgTypes>(Args)...} {}
+};
+
+/// Orders candidates by opcode only; the operand lists are not compared.
+/// This is the ordering used when candidate lists are sorted and intersected.
+/// Declared static because it is a file-local helper defined outside the
+/// anonymous namespace and should not get external linkage.
+static bool operator<(const InterchangeableInstruction &LHS,
+                      const InterchangeableInstruction &RHS) {
+  return LHS.Opcode < RHS.Opcode;
+}
+
+/// \returns the list of instructions that \p I can be converted to. The list
+/// always contains \p I itself (its own opcode and operands). Further
+/// candidates are added only when \p I is a shl/mul/sub/add whose right-hand
+/// operand is an integer constant or a splat of one.
+/// e.g.,
+/// x << y -> x * (2^y)
+/// x << 1 -> x * 2
+/// x << 0 -> x * 1 -> x - 0 -> x + 0 -> x & 11...1 -> x | 0
+/// x * 0 -> x & 0
+/// x * -1 -> 0 - x
+/// The returned list is ordered by opcode so that it can be passed to
+/// std::set_intersection.
+/// TODO: support more patterns
+static SmallVector<InterchangeableInstruction, 6>
+getInterchangeableInstruction(Instruction *I) {
+  // PII = Possible Interchangeable Instruction
+  SmallVector<InterchangeableInstruction, 6> PII;
+  unsigned Opcode = I->getOpcode();
+  PII.emplace_back(Opcode, I->operands());
+  if (!is_contained({Instruction::Shl, Instruction::Mul, Instruction::Sub,
+                     Instruction::Add},
+                    Opcode))
+    return PII;
+  Constant *C;
+  if (match(I, m_BinOp(m_Value(), m_Constant(C)))) {
+    // Accept either a scalar integer constant or a vector splat of one.
+    ConstantInt *V = nullptr;
+    if (auto *CI = dyn_cast<ConstantInt>(C)) {
+      V = CI;
+    } else if (auto *CDV = dyn_cast<ConstantDataVector>(C)) {
+      if (auto *CI = dyn_cast_if_present<ConstantInt>(CDV->getSplatValue()))
+        V = CI;
+    }
+    if (!V)
+      return PII;
+    Value *Op0 = I->getOperand(0);
+    Type *Op1Ty = I->getOperand(1)->getType();
+    const APInt &Op1Int = V->getValue();
+    const unsigned BitWidth = Op1Int.getBitWidth();
+    Constant *Zero = ConstantInt::get(Op1Ty, APInt::getZero(BitWidth));
+    Constant *UnsignedMax =
+        ConstantInt::get(Op1Ty, APInt::getMaxValue(BitWidth));
+    switch (Opcode) {
+    case Instruction::Shl: {
+      // Build 2^y as an APInt of the operand width. Shifting a plain int
+      // (`1 << y`) is undefined or wrong once y reaches 31, and a shift
+      // amount >= BitWidth has no multiplier representable in the type, so
+      // only in-range shift amounts get a mul candidate.
+      if (Op1Int.ult(BitWidth))
+        PII.emplace_back(
+            Instruction::Mul, Op0,
+            ConstantInt::get(Op1Ty, APInt::getOneBitSet(
+                                        BitWidth, Op1Int.getZExtValue())));
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+    case Instruction::Mul: {
+      // Use APInt predicates rather than getSExtValue(), which asserts on
+      // constants that need more than 64 bits. isAllOnes() is tested before
+      // isOne() so that a 1-bit constant 1 keeps taking the "-1" path.
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::And, Op0, Zero);
+      } else if (Op1Int.isAllOnes()) {
+        PII.emplace_back(Instruction::Sub, Zero, Op0);
+      } else if (Op1Int.isOne()) {
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+    case Instruction::Sub:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    case Instruction::Add:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+  }
+  // std::set_intersection requires a sorted range.
+  sort(PII);
+  return PII;
+}
+
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
@@ -922,18 +1021,54 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
+ // Currently, this is only used for binary ops.
+ // TODO: support all instructions
+ SmallVector<InterchangeableInstruction> InterchangeableOpcode =
+ getInterchangeableInstruction(cast<Instruction>(VL[BaseIndex]));
+ SmallVector<InterchangeableInstruction> AlternateInterchangeableOpcode;
+ auto UpdateInterchangeableOpcode =
+ [](SmallVector<InterchangeableInstruction> &LHS,
+ ArrayRef<InterchangeableInstruction> RHS) {
+ SmallVector<InterchangeableInstruction> NewInterchangeableOpcode;
+ std::set_intersection(LHS.begin(), LHS.end(), RHS.begin(), RHS.end(),
+ std::back_inserter(NewInterchangeableOpcode));
+ if (NewInterchangeableOpcode.empty())
+ return false;
+ LHS = std::move(NewInterchangeableOpcode);
+ return true;
+ };
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
auto *I = cast<Instruction>(VL[Cnt]);
unsigned InstOpcode = I->getOpcode();
if (IsBinOp && isa<BinaryOperator>(I)) {
- if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ SmallVector<InterchangeableInstruction> ThisInterchangeableOpcode(
+ getInterchangeableInstruction(I));
+ if (UpdateInterchangeableOpcode(InterchangeableOpcode,
+ ThisInterchangeableOpcode))
continue;
- if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
- isValidForAlternation(Opcode)) {
- AltOpcode = InstOpcode;
- AltIndex = Cnt;
+ if (AlternateInterchangeableOpcode.empty()) {
+ InterchangeableOpcode.erase(
+ std::remove_if(InterchangeableOpcode.begin(),
+ InterchangeableOpcode.end(),
+ [](const InterchangeableInstruction &I) {
+ return !isValidForAlternation(I.Opcode);
+ }),
+ InterchangeableOpcode.end());
+ ThisInterchangeableOpcode.erase(
+ std::remove_if(ThisInterchangeableOpcode.begin(),
+ ThisInterchangeableOpcode.end(),
+ [](const InterchangeableInstruction &I) {
+ return !isValidForAlternation(I.Opcode);
+ }),
+ ThisInterchangeableOpcode.end());
+ if (InterchangeableOpcode.empty() || ThisInterchangeableOpcode.empty())
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ AlternateInterchangeableOpcode = std::move(ThisInterchangeableOpcode);
continue;
}
+ if (UpdateInterchangeableOpcode(AlternateInterchangeableOpcode,
+ ThisInterchangeableOpcode))
+ continue;
} else if (IsCastOp && isa<CastInst>(I)) {
Value *Op0 = IBase->getOperand(0);
Type *Ty0 = Op0->getType();
@@ -1027,6 +1162,22 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
+ if (IsBinOp) {
+ auto FindOp =
+ [&](const SmallVector<InterchangeableInstruction> &CandidateOp) {
+ for (Value *V : VL)
+ for (const InterchangeableInstruction &I : CandidateOp)
+ if (cast<Instruction>(V)->getOpcode() == I.Opcode)
+ return cast<Instruction>(V);
+ llvm_unreachable(
+ "Cannot find the candidate instruction for InstructionsState.");
+ };
+ Instruction *MainOp = FindOp(InterchangeableOpcode);
+ Instruction *AltOp = AlternateInterchangeableOpcode.empty()
+ ? MainOp
+ : FindOp(AlternateInterchangeableOpcode);
+ return InstructionsState(VL[BaseIndex], MainOp, AltOp);
+ }
return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
cast<Instruction>(VL[AltIndex]));
}
@@ -2318,24 +2469,41 @@ class BoUpSLP {
: cast<Instruction>(VL[0])->getNumOperands();
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ InstructionsState S = getSameOpcode(VL, TLI);
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
OpsVec[OpIdx].resize(NumLanes);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
- // Our tree has just 3 nodes: the root and two operands.
- // It is therefore trivial to get the APO. We only need to check the
- // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
- // RHS operand. The LHS operand of both add and sub is never attached
- // to an inversese operation in the linearized form, therefore its APO
- // is false. The RHS is true only if VL[Lane] is an inverse operation.
-
- // Since operand reordering is performed on groups of commutative
- // operations or alternating sequences (e.g., +, -), we can safely
- // tell the inverse operations by checking commutativity.
- bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
+ for (auto [I, V] : enumerate(VL)) {
+ assert(isa<Instruction>(V) && "Expected instruction");
+ SmallVector<InterchangeableInstruction> IIList =
+ getInterchangeableInstruction(cast<Instruction>(V));
+ Value *SelectedOp;
+ auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+ return II.Opcode == S.MainOp->getOpcode();
+ });
+ if (Iter == IIList.end()) {
+ Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+ return II.Opcode == S.AltOp->getOpcode();
+ });
+ SelectedOp = S.AltOp;
+ } else {
+ SelectedOp = S.MainOp;
+ }
+ assert(Iter != IIList.end() &&
+ "Cannot find an interchangeable instruction.");
+ // Our tree has just 3 nodes: the root and two operands.
+ // It is therefore trivial to get the APO. We only need to check the
+ // opcode of V and whether the operand at OpIdx is the LHS or RHS
+ // operand. The LHS operand of both add and sub is never attached to an
+ // inversese operation in the linearized form, therefore its APO is
+ // false. The RHS is true only if V is an inverse operation.
+
+ // Since operand reordering is performed on groups of commutative
+ // operations or alternating sequences (e.g., +, -), we can safely
+ // tell the inverse operations by checking commutativity.
+ bool IsInverseOperation = !isCommutative(cast<Instruction>(SelectedOp));
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
- OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
- APO, false};
+ OpsVec[OpIdx][I] = {Iter->Ops[OpIdx], APO, false};
}
}
}
@@ -3227,15 +3395,25 @@ class BoUpSLP {
auto *I0 = cast<Instruction>(Scalars[0]);
Operands.resize(I0->getNumOperands());
unsigned NumLanes = Scalars.size();
- for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
- OpIdx != NumOperands; ++OpIdx) {
+ unsigned NumOperands = I0->getNumOperands();
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
Operands[OpIdx].resize(NumLanes);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- auto *I = cast<Instruction>(Scalars[Lane]);
- assert(I->getNumOperands() == NumOperands &&
- "Expected same number of operands");
- Operands[OpIdx][Lane] = I->getOperand(OpIdx);
- }
+ for (auto [I, V] : enumerate(Scalars)) {
+ SmallVector<InterchangeableInstruction> IIList =
+ getInterchangeableInstruction(cast<Instruction>(V));
+ auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+ return II.Opcode == MainOp->getOpcode();
+ });
+ if (Iter == IIList.end())
+ Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+ return II.Opcode == AltOp->getOpcode();
+ });
+ assert(Iter != IIList.end() &&
+ "Cannot find an interchangeable instruction.");
+ assert(Iter->Ops.size() == NumOperands &&
+ "Expected same number of operands");
+ for (auto [J, Op] : enumerate(Iter->Ops))
+ Operands[J][I] = Op;
}
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
index c18811a35c1eeb..c7c999bb572851 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
@@ -314,10 +314,10 @@ define void @store_try_reorder(ptr %dst) {
;
; POW2-ONLY-LABEL: @store_try_reorder(
; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT: [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index 3fa42047162e45..7bc03e7c7755b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -7,13 +7,12 @@ define void @test(ptr %a, i64 %0) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[TMP0]], i32 0
; CHECK-NEXT: br label %[[BB:.*]]
; CHECK: [[BB]]:
-; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 0>
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> poison)
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 308d0e27f1ea89..e158c2a3ed87ea 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -324,10 +324,10 @@ define void @store_try_reorder(ptr %dst) {
;
; POW2-ONLY-LABEL: @store_try_reorder(
; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT: [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
index d388fd17925a16..d2e70f05204d79 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -10,9 +10,7 @@ define i32 @foo(ptr nocapture %A, i32 %n) {
; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @bar()
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], <i32 9, i32 9, i32 9, i32 9>
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
; CHECK-NEXT: ret i32 undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
index 889f5a95c81d69..7af0c64f187480 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -4,22 +4,17 @@
define void @test(ptr %0, ptr %1, ptr %2) {
; CHECK-LABEL: @test(
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 4
-; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[TMP15:%.*]] = sub <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 2, i32 0, i32 1, i32 7>
-; CHECK-NEXT: [[TMP17:%.*]] = add <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = sub <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP2:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[TMP5]]
+; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> <i32 0, i32 0, i32 1, i32 0>, [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x ...
[truncated]
|
2d81590
to
d9d8f3c
Compare
; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10> | ||
; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10> | ||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3> | ||
; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shall we drop nsw here? Please double check
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See def0fc1.
But I wonder why we pass VL0 instead of nullptr to propagateIRFlags. Even if we don't enable interchangeable instructions, passing nullptr should be enough.
✅ With the latest revision this PR passed the C/C++ code formatter. |
7a6cbcb
to
48bae64
Compare
48bae64
to
ad591ac
Compare
@@ -14935,7 +15108,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { | |||
Value *V = Builder.CreateBinOp( | |||
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, | |||
RHS); | |||
propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end()); | |||
propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should it always be nullptr or are there cases where we can keep it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually I don't know why we pass VL0 here. Only alternate operation should pass non nullptr value.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should pass VL0 here, but we need to check whether all opcodes originally matched and, if not, pass the fourth argument /*IncludeWrapFlags=*/false
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Before the PR, there is no difference between VL0 and nullptr. The opcode must be the same for all VL.
After the PR, VL0 cannot be used because opcode may be different.
I don't know why VL0 was used in the first place, since passing VL0 and passing nullptr give the same result for propagateIRFlags.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this part still requires extra work. If the opcode of the instruction does not match the opcode of intersection, its flags are ignored. This is not correct and must be fixed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is used to fix
- ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
- ; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
- ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+ ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
VL0 is mul here. If we pass VL0, then eventually nsw
will be passed.
However, shl does not contain nsw
. We should pass nullptr here to get the correct result.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Say, you have something like {mul nsw, shl, mul nsw, mul nsw}
. For this case you will still emit mul nsw <4 x >
, because shl will be ignored
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. But actually shl
does not have nsw
. We should emit mul <4 x >
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that is what is expected. But I assume that currently it will instead emit mul nsw <4 x >
if (NewInterchangeableOpcode.empty()) | ||
return false; | ||
LHS.swap(NewInterchangeableOpcode); | ||
return true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (NewInterchangeableOpcode.empty()) | |
return false; | |
LHS.swap(NewInterchangeableOpcode); | |
return true; | |
LHS.swap(NewInterchangeableOpcode); | |
return !LHS.empty(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do not want LHS
be empty if NewInterchangeableOpcode
is empty.
@@ -2335,24 +2479,41 @@ class BoUpSLP { | |||
: cast<Instruction>(VL[0])->getNumOperands(); | |||
OpsVec.resize(NumOperands); | |||
unsigned NumLanes = VL.size(); | |||
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { | |||
InstructionsState S = getSameOpcode(VL, TLI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggest to pass it to VLOperands constructor instead and use it instead of regenerating.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we only use VLOperands
? Right now the code is like this
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}
TE->setOperandsInOrder();
for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
We need to implement the logic in VLOperands
and Tree::setOperandsInOrder
. It will be simpler if we use VLOperands
only.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can try. But keep in mind, these elements should not be reordered at all. Otherwise, it may affect compile time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will do this in another PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
see ee74f11
auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { | ||
return II.Opcode == S.MainOp->getOpcode(); | ||
}); | ||
if (Iter == IIList.end()) { | ||
Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { | ||
return II.Opcode == S.AltOp->getOpcode(); | ||
}); | ||
SelectedOp = S.AltOp; | ||
} else { | ||
SelectedOp = S.MainOp; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better to allow specifying a particular opcode as an operand of getInterchangeableInstruction to allow early filtering and simplify the code
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any examples? I cannot imagine how to modify getInterchangeableInstruction
.
getInterchangeableInstruction(cast<Instruction>(V)); | ||
auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { | ||
return II.Opcode == MainOp->getOpcode(); | ||
}); | ||
if (Iter == IIList.end()) | ||
Iter = find_if(IIList, [&](const InterchangeableInstruction &II) { | ||
return II.Opcode == AltOp->getOpcode(); | ||
}); | ||
assert(Iter != IIList.end() && | ||
"Cannot find an interchangeable instruction."); | ||
assert(Iter->Ops.size() == NumOperands && | ||
"Expected same number of operands"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same, try to add extra operand to getInterchangeableInstruction to allow filtering inside
@@ -14935,7 +15108,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { | |||
Value *V = Builder.CreateBinOp( | |||
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, | |||
RHS); | |||
propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end()); | |||
propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should pass VL0 here, but we need to check whether all opcodes originally matched and, if not, pass the fourth argument /*IncludeWrapFlags=*/false
a56bd1c
to
9672f6d
Compare
✅ With the latest revision this PR passed the undef deprecator. |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/123/builds/11175 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/133/builds/8338 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/11281 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/7/builds/8567 Here is the relevant piece of the build log for the reference
|
Looks like this was already reverted for other reasons, so just as a FYI this also causes a significant compile-time regression: https://llvm-compile-time-tracker.com/compare.php?from=02bcaca5995de283c85acfcca61a39baac315794&to=82204154b7bd1f8c487c94c7ef00399d776b29f0&stat=instructions:u |
No description provided.