-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[VPlan] Implement VPExtendedReduction, VPMulAccumulateReductionRecipe and corresponding vplan transformations. #137746
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VPlan] Implement VPExtendedReduction, VPMulAccumulateReductionRecipe and corresponding vplan transformations. #137746
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Elvis Wang (ElvisWang123) Changes: This patch implements the transformation that matches the following […]
The converted abstract recipes will be lowered to the concrete recipes. This should be a cost-model-based decision, which will be implemented in the […] Split from #113903. Stacked on #137745. Patch is 68.39 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/137746.diff 12 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4684378687ef6..f5e3d1664b407 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9631,10 +9631,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"entry block must be set to a VPRegionBlock having a non-empty entry "
"VPBasicBlock");
- for (ElementCount VF : Range)
- Plan->addVF(VF);
- Plan->setName("Initial VPlan");
-
// Update wide induction increments to use the same step as the corresponding
// wide induction. This enables detecting induction increments directly in
// VPlan and removes redundant splats.
@@ -9670,6 +9666,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
+ // Transform recipes to abstract recipes if it is legal and beneficial and
+ // clamp the range for better cost estimation.
+ // TODO: Enable following transform when the EVL-version of extended-reduction
+ // and mulacc-reduction are implemented.
+ if (!CM.foldTailWithEVL()) {
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
+ VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
+ CostCtx, Range);
+ }
+
+ for (ElementCount VF : Range)
+ Plan->addVF(VF);
+ Plan->setName("Initial VPlan");
+
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index afad73bcd3501..587ba29965646 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -525,6 +525,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
+ case VPRecipeBase::VPMulAccumulateReductionSC:
+ case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPScalarIVStepsSC:
case VPRecipeBase::VPVectorPointerSC:
@@ -609,13 +611,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
};
+ struct NonNegFlagsTy {
+ char NonNeg : 1;
+ NonNegFlagsTy(bool IsNonNeg) : NonNeg(IsNonNeg) {}
+ };
+
private:
struct ExactFlagsTy {
char IsExact : 1;
};
- struct NonNegFlagsTy {
- char NonNeg : 1;
- };
struct FastMathFlagsTy {
char AllowReassoc : 1;
char NoNaNs : 1;
@@ -709,6 +713,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
: VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
DisjointFlags(DisjointFlags) {}
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ NonNegFlagsTy NonNegFlags, DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::NonNegOp),
+ NonNegFlags(NonNegFlags) {}
+
protected:
template <typename IterT>
VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
@@ -728,7 +738,9 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
- R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
+ R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
+ R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
+ R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -820,6 +832,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
FastMathFlags getFastMathFlags() const;
+ /// Returns true if the recipe has non-negative flag.
+ bool hasNonNegFlag() const { return OpType == OperationType::NonNegOp; }
+
+ bool isNonNeg() const {
+ assert(OpType == OperationType::NonNegOp &&
+ "recipe doesn't have a NNEG flag");
+ return NonNegFlags.NonNeg;
+ }
+
bool hasNoUnsignedWrap() const {
assert(OpType == OperationType::OverflowingBinOp &&
"recipe doesn't have a NUW flag");
@@ -1231,11 +1252,22 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
: VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I),
Opcode(I.getOpcode()) {}
+ template <typename IterT>
+ VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode,
+ iterator_range<IterT> Operands, bool NUW, bool NSW, DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL),
+ Opcode(Opcode) {}
+
public:
template <typename IterT>
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
: VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
+ template <typename IterT>
+ VPWidenRecipe(unsigned Opcode, iterator_range<IterT> Operands, bool NUW,
+ bool NSW, DebugLoc DL)
+ : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {}
+
~VPWidenRecipe() override = default;
VPWidenRecipe *clone() override {
@@ -1280,10 +1312,16 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
"opcode of underlying cast doesn't match");
}
- VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
- : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPIRMetadata(),
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, DL), VPIRMetadata(),
Opcode(Opcode), ResultTy(ResultTy) {}
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
+ bool IsNonNeg, DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, NonNegFlagsTy(IsNonNeg),
+ DL),
+ Opcode(Opcode), ResultTy(ResultTy) {}
+
~VPWidenCastRecipe() override = default;
VPWidenCastRecipe *clone() override {
@@ -2373,6 +2411,28 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
setUnderlyingValue(I);
}
+ /// For VPExtendedReductionRecipe.
+ /// Note that the debug location is from the extend.
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
+ ArrayRef<VPValue *> Operands, VPValue *CondOp,
+ bool IsOrdered, DebugLoc DL)
+ : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind),
+ IsOrdered(IsOrdered), IsConditional(CondOp) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
+ /// For VPMulAccumulateReductionRecipe.
+ /// Note that the NUW/NSW flags and the debug location are from the Mul.
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
+ ArrayRef<VPValue *> Operands, VPValue *CondOp,
+ bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL)
+ : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind),
+ IsOrdered(IsOrdered), IsConditional(CondOp) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
@@ -2381,6 +2441,13 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
IsOrdered, DL) {}
+ VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
+ VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
+ bool IsOrdered, DebugLoc DL = {})
+ : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
+ ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
+ IsOrdered, DL) {}
+
~VPReductionRecipe() override = default;
VPReductionRecipe *clone() override {
@@ -2391,7 +2458,9 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
+ R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -2471,6 +2540,181 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
}
};
+/// A recipe to represent inloop extended reduction operations, performing a
+/// reduction on an extended vector operand into a scalar value, and adding the
+/// result to a chain. This recipe is abstract and needs to be lowered to
+/// concrete recipes before codegen. The operands are {ChainOp, VecOp,
+/// [Condition]}.
+class VPExtendedReductionRecipe : public VPReductionRecipe {
+ /// Opcode of the extend recipe will be lowered to.
+ Instruction::CastOps ExtOp;
+
+ Type *ResultTy;
+
+ /// For cloning VPExtendedReductionRecipe.
+ VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed)
+ : VPReductionRecipe(
+ VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(),
+ {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(),
+ ExtRed->isOrdered(), ExtRed->getDebugLoc()),
+ ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) {
+ transferFlags(*ExtRed);
+ }
+
+public:
+ VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext)
+ : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(),
+ R->isOrdered(), Ext->getDebugLoc()),
+ ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
+ // Not all WidenCastRecipes contain nneg flag. Need to transfer flags from
+ // the original recipe to prevent setting wrong flags.
+ transferFlags(*Ext);
+ }
+
+ ~VPExtendedReductionRecipe() override = default;
+
+ VPExtendedReductionRecipe *clone() override {
+ auto *Copy = new VPExtendedReductionRecipe(this);
+ Copy->transferFlags(*this);
+ return Copy;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC);
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPExtendedReductionRecipe should be transform to "
+ "VPExtendedRecipe + VPReductionRecipe before execution.");
+ };
+
+ /// Return the cost of VPExtendedReductionRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The scalar type after extending.
+ Type *getResultType() const { return ResultTy; }
+
+ /// Is the extend ZExt?
+ bool isZExt() const { return getExtOpcode() == Instruction::ZExt; }
+
+ /// The opcode of extend recipe.
+ Instruction::CastOps getExtOpcode() const { return ExtOp; }
+};
+
+/// A recipe to represent inloop MulAccumulateReduction operations, performing a
+/// reduction.add on the product of the (possibly extended) vector operands
+/// into a scalar value, and adding the result to a chain. This
+/// recipe is abstract and needs to be lowered to concrete recipes before
+/// codegen. The operands are {ChainOp, VecOp0, VecOp1, [Condition]}.
+class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
+ /// Opcode of the extend recipe.
+ Instruction::CastOps ExtOp;
+
+ /// Non-neg flag of the extend recipe.
+ bool IsNonNeg = false;
+
+ Type *ResultTy;
+
+ /// For cloning VPMulAccumulateReductionRecipe.
+ VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(),
+ {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()},
+ MulAcc->getCondOp(), MulAcc->isOrdered(),
+ WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
+ MulAcc->getDebugLoc()),
+ ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
+ ResultTy(MulAcc->getResultType()) {}
+
+public:
+ VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
+ VPWidenCastRecipe *Ext0,
+ VPWidenCastRecipe *Ext1, Type *ResultTy)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},
+ R->getCondOp(), R->isOrdered(),
+ WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
+ R->getDebugLoc()),
+ ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) {
+ assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
+ Instruction::Add &&
+ "The reduction instruction in MulAccumulateteReductionRecipe must "
+ "be Add");
+ // Only set the non-negative flag if the original recipe contains.
+ if (Ext0->hasNonNegFlag())
+ IsNonNeg = Ext0->isNonNeg();
+ }
+
+ VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)},
+ R->getCondOp(), R->isOrdered(),
+ WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
+ R->getDebugLoc()),
+ ExtOp(Instruction::CastOps::CastOpsEnd) {
+ assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
+ Instruction::Add &&
+ "The reduction instruction in MulAccumulateReductionRecipe must be "
+ "Add");
+ }
+
+ ~VPMulAccumulateReductionRecipe() override = default;
+
+ VPMulAccumulateReductionRecipe *clone() override {
+ auto *Copy = new VPMulAccumulateReductionRecipe(this);
+ Copy->transferFlags(*this);
+ return Copy;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC);
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPMulAccumulateReductionRecipe should transform to "
+ "VPWidenCastRecipe + "
+ "VPWidenRecipe + VPReductionRecipe before execution");
+ }
+
+ /// Return the cost of VPMulAccumulateReductionRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ Type *getResultType() const {
+ assert(isExtended() && "Only support getResultType when this recipe "
+ "contains implicit extend.");
+ return ResultTy;
+ }
+
+ /// The VPValue of the vector value to be extended and reduced.
+ VPValue *getVecOp0() const { return getOperand(1); }
+ VPValue *getVecOp1() const { return getOperand(2); }
+
+ /// Return if this MulAcc recipe contains extended operands.
+ bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; }
+
+ /// Return the opcode of the extends for the operands.
+ Instruction::CastOps getExtOpcode() const { return ExtOp; }
+
+ /// Return if the operands are zero extended.
+ bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; }
+
+ /// Return the non negative flag of the ext recipe.
+ bool isNonNeg() const { return IsNonNeg; }
+};
+
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c86815c84d8d9..7dcbd72c25191 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -273,6 +273,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
+ .Case<VPExtendedReductionRecipe, VPMulAccumulateReductionRecipe>(
+ [](const auto *R) { return R->getResultType(); })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 75d056026025a..32e35b0fb78d3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -71,6 +71,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -118,6 +120,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -155,6 +159,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
@@ -2489,28 +2495,49 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
FastMathFlags FMFs = getFastMathFlags();
+ std::optional<FastMathFlags> OptionalFMF =
+ ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
- // TODO: Support any-of and in-loop reductions.
+ // TODO: Support any-of reductions.
assert(
(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
"Any-of reduction not implemented in VPlan-based cost model currently.");
- assert(
- (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
- ForceTargetInstructionCost.getNumOccurrences() > 0) &&
- "In-loop reduction not implemented in VPlan-based cost model currently.");
- // Cost = Reduction cost + BinOp cost
- InstructionCost Cost =
- Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
- return Cost +
- Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
+ return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
}
- return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
- Ctx.CostKind);
+ return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
+ Ctx.CostKind);
+}
+
+InstructionCost
+VPExtendedReductionRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
+ Type *RedTy = Ctx.Types.inferScalarType(this);
+ auto *SrcVecTy =
+ cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
+ assert(RedTy->isIntegerTy() &&
+ "Exte...
[truncated]
|
@llvm/pr-subscribers-vectorizers Author: Elvis Wang (ElvisWang123) Changes: This patch implements the transformation that matches the following […]
The converted abstract recipes will be lowered to the concrete recipes. This should be a cost-model-based decision, which will be implemented in the […] Split from #113903. Stacked on #137745. Patch is 68.39 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/137746.diff 12 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4684378687ef6..f5e3d1664b407 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9631,10 +9631,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"entry block must be set to a VPRegionBlock having a non-empty entry "
"VPBasicBlock");
- for (ElementCount VF : Range)
- Plan->addVF(VF);
- Plan->setName("Initial VPlan");
-
// Update wide induction increments to use the same step as the corresponding
// wide induction. This enables detecting induction increments directly in
// VPlan and removes redundant splats.
@@ -9670,6 +9666,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
+ // Transform recipes to abstract recipes if it is legal and beneficial and
+ // clamp the range for better cost estimation.
+ // TODO: Enable following transform when the EVL-version of extended-reduction
+ // and mulacc-reduction are implemented.
+ if (!CM.foldTailWithEVL()) {
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
+ VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
+ CostCtx, Range);
+ }
+
+ for (ElementCount VF : Range)
+ Plan->addVF(VF);
+ Plan->setName("Initial VPlan");
+
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index afad73bcd3501..587ba29965646 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -525,6 +525,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
+ case VPRecipeBase::VPMulAccumulateReductionSC:
+ case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPScalarIVStepsSC:
case VPRecipeBase::VPVectorPointerSC:
@@ -609,13 +611,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
};
+ struct NonNegFlagsTy {
+ char NonNeg : 1;
+ NonNegFlagsTy(bool IsNonNeg) : NonNeg(IsNonNeg) {}
+ };
+
private:
struct ExactFlagsTy {
char IsExact : 1;
};
- struct NonNegFlagsTy {
- char NonNeg : 1;
- };
struct FastMathFlagsTy {
char AllowReassoc : 1;
char NoNaNs : 1;
@@ -709,6 +713,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
: VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
DisjointFlags(DisjointFlags) {}
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ NonNegFlagsTy NonNegFlags, DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::NonNegOp),
+ NonNegFlags(NonNegFlags) {}
+
protected:
template <typename IterT>
VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
@@ -728,7 +738,9 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
- R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
+ R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
+ R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
+ R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -820,6 +832,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
FastMathFlags getFastMathFlags() const;
+ /// Returns true if the recipe has non-negative flag.
+ bool hasNonNegFlag() const { return OpType == OperationType::NonNegOp; }
+
+ bool isNonNeg() const {
+ assert(OpType == OperationType::NonNegOp &&
+ "recipe doesn't have a NNEG flag");
+ return NonNegFlags.NonNeg;
+ }
+
bool hasNoUnsignedWrap() const {
assert(OpType == OperationType::OverflowingBinOp &&
"recipe doesn't have a NUW flag");
@@ -1231,11 +1252,22 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
: VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I),
Opcode(I.getOpcode()) {}
+ template <typename IterT>
+ VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode,
+ iterator_range<IterT> Operands, bool NUW, bool NSW, DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL),
+ Opcode(Opcode) {}
+
public:
template <typename IterT>
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
: VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
+ template <typename IterT>
+ VPWidenRecipe(unsigned Opcode, iterator_range<IterT> Operands, bool NUW,
+ bool NSW, DebugLoc DL)
+ : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {}
+
~VPWidenRecipe() override = default;
VPWidenRecipe *clone() override {
@@ -1280,10 +1312,16 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
"opcode of underlying cast doesn't match");
}
- VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
- : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPIRMetadata(),
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, DL), VPIRMetadata(),
Opcode(Opcode), ResultTy(ResultTy) {}
+ VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
+ bool IsNonNeg, DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, NonNegFlagsTy(IsNonNeg),
+ DL),
+ Opcode(Opcode), ResultTy(ResultTy) {}
+
~VPWidenCastRecipe() override = default;
VPWidenCastRecipe *clone() override {
@@ -2373,6 +2411,28 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
setUnderlyingValue(I);
}
+ /// For VPExtendedReductionRecipe.
+ /// Note that the debug location is from the extend.
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
+ ArrayRef<VPValue *> Operands, VPValue *CondOp,
+ bool IsOrdered, DebugLoc DL)
+ : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind),
+ IsOrdered(IsOrdered), IsConditional(CondOp) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
+ /// For VPMulAccumulateReductionRecipe.
+ /// Note that the NUW/NSW flags and the debug location are from the Mul.
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
+ ArrayRef<VPValue *> Operands, VPValue *CondOp,
+ bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL)
+ : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind),
+ IsOrdered(IsOrdered), IsConditional(CondOp) {
+ if (CondOp)
+ addOperand(CondOp);
+ }
+
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
@@ -2381,6 +2441,13 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
IsOrdered, DL) {}
+ VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
+ VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
+ bool IsOrdered, DebugLoc DL = {})
+ : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
+ ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
+ IsOrdered, DL) {}
+
~VPReductionRecipe() override = default;
VPReductionRecipe *clone() override {
@@ -2391,7 +2458,9 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
+ R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -2471,6 +2540,181 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
}
};
+/// A recipe to represent inloop extended reduction operations, performing a
+/// reduction on an extended vector operand into a scalar value, and adding the
+/// result to a chain. This recipe is abstract and needs to be lowered to
+/// concrete recipes before codegen. The operands are {ChainOp, VecOp,
+/// [Condition]}.
+class VPExtendedReductionRecipe : public VPReductionRecipe {
+ /// Opcode of the extend recipe will be lowered to.
+ Instruction::CastOps ExtOp;
+
+ Type *ResultTy;
+
+ /// For cloning VPExtendedReductionRecipe.
+ VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed)
+ : VPReductionRecipe(
+ VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(),
+ {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(),
+ ExtRed->isOrdered(), ExtRed->getDebugLoc()),
+ ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) {
+ transferFlags(*ExtRed);
+ }
+
+public:
+ VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext)
+ : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(),
+ R->isOrdered(), Ext->getDebugLoc()),
+ ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
+ // Not all WidenCastRecipes contain nneg flag. Need to transfer flags from
+ // the original recipe to prevent setting wrong flags.
+ transferFlags(*Ext);
+ }
+
+ ~VPExtendedReductionRecipe() override = default;
+
+ VPExtendedReductionRecipe *clone() override {
+ auto *Copy = new VPExtendedReductionRecipe(this);
+ Copy->transferFlags(*this);
+ return Copy;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC);
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPExtendedReductionRecipe should be transform to "
+ "VPExtendedRecipe + VPReductionRecipe before execution.");
+ };
+
+ /// Return the cost of VPExtendedReductionRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The scalar type after extending.
+ Type *getResultType() const { return ResultTy; }
+
+ /// Is the extend ZExt?
+ bool isZExt() const { return getExtOpcode() == Instruction::ZExt; }
+
+ /// The opcode of extend recipe.
+ Instruction::CastOps getExtOpcode() const { return ExtOp; }
+};
+
+/// A recipe to represent inloop MulAccumulateReduction operations, performing a
+/// reduction.add on the product of the (possibly extended) vector operands
+/// into a scalar value, and adding the result to a chain. This
+/// recipe is abstract and needs to be lowered to concrete recipes before
+/// codegen. The operands are {ChainOp, VecOp0, VecOp1, [Condition]}.
+class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
+ /// Opcode of the extend recipe.
+ Instruction::CastOps ExtOp;
+
+ /// Non-neg flag of the extend recipe.
+ bool IsNonNeg = false;
+
+ Type *ResultTy;
+
+ /// For cloning VPMulAccumulateReductionRecipe.
+ VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(),
+ {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()},
+ MulAcc->getCondOp(), MulAcc->isOrdered(),
+ WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
+ MulAcc->getDebugLoc()),
+ ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
+ ResultTy(MulAcc->getResultType()) {}
+
+public:
+ VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
+ VPWidenCastRecipe *Ext0,
+ VPWidenCastRecipe *Ext1, Type *ResultTy)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},
+ R->getCondOp(), R->isOrdered(),
+ WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
+ R->getDebugLoc()),
+ ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) {
+ assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
+ Instruction::Add &&
+ "The reduction instruction in MulAccumulateteReductionRecipe must "
+ "be Add");
+ // Only set the non-negative flag if the original recipe contains.
+ if (Ext0->hasNonNegFlag())
+ IsNonNeg = Ext0->isNonNeg();
+ }
+
+ VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul)
+ : VPReductionRecipe(
+ VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
+ {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)},
+ R->getCondOp(), R->isOrdered(),
+ WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
+ R->getDebugLoc()),
+ ExtOp(Instruction::CastOps::CastOpsEnd) {
+ assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
+ Instruction::Add &&
+ "The reduction instruction in MulAccumulateReductionRecipe must be "
+ "Add");
+ }
+
+ ~VPMulAccumulateReductionRecipe() override = default;
+
+ VPMulAccumulateReductionRecipe *clone() override {
+ auto *Copy = new VPMulAccumulateReductionRecipe(this);
+ Copy->transferFlags(*this);
+ return Copy;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC);
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPMulAccumulateReductionRecipe should transform to "
+ "VPWidenCastRecipe + "
+ "VPWidenRecipe + VPReductionRecipe before execution");
+ }
+
+ /// Return the cost of VPMulAccumulateReductionRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ Type *getResultType() const {
+ assert(isExtended() && "Only support getResultType when this recipe "
+ "contains implicit extend.");
+ return ResultTy;
+ }
+
+ /// The VPValue of the vector value to be extended and reduced.
+ VPValue *getVecOp0() const { return getOperand(1); }
+ VPValue *getVecOp1() const { return getOperand(2); }
+
+ /// Return if this MulAcc recipe contains extended operands.
+ bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; }
+
+ /// Return the opcode of the extends for the operands.
+ Instruction::CastOps getExtOpcode() const { return ExtOp; }
+
+ /// Return if the operands are zero extended.
+ bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; }
+
+ /// Return the non negative flag of the ext recipe.
+ bool isNonNeg() const { return IsNonNeg; }
+};
+
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c86815c84d8d9..7dcbd72c25191 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -273,6 +273,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
+ .Case<VPExtendedReductionRecipe, VPMulAccumulateReductionRecipe>(
+ [](const auto *R) { return R->getResultType(); })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 75d056026025a..32e35b0fb78d3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -71,6 +71,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -118,6 +120,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -155,6 +159,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
+ case VPExtendedReductionSC:
+ case VPMulAccumulateReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
@@ -2489,28 +2495,49 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
FastMathFlags FMFs = getFastMathFlags();
+ std::optional<FastMathFlags> OptionalFMF =
+ ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
- // TODO: Support any-of and in-loop reductions.
+ // TODO: Support any-of reductions.
assert(
(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
"Any-of reduction not implemented in VPlan-based cost model currently.");
- assert(
- (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
- ForceTargetInstructionCost.getNumOccurrences() > 0) &&
- "In-loop reduction not implemented in VPlan-based cost model currently.");
- // Cost = Reduction cost + BinOp cost
- InstructionCost Cost =
- Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
- return Cost +
- Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
+ return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
}
- return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
- Ctx.CostKind);
+ return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
+ Ctx.CostKind);
+}
+
+InstructionCost
+VPExtendedReductionRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
+ Type *RedTy = Ctx.Types.inferScalarType(this);
+ auto *SrcVecTy =
+ cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
+ assert(RedTy->isIntegerTy() &&
+ "Exte...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
19d217b
to
c95c790
Compare
auto *Copy = new VPExtendedReductionRecipe(this); | ||
Copy->transferFlags(*this); | ||
return Copy; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
auto *Copy = new VPExtendedReductionRecipe(this); | |
Copy->transferFlags(*this); | |
return Copy; | |
return new VPExtendedReductionRecipe(this); |
The constructor already calls transferFlags
, so I don't think you need to do it a second time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given that this is just code split out from the already reviewed PR #113903, this looks good to me (with @huntergr-arm's comment addressed)
One minor nit on the commit message:
and correspond vplan transformations
->
and corresponding vplan transformations
} | ||
|
||
InstructionCost | ||
VPExtendedReductionRecipe::computeCost(ElementCount VF, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that this patch still relies on the legacy cost model to calculate the cost for these patterns.
Will enable vplan-based cost decision in #113903.
as per the description, I'd expect no need to implement computeCost in this patch (and no need to change
VPReductionRecipe::computeCost).
Is it possible to remove those? I think you might have to preserve the original underlying instruction in the new reduction recipes, so we can actually lookup the legacy cost?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's hard to do that.
ConvertToAbstract()
will be executed before precomputeCost()
. So we cannot get the legacy cost at transformation.
Moving the precomputeCost()
just before convertToAbstract()
recipe may get the wrong cost for other recipes since the VPlan construction/transformation is not finished yet.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, but does convertToAbstract
actually call computeCost for the new recipes? From a quick look it seems like they aren't called; instead it uses TTI directly for the cost of the yet-to-be-created abstract recipes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. Removed. Thanks!
} | ||
|
||
InstructionCost | ||
VPExtendedReductionRecipe::computeCost(ElementCount VF, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, but does convertToAbstract
actually call computeCost for the new recipes? From a quick look it seems like they aren't called; instead it uses TTI directly for the cost of the yet-to-be-created abstract recipes?
} | ||
|
||
return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs, | ||
Ctx.CostKind); | ||
return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is the cost of the bin-op being dropped? VPReductionRecipe should perform a reduction on the vector op and scalar bin-op to add the result to the chain I think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it will generate a scalar BinOp.
Removing this was an attempt to match the legacy cost. This change is moved to the follow-up patch.
@fhahn apologies for the prod, but could you give this another look? There are several other PRs that are gated by this one, so it would be great to see this land so the follow-up work can make some progress. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Glad to see getting rid of the cost changes worked out!
Some more suggestions, mostly around trying to have the API comments consistent.
I also did some testing and ran into an assert with the IR below
; bin/opt -prefer-inloop-reductions=true -p loop-vectorize
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx15.0.0"
define i32 @test(i32 %0) {
entry:
br label %for.body143.i.i
for.body143.i.i: ; preds = %for.end185.i.i, %entry
%summa.1191.i.i = phi float [ 0.000000e+00, %entry ], [ %conv200.i.i, %for.end185.i.i ]
br label %for.body157.i.i
for.body157.i.i: ; preds = %for.body157.i.i, %for.body143.i.i
%indvars.iv229.i.i1 = phi i64 [ 0, %for.body143.i.i ], [ %indvars.iv.next230.i.i, %for.body157.i.i ]
%summer.1186.i.i = phi i32 [ 0, %for.body143.i.i ], [ %add174.i.i, %for.body157.i.i ]
%conv167.i.i = fptosi float %summa.1191.i.i to i32
%mul173.i.i = mul i32 %0, %conv167.i.i
%add174.i.i = add i32 %mul173.i.i, %summer.1186.i.i
%indvars.iv.next230.i.i = add i64 %indvars.iv229.i.i1, 1
%exitcond232.not.i.i = icmp eq i64 %indvars.iv229.i.i1, 32
br i1 %exitcond232.not.i.i, label %for.end185.i.i, label %for.body157.i.i
for.end185.i.i: ; preds = %for.body157.i.i
%conv200.i.i = sitofp i32 %add174.i.i to float
br label %for.body143.i.i
}
ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) { | ||
assert((ExtOp == Instruction::CastOps::ZExt || | ||
ExtOp == Instruction::CastOps::SExt) && | ||
"VPExtendedReductionRecipe only support zext and sext."); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"VPExtendedReductionRecipe only support zext and sext."); | |
"VPExtendedReductionRecipe only supports zext and sext."); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
// Not all WidenCastRecipes contain nneg flag. Need to transfer flags from | ||
// the original recipe to prevent setting wrong flags. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this comment still accurate? It unconditionally calls transferFlags now?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed, thanks!
/// concrete recipes before codegen. The operands are {ChainOp, VecOp, | ||
/// [Condition]}. | ||
class VPExtendedReductionRecipe : public VPReductionRecipe { | ||
/// Opcode of the extend recipe will be lowered to. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// Opcode of the extend recipe will be lowered to. | |
/// Opcode of the extend for VecOp. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
/// Is the extend ZExt? | ||
bool isZExt() const { return getExtOpcode() == Instruction::ZExt; } | ||
|
||
/// The opcode of extend recipe. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// The opcode of extend recipe. | |
/// Get the opcode of the extend for VecOp. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
|
||
VPValue *VecOp = Red->getVecOp(); | ||
VPValue *A, *B; | ||
// Try to match reduce.add(mul(...)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Try to match reduce.add(mul(...)) | |
// Try to match reduce.add(mul(...)). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated.
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); | ||
auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe()); | ||
|
||
// Match reduce.add(mul(ext, ext)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Match reduce.add(mul(ext, ext)) | |
// Match reduce.add(mul(ext, ext)). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated.
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); | ||
if (Opcode != Instruction::Add) | ||
return nullptr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move early exit to the top of the function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved. Thanks!
static void | ||
expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) { | ||
// Generate inner VPWidenCastRecipes if necessary. | ||
// Note that we will drop the extend after mul which transform |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Note that we will drop the extend after mul which transform | |
// Note that we will drop the extend after mul which transforms |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated.
assert(isExtended() && "Only support getResultType when this recipe " | ||
"contains implicit extend."); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
assert(isExtended() && "Only support getResultType when this recipe " | |
"contains implicit extend."); | |
assert(isExtended() && "Only support getResultType when this recipe " | |
"is implicitly extend."); |
I saw this assert triggering when doing some testing of the PR:
; bin/opt -prefer-inloop-reductions=true -p loop-vectorize
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx15.0.0"
define i32 @test(i32 %0) {
entry:
br label %for.body143.i.i
for.body143.i.i: ; preds = %for.end185.i.i, %entry
%summa.1191.i.i = phi float [ 0.000000e+00, %entry ], [ %conv200.i.i, %for.end185.i.i ]
br label %for.body157.i.i
for.body157.i.i: ; preds = %for.body157.i.i, %for.body143.i.i
%indvars.iv229.i.i1 = phi i64 [ 0, %for.body143.i.i ], [ %indvars.iv.next230.i.i, %for.body157.i.i ]
%summer.1186.i.i = phi i32 [ 0, %for.body143.i.i ], [ %add174.i.i, %for.body157.i.i ]
%conv167.i.i = fptosi float %summa.1191.i.i to i32
%mul173.i.i = mul i32 %0, %conv167.i.i
%add174.i.i = add i32 %mul173.i.i, %summer.1186.i.i
%indvars.iv.next230.i.i = add i64 %indvars.iv229.i.i1, 1
%exitcond232.not.i.i = icmp eq i64 %indvars.iv229.i.i1, 32
br i1 %exitcond232.not.i.i, label %for.end185.i.i, label %for.body157.i.i
for.end185.i.i: ; preds = %for.body157.i.i
%conv200.i.i = sitofp i32 %add174.i.i to float
br label %for.body143.i.i
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated and fixed by 39f14c4.
Should I add this test case in a new file? It does not trigger the failure when the test is put in mve-reductions.ll
and mve-reduction-types.ll
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Best to add it as separate test, if it requires AArch64.
34f19db
to
39f14c4
Compare
if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R)) | ||
expandVPExtendedReduction(ExtRed); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R)) | |
expandVPExtendedReduction(ExtRed); | |
if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R)) { | |
expandVPExtendedReduction(ExtRed); | |
continue; | |
} |
otherwise there may be a use-after-free below if R has been removed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks.
assert(isExtended() && "Only support getResultType when this recipe " | ||
"contains implicit extend."); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Best to add it as separate test, if it requires AArch64.
/// Non-neg flag of the extend recipe. | ||
bool IsNonNeg = false; | ||
|
||
Type *ResultTy; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this say if extended otherwise nullptr
and initialize to nullptr
?
WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), | ||
MulAcc->getDebugLoc()), | ||
ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()) { | ||
if (MulAcc->isExtended()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we even form the recipe if no extend is needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This recipe is for reduce.add(mul(...))
with optional extends. I think we still need to create this recipe even without extends.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes, but still should set the result type here as well, otherwise getResultType
may read uninitialized memory?
We have the result type available at construction through inference, would be good to always pass it to the constructor, even if there's no extend and always copy it when cloning?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, always setting the result type here. Thanks, that's better.
The type of the original reduction recipe will be inferred when constructing the VPMulAccumulateReductionRecipe in tryToMatchAndCreateMulAccumulateReduction()
.
template <typename IterT> | ||
VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, ArrayRef<IterT> Operands, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
template <typename IterT> | |
VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, ArrayRef<IterT> Operands, | |
VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, ArrayRef<VPValue *> Operands, |
should work
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
template <typename IterT> | ||
VPWidenRecipe(unsigned Opcode, ArrayRef<IterT> Operands, bool NUW, bool NSW, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
template <typename IterT> | |
VPWidenRecipe(unsigned Opcode, ArrayRef<IterT> Operands, bool NUW, bool NSW, | |
VPWidenRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands, bool NUW, bool NSW, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), | ||
MulAcc->getDebugLoc()), | ||
ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()) { | ||
if (MulAcc->isExtended()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes, but still should set the result type here as well, otherwise getResultType
may read uninitialized memory?
We have the result type available at construction through inference, would be good to always pass it to the constructor, even if there's no extend and always copy it when cloning?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the latest updates! A few more small comments, LGTM with those addressed
/// A recipe to represent inloop MulAccumulateReduction operations, performing a | ||
/// reduction.add on the result of vector operands (might be extended) | ||
/// multiplication into a scalar value, and adding the result to a chain. This |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// A recipe to represent inloop MulAccumulateReduction operations, performing a | |
/// reduction.add on the result of vector operands (might be extended) | |
/// multiplication into a scalar value, and adding the result to a chain. This | |
/// A recipe to represent inloop MulAccumulateReduction operations, multiplying the vector operands (which may be extended), performing a reduction.add on the result, and adding the scalar result to a chain. This |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
|
||
VPMulAccumulateReductionRecipe *clone() override { | ||
auto *Copy = new VPMulAccumulateReductionRecipe(this); | ||
Copy->transferFlags(*this); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can be moved to the constructor, same as already done in VPExtendedReductionRecipe
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved, thanks!
// TODO: Support any-of reductions. | ||
assert( | ||
(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || | ||
ForceTargetInstructionCost.getNumOccurrences() > 0) && | ||
"Any-of reduction not implemented in VPlan-based cost model currently."); | ||
assert( | ||
(!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() || | ||
ForceTargetInstructionCost.getNumOccurrences() > 0) && | ||
"In-loop reduction not implemented in VPlan-based cost model currently."); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Original code still applies to the latest version right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this needs to be removed because the pure reduction cost (no extend/mul...) has to be calculated in the transformations.
WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), | ||
MulAcc->getDebugLoc()), | ||
ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), | ||
ResultTy(MulAcc->getResultType()) {} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
note: underlying value not set here, but that should be fine as cost is computed before cloning at the moment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Set the underlying value in ctor.
VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(), | ||
{ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(), | ||
ExtRed->isOrdered(), ExtRed->getDebugLoc()), | ||
ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
note: underlying value not set here, but that should be fine as cost is computed before cloning at the moment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks!
… to abstract recipe. This patch introduces two new recipes. * VPExtendedReductionRecipe - cast + reduction. * VPMulAccumulateReductionRecipe - (cast) + mul + reduction. This patch also implements the transformation that matches the following patterns via vplan and converts them to abstract recipes for better cost estimation. * VPExtendedReduction - reduce(cast(...)) * VPMulAccumulateReductionRecipe - reduce.add(mul(...)) - reduce.add(mul(ext(...), ext(...))) - reduce.add(ext(mul(ext(...), ext(...)))) The converted abstract recipes will be lowered to the concrete recipes (widen-cast + widen-mul + reduction) just before recipe execution. Split from llvm#113903.
9e97bd0
to
fca5a28
Compare
This patch introduces two new recipes.
VPExtendedReductionRecipe
VPMulAccumulateReductionRecipe
This patch also implements the transformation that matches the following
patterns via vplan and converts to abstract recipes for better cost
estimation.
VPExtendedReduction
VPMulAccumulateReductionRecipe
The converted abstract recipes will be lowered to the concrete recipes
(widen-cast + widen-mul + reduction) just before recipe execution.
Note that this patch still relies on the legacy cost model to calculate the cost for these patterns.
Will enable vplan-based cost decision in #113903.
Split from #113903.