Skip to content

Commit 7871f9e

Browse files
committed
[VPlan] Implement transformation of widen-cast/widen-mul + reduction to abstract recipe.
This patch implements the transformation that matches the following patterns in the VPlan and converts them to abstract recipes for better cost estimation. * VPExtendedReductionRecipe - cast + reduction. * VPMulAccumulateReductionRecipe - (cast) + mul + reduction. The converted abstract recipes will be lowered to the concrete recipes (widen-cast + widen-mul + reduction) just before vector codegen. This should be a cost-model-based decision, which will be implemented in a following patch. In the current state, we still rely on the legacy cost model to calculate the right cost. Split from #113903.
1 parent 22578ee commit 7871f9e

10 files changed

+535
-75
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9631,10 +9631,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
96319631
"entry block must be set to a VPRegionBlock having a non-empty entry "
96329632
"VPBasicBlock");
96339633

9634-
for (ElementCount VF : Range)
9635-
Plan->addVF(VF);
9636-
Plan->setName("Initial VPlan");
9637-
96389634
// Update wide induction increments to use the same step as the corresponding
96399635
// wide induction. This enables detecting induction increments directly in
96409636
// VPlan and removes redundant splats.
@@ -9670,6 +9666,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
96709666
// Adjust the recipes for any inloop reductions.
96719667
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
96729668

9669+
// Transform recipes to abstract recipes if it is legal and beneficial and
9670+
// clamp the range for better cost estimation.
9671+
// TODO: Enable following transform when the EVL-version of extended-reduction
9672+
// and mulacc-reduction are implemented.
9673+
if (!CM.foldTailWithEVL()) {
9674+
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
9675+
CM.CostKind);
9676+
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
9677+
CostCtx, Range);
9678+
}
9679+
9680+
for (ElementCount VF : Range)
9681+
Plan->addVF(VF);
9682+
Plan->setName("Initial VPlan");
9683+
96739684
// Interleave memory: for each Interleave Group we marked earlier as relevant
96749685
// for this VPlan, replace the Recipes widening its memory instructions with a
96759686
// single VPInterleaveRecipe at its insertion point.

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1252,11 +1252,22 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
12521252
: VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I),
12531253
Opcode(I.getOpcode()) {}
12541254

1255+
template <typename IterT>
1256+
VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode,
1257+
iterator_range<IterT> Operands, bool NUW, bool NSW, DebugLoc DL)
1258+
: VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL),
1259+
Opcode(Opcode) {}
1260+
12551261
public:
12561262
template <typename IterT>
12571263
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
12581264
: VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
12591265

1266+
template <typename IterT>
1267+
VPWidenRecipe(unsigned Opcode, iterator_range<IterT> Operands, bool NUW,
1268+
bool NSW, DebugLoc DL)
1269+
: VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {}
1270+
12601271
~VPWidenRecipe() override = default;
12611272

12621273
VPWidenRecipe *clone() override {
@@ -1301,10 +1312,16 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
13011312
"opcode of underlying cast doesn't match");
13021313
}
13031314

1304-
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
1305-
: VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPIRMetadata(),
1315+
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL = {})
1316+
: VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, DL), VPIRMetadata(),
13061317
Opcode(Opcode), ResultTy(ResultTy) {}
13071318

1319+
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1320+
bool IsNonNeg, DebugLoc DL = {})
1321+
: VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, NonNegFlagsTy(IsNonNeg),
1322+
DL),
1323+
Opcode(Opcode), ResultTy(ResultTy) {}
1324+
13081325
~VPWidenCastRecipe() override = default;
13091326

13101327
VPWidenCastRecipe *clone() override {

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2495,40 +2495,49 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
24952495
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
24962496
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
24972497
FastMathFlags FMFs = getFastMathFlags();
2498+
std::optional<FastMathFlags> OptionalFMF =
2499+
ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
24982500

2499-
// TODO: Support any-of and in-loop reductions.
2501+
// TODO: Support any-of reductions.
25002502
assert(
25012503
(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
25022504
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
25032505
"Any-of reduction not implemented in VPlan-based cost model currently.");
2504-
assert(
2505-
(!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
2506-
ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2507-
"In-loop reduction not implemented in VPlan-based cost model currently.");
25082506

2509-
// Cost = Reduction cost + BinOp cost
2510-
InstructionCost Cost =
2511-
Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
25122507
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
25132508
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
2514-
return Cost +
2515-
Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2509+
return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
25162510
}
25172511

2518-
return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
2519-
Ctx.CostKind);
2512+
return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2513+
Ctx.CostKind);
25202514
}
25212515

25222516
InstructionCost
25232517
VPExtendedReductionRecipe::computeCost(ElementCount VF,
25242518
VPCostContext &Ctx) const {
2525-
return 0;
2519+
unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
2520+
Type *RedTy = Ctx.Types.inferScalarType(this);
2521+
auto *SrcVecTy =
2522+
cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
2523+
assert(RedTy->isIntegerTy() &&
2524+
"ExtendedReduction only support integer type currently.");
2525+
InstructionCost Cost = Ctx.TTI.getExtendedReductionCost(
2526+
Opcode, isZExt(), RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
2527+
// The cost of this recipe should be decided by the legacy model.
2528+
return Cost.isValid() ? 0 : Cost;
25262529
}
25272530

25282531
InstructionCost
25292532
VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
25302533
VPCostContext &Ctx) const {
2531-
return 0;
2534+
Type *RedTy = Ctx.Types.inferScalarType(this);
2535+
auto *SrcVecTy =
2536+
cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
2537+
InstructionCost Cost =
2538+
Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy, Ctx.CostKind);
2539+
// The cost of this recipe should be decided by the legacy model.
2540+
return Cost.isValid() ? 0 : Cost;
25322541
}
25332542

25342543
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

0 commit comments

Comments
 (0)