Skip to content

Commit 372daf0

Browse files
[LV] Decompose WidenIntOrFPInduction into phi and update recipes
The Loop Vectorizer still has two recipes, `VPWidenIntOrFpInductionRecipe` and `VPWidenPointerInductionRecipe`, that behave as phi-like recipes in a VPlan because they derive from `VPHeaderPHIRecipe`, but whose generate functions construct both the vector phi and the vector self-update in the vectorized loop. This not only hurts the readability of a VPlan, but also requires more code to maintain such behavior. For instance, there is already ad-hoc code motion to move the generated updates of these recipes closer to the loop latch. The changeset: * Adds `WidenVFxUF` to represent the `broadcast({1...UF} x VFxUF)` value * Decomposes the existing `VPWidenIntOrFpInductionRecipe` into ``` WIDEN-INDUCTION vp<%iv> = phi ir<0>, vp<%be-value> ... EMIT vp<%widen-step> = mul ir<%step>, vp<WidenVFxUF> EMIT vp<%be-value> = add vp<%iv>, vp<%widen-step> ``` * Moves the trunc optimization of the widened IV into a VPlan transform * Adds trivial cyclic dependency removal and marks some binops as non-side-effecting * Adds an element type to `VPValue` so that it can be queried for artificially added `VPValue`s without an underlying instruction
1 parent 36e73e4 commit 372daf0

File tree

171 files changed

+16092
-10117
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

171 files changed

+16092
-10117
lines changed

llvm/include/llvm/Analysis/IVDescriptors.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,11 @@ class InductionDescriptor {
363363
return nullptr;
364364
}
365365

366+
/// Const overload of getExactFPMathInst(): yields the same instruction as the
/// non-const overload, but as a pointer-to-const.
/// NOTE(review): relies on a non-const getExactFPMathInst() overload being
/// declared in this class (presumably the one ending just above) - confirm.
const Instruction *getExactFPMathInst() const {
  // Drop constness of `this` only to select the non-const overload; the
  // result is handed back as a pointer-to-const.
  auto *MutableThis = const_cast<InductionDescriptor *>(this);
  const Instruction *ExactFPInst = MutableThis->getExactFPMathInst();
  return ExactFPInst;
}
370+
366371
/// Returns binary opcode of the induction operator.
367372
Instruction::BinaryOps getInductionOpcode() const {
368373
return InductionBinOp ? InductionBinOp->getOpcode()

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 88 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8114,34 +8114,6 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
81148114
return nullptr;
81158115
}
81168116

8117-
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8118-
TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8119-
// Optimize the special case where the source is a constant integer
8120-
// induction variable. Notice that we can only optimize the 'trunc' case
8121-
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8122-
// (c) other casts depend on pointer size.
8123-
8124-
// Determine whether \p K is a truncation based on an induction variable that
8125-
// can be optimized.
8126-
auto isOptimizableIVTruncate =
8127-
[&](Instruction *K) -> std::function<bool(ElementCount)> {
8128-
return [=](ElementCount VF) -> bool {
8129-
return CM.isOptimizableIVTruncate(K, VF);
8130-
};
8131-
};
8132-
8133-
if (LoopVectorizationPlanner::getDecisionAndClampRange(
8134-
isOptimizableIVTruncate(I), Range)) {
8135-
8136-
auto *Phi = cast<PHINode>(I->getOperand(0));
8137-
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8138-
VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8139-
return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8140-
*OrigLoop, Range);
8141-
}
8142-
return nullptr;
8143-
}
8144-
81458117
VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
81468118
ArrayRef<VPValue *> Operands,
81478119
VPlanPtr &Plan) {
@@ -8275,6 +8247,70 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
82758247
Range);
82768248
}
82778249

8250+
/// Create a VPWidenCastRecipe converting \p V from type \p From to type \p To.
///
/// Returns nullptr when \p From and \p To are the same type, i.e. no cast is
/// needed. Otherwise the cast opcode is selected as follows:
///  * integer -> integer:  Trunc when narrowing, ZExt otherwise;
///  * FP      -> FP:       FPTrunc when narrowing, FPExt otherwise;
///  * other   -> integer:  FPToUI;
///  * anything else:       UIToFP.
/// Integer values are always treated as unsigned. The returned recipe is not
/// inserted into any block; that is left to the caller.
VPWidenCastRecipe *VPRecipeBuilder::createCast(VPValue *V, Type *From,
                                               Type *To) {
  if (From == To)
    return nullptr;

  // Only evaluated for same-kind conversions, where comparing the primitive
  // sizes of the two types is meaningful.
  auto IsNarrowing = [&]() {
    return To->getPrimitiveSizeInBits() < From->getPrimitiveSizeInBits();
  };

  Instruction::CastOps CastOpcode;
  if (To->isIntegerTy() && From->isIntegerTy())
    CastOpcode = IsNarrowing() ? Instruction::Trunc : Instruction::ZExt;
  else if (To->isFloatingPointTy() && From->isFloatingPointTy())
    // UIToFP requires an integer source, so an FP-to-FP conversion has to be
    // expressed as an FP truncation or extension instead.
    CastOpcode = IsNarrowing() ? Instruction::FPTrunc : Instruction::FPExt;
  else if (To->isIntegerTy())
    CastOpcode = Instruction::FPToUI;
  else
    CastOpcode = Instruction::UIToFP;

  return new VPWidenCastRecipe(CastOpcode, V, To);
}
8266+
8267+
/// Build the backedge update of the widened induction \p WIV.
///
/// The scalar step of the induction is multiplied by WidenVFxUF (cast to the
/// step type first when the types differ) and the product is combined with
/// \p WIV using the induction's binary opcode (Add when the descriptor reports
/// none). All recipes are inserted, in creation order, right before the
/// terminator of the exiting block of the vector loop region of \p Plan.
///
/// \param WIV            the widened int/FP induction to create the update for.
/// \param SE             used to materialize the step SCEV as a VPValue.
/// \param Plan           the plan that receives the new recipes.
/// \param CreatedRecipes optional; when non-null, every recipe created here
///                       (cast, multiplication and update) is added to it.
/// \returns the update recipe, i.e. the value to use as the backedge operand
///          of \p WIV.
VPRecipeBase *
VPRecipeBuilder::createWidenStep(VPWidenIntOrFpInductionRecipe &WIV,
                                 ScalarEvolution &SE, VPlan &Plan,
                                 DenseSet<VPRecipeBase *> *CreatedRecipes) {
  PHINode *PN = WIV.getPHINode();
  const InductionDescriptor &IndDesc = WIV.getInductionDescriptor();
  VPValue *ScalarStep =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
  Type *VFxUFTy = Plan.getVFxUF().getElementType();
  Type *StepTy = IndDesc.getStep()->getType();
  VPValue *WidenVFxUF = &Plan.getWidenVFxUF();
  VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();

  // Place a freshly created recipe in front of the latch terminator and report
  // it to the caller. Routing every recipe through this helper guarantees that
  // none of them (in particular the FP multiplication) is left unrecorded.
  auto Emit = [&](VPRecipeBase *R) {
    R->insertBefore(LatchVPBB->getTerminator());
    if (CreatedRecipes)
      CreatedRecipes->insert(R);
  };

  // Bring WidenVFxUF to the type of the step, if they differ.
  if (VPWidenCastRecipe *WidenVFxUFCast =
          createCast(WidenVFxUF, VFxUFTy, StepTy)) {
    Emit(WidenVFxUFCast);
    WidenVFxUF = WidenVFxUFCast->getVPSingleValue();
  }

  // The descriptor reports BinaryOpsEnd when it has no induction binary
  // operator; a plain Add is used in that case.
  const Instruction::BinaryOps UpdateOp =
      IndDesc.getInductionOpcode() != Instruction::BinaryOpsEnd
          ? IndDesc.getInductionOpcode()
          : Instruction::Add;

  VPInstruction *Mul;
  VPInstruction *Update;
  if (StepTy->isIntegerTy()) {
    Mul = new VPInstruction(Instruction::Mul, {WidenVFxUF, ScalarStep},
                            PN->getDebugLoc());
    Update = new VPInstruction(UpdateOp, {&WIV, Mul}, PN->getDebugLoc());
  } else {
    // Propagate the fast-math flags of the induction's FP instruction, when
    // the descriptor provides one, to both the multiplication and the update.
    FastMathFlags FMF = IndDesc.getExactFPMathInst()
                            ? IndDesc.getExactFPMathInst()->getFastMathFlags()
                            : FastMathFlags();
    Mul = new VPInstruction(Instruction::FMul, {WidenVFxUF, ScalarStep}, FMF,
                            PN->getDebugLoc());
    Update = new VPInstruction(UpdateOp, {&WIV, Mul}, FMF, PN->getDebugLoc());
  }
  Emit(Mul);
  Emit(Update);
  return Update;
}
8313+
82788314
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
82798315
ArrayRef<VPValue *> Operands,
82808316
VPBasicBlock *VPBB, VPlanPtr &Plan) {
@@ -8324,10 +8360,15 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
83248360
};
83258361
}
83268362

8327-
void VPRecipeBuilder::fixHeaderPhis() {
8363+
void VPRecipeBuilder::fixHeaderPhis(VPlan &Plan) {
83288364
BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
83298365
for (VPHeaderPHIRecipe *R : PhisToFix) {
8330-
auto *PN = cast<PHINode>(R->getUnderlyingValue());
8366+
if (auto *VPWIFR = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
8367+
VPWIFR->addOperand(
8368+
createWidenStep(*VPWIFR, *PSE.getSE(), Plan)->getVPSingleValue());
8369+
continue;
8370+
}
8371+
PHINode *PN = cast<PHINode>(R->getUnderlyingValue());
83318372
VPRecipeBase *IncR =
83328373
getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
83338374
R->addOperand(IncR->getVPSingleValue());
@@ -8405,8 +8446,12 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
84058446
// can have earlier phis as incoming values.
84068447
recordRecipeOf(Phi);
84078448

8408-
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8449+
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) {
8450+
if (isa<VPWidenPointerInductionRecipe>(Recipe))
8451+
return Recipe;
8452+
PhisToFix.push_back(cast<VPWidenIntOrFpInductionRecipe>(Recipe));
84098453
return Recipe;
8454+
}
84108455

84118456
VPHeaderPHIRecipe *PhiRecipe = nullptr;
84128457
assert((Legal->isReductionVariable(Phi) ||
@@ -8441,10 +8486,17 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
84418486
return PhiRecipe;
84428487
}
84438488

8444-
if (isa<TruncInst>(Instr) &&
8445-
(Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8446-
Range, *Plan)))
8447-
return Recipe;
8489+
if (isa<TruncInst>(Instr)) {
8490+
auto IsOptimizableIVTruncate =
8491+
[&](Instruction *K) -> std::function<bool(ElementCount)> {
8492+
return [=](ElementCount VF) -> bool {
8493+
return CM.isOptimizableIVTruncate(K, VF);
8494+
};
8495+
};
8496+
8497+
LoopVectorizationPlanner::getDecisionAndClampRange(
8498+
IsOptimizableIVTruncate(Instr), Range);
8499+
}
84488500

84498501
// All widen recipes below deal only with VF > 1.
84508502
if (LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8707,7 +8759,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
87078759
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
87088760
"entry block must be set to a VPRegionBlock having a non-empty entry "
87098761
"VPBasicBlock");
8710-
RecipeBuilder.fixHeaderPhis();
8762+
RecipeBuilder.fixHeaderPhis(*Plan);
87118763

87128764
// ---------------------------------------------------------------------------
87138765
// Transform initial VPlan: Apply previously taken decisions, in order, to

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,18 @@ class VPRecipeBuilder {
146146
/// between SRC and DST.
147147
VPValue *getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const;
148148

149+
/// A helper function to create VPWidenCastRecipe of a \p V VPValue to a \p To
150+
/// type.
151+
/// FIXME: Remove \p From argument and take it from a \p V value
152+
static VPWidenCastRecipe *createCast(VPValue *V, Type *From, Type *To);
153+
154+
/// A helper function which widens \p WIV step, multiplies it by WidenVFxUF
155+
/// and attaches to loop latch of the \p Plan. Returns the induction update.
156+
static VPRecipeBase *
157+
createWidenStep(VPWidenIntOrFpInductionRecipe &WIV, ScalarEvolution &SE,
158+
VPlan &Plan,
159+
DenseSet<VPRecipeBase *> *CreatedRecipes = nullptr);
160+
149161
/// Mark given ingredient for recording its recipe once one is created for
150162
/// it.
151163
void recordRecipeOf(Instruction *I) {
@@ -171,7 +183,7 @@ class VPRecipeBuilder {
171183

172184
/// Add the incoming values from the backedge to reduction & first-order
173185
/// recurrence cross-iteration phis.
174-
void fixHeaderPhis();
186+
void fixHeaderPhis(VPlan &Plan);
175187
};
176188
} // end namespace llvm
177189

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 54 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,25 @@ Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
7676
llvm_unreachable("Unknown lane kind");
7777
}
7878

79-
VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
80-
: SubclassID(SC), UnderlyingVal(UV), Def(Def) {
79+
/// Construct a VPValue with subclass id \p SC, optional underlying IR value
/// \p UV, optional defining recipe \p Def and optional explicit type \p Ty.
/// When both \p UV and \p Ty are given, the type of \p UV must be \p Ty. A
/// non-null \p Def gets this value registered as one of its defined values.
VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def, Type *Ty)
    : SubclassID(SC), UnderlyingVal(UV), UnderlyingTy(Ty), Def(Def) {
  // An explicit type is only allowed to disagree with nothing: either there is
  // no underlying value, or its type is exactly the explicit one.
  assert((!UnderlyingTy || !UnderlyingVal ||
          UnderlyingVal->getType() == UnderlyingTy) &&
         "VPValue with set type should either be created without underlying "
         "value or type should match the given type");
  if (!Def)
    return;
  Def->addDefinedValue(this);
}
8488

89+
/// Non-const accessor for the type of this value; forwards to the const
/// overload and strips the constness from its result.
Type *VPValue::getElementType() {
  const VPValue *ConstThis = this;
  return const_cast<Type *>(ConstThis->getElementType());
}
93+
94+
/// Return the type of the underlying IR value when there is one, and the
/// explicitly recorded type (possibly null) otherwise.
const Type *VPValue::getElementType() const {
  if (UnderlyingVal)
    return UnderlyingVal->getType();
  return UnderlyingTy;
}
97+
8598
VPValue::~VPValue() {
8699
assert(Users.empty() && "trying to delete a VPValue with remaining users");
87100
if (Def)
@@ -763,6 +776,10 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) {
763776
auto Plan = std::make_unique<VPlan>(Preheader, VecPreheader);
764777
Plan->TripCount =
765778
vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE);
779+
Type *TCType = TripCount->getType();
780+
Plan->getVectorTripCount().setElementType(TCType);
781+
Plan->getVFxUF().setElementType(TCType);
782+
Plan->getWidenVFxUF().setElementType(TCType);
766783
// Create empty VPRegionBlock, to be filled during processing later.
767784
auto *TopRegion = new VPRegionBlock("vector loop", false /*isReplicator*/);
768785
VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
@@ -796,6 +813,18 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
796813
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF),
797814
0);
798815

816+
if (WidenVFxUF.getNumUsers() > 0)
817+
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
818+
Value *Step =
819+
createStepForVF(Builder, TripCountV->getType(), State.VF, Part+1);
820+
if (State.VF.isScalar())
821+
State.set(&WidenVFxUF, Step, Part);
822+
else
823+
State.set(&WidenVFxUF,
824+
Builder.CreateVectorSplat(State.VF, Step, "widen.vfxuf"),
825+
Part);
826+
}
827+
799828
// When vectorizing the epilogue loop, the canonical induction start value
800829
// needs to be changed from zero to the value after the main vector loop.
801830
// FIXME: Improve modeling for canonical IV start values in the epilogue loop.
@@ -845,21 +874,16 @@ void VPlan::execute(VPTransformState *State) {
845874
if (isa<VPWidenPHIRecipe>(&R))
846875
continue;
847876

848-
if (isa<VPWidenPointerInductionRecipe>(&R) ||
849-
isa<VPWidenIntOrFpInductionRecipe>(&R)) {
877+
if (isa<VPWidenPointerInductionRecipe>(&R)) {
850878
PHINode *Phi = nullptr;
851-
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
852-
Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
853-
} else {
854-
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
855-
// TODO: Split off the case that all users of a pointer phi are scalar
856-
// from the VPWidenPointerInductionRecipe.
857-
if (WidenPhi->onlyScalarsGenerated(State->VF.isScalable()))
858-
continue;
859-
860-
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
861-
Phi = cast<PHINode>(GEP->getPointerOperand());
862-
}
879+
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
880+
// TODO: Split off the case that all users of a pointer phi are scalar
881+
// from the VPWidenPointerInductionRecipe.
882+
if (WidenPhi->onlyScalarsGenerated(State->VF.isScalable()))
883+
continue;
884+
885+
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
886+
Phi = cast<PHINode>(GEP->getPointerOperand());
863887

864888
Phi->setIncomingBlock(1, VectorLatchBB);
865889

@@ -877,6 +901,7 @@ void VPlan::execute(VPTransformState *State) {
877901
// generated.
878902
bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
879903
isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
904+
isa<VPWidenIntOrFpInductionRecipe>(PhiR) ||
880905
(isa<VPReductionPHIRecipe>(PhiR) &&
881906
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
882907
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
@@ -908,6 +933,12 @@ void VPlan::printLiveIns(raw_ostream &O) const {
908933
O << " = VF * UF";
909934
}
910935

936+
if (WidenVFxUF.getNumUsers() > 0) {
937+
O << "\nLive-in ";
938+
WidenVFxUF.printAsOperand(O, SlotTracker);
939+
O << " = WIDEN VF * UF";
940+
}
941+
911942
if (VectorTripCount.getNumUsers() > 0) {
912943
O << "\nLive-in ";
913944
VectorTripCount.printAsOperand(O, SlotTracker);
@@ -1083,6 +1114,11 @@ VPlan *VPlan::duplicate() {
10831114
}
10841115
Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
10851116
Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
1117+
Old2NewVPValues[&WidenVFxUF] = &NewPlan->WidenVFxUF;
1118+
NewPlan->getVectorTripCount().setElementType(
1119+
getVectorTripCount().getElementType());
1120+
NewPlan->getVFxUF().setElementType(getVFxUF().getElementType());
1121+
NewPlan->getWidenVFxUF().setElementType(getWidenVFxUF().getElementType());
10861122
if (BackedgeTakenCount) {
10871123
NewPlan->BackedgeTakenCount = new VPValue();
10881124
Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount;
@@ -1379,6 +1415,8 @@ void VPSlotTracker::assignSlot(const VPValue *V) {
13791415
void VPSlotTracker::assignSlots(const VPlan &Plan) {
13801416
if (Plan.VFxUF.getNumUsers() > 0)
13811417
assignSlot(&Plan.VFxUF);
1418+
if (Plan.WidenVFxUF.getNumUsers() > 0)
1419+
assignSlot(&Plan.WidenVFxUF);
13821420
assignSlot(&Plan.VectorTripCount);
13831421
if (Plan.BackedgeTakenCount)
13841422
assignSlot(Plan.BackedgeTakenCount);

0 commit comments

Comments
 (0)