Skip to content

Commit 1a08a04

Browse files
committed
[VPlan] Implement interleaving as VPlan-to-VPlan transform.
This patch implements explicit interleaving as a VPlan-to-VPlan transform. In follow-up patches this will allow simplifying VPTransformState (no need to store unrolled parts) as well as recipe execution (no need to generate code for multiple parts in each recipe). It also allows for more general optimizations (e.g. avoiding code generation for recipes that are uniform across parts). In the initial implementation, a number of recipes still take the unrolled part as an additional, optional argument if their execution depends on the unrolled part. The computation of start/step values for scalable inductions changed slightly: previously the step would be computed as a scalar and then splatted; now vscale gets splatted and multiplied by the step in a vector mul. This has been split off from #94339, which also includes changes to simplify VPTransformState and the recipes' ::execute. The current version mostly leaves the existing ::execute implementations untouched and instead sets VPTransformState::UF to 1.
1 parent 5b04b6f commit 1a08a04

29 files changed

+908
-351
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,15 @@ class VPBuilder {
161161
return tryInsertInstruction(
162162
new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
163163
}
164+
165+
VPInstruction *createFPOp(unsigned Opcode,
166+
std::initializer_list<VPValue *> Operands,
167+
DebugLoc DL = {}, const Twine &Name = "",
168+
FastMathFlags FMFs = {}) {
169+
auto *Op = new VPInstruction(Opcode, Operands, FMFs, DL, Name);
170+
return tryInsertInstruction(Op);
171+
}
172+
164173
VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
165174
const Twine &Name = "") {
166175
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3476,10 +3476,7 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
34763476
// initial value for the recurrence when jumping to the scalar loop.
34773477
VPValue *VPExtract = LO->getOperand(0);
34783478
using namespace llvm::VPlanPatternMatch;
3479-
assert(match(VPExtract, m_VPInstruction<VPInstruction::ExtractFromEnd>(
3480-
m_VPValue(), m_VPValue())) &&
3481-
"FOR LiveOut expects to use an extract from end.");
3482-
Value *ResumeScalarFOR = State.get(VPExtract, UF - 1, true);
3479+
Value *ResumeScalarFOR = State.get(VPExtract, 0, true);
34833480

34843481
// Fix the initial value of the original recurrence in the scalar loop.
34853482
PHINode *ScalarHeaderPhi = LO->getPhi();
@@ -7429,6 +7426,8 @@ LoopVectorizationPlanner::executePlan(
74297426
"expanded SCEVs to reuse can only be used during epilogue vectorization");
74307427
(void)IsEpilogueVectorization;
74317428

7429+
VPlanTransforms::interleave(BestVPlan, BestUF,
7430+
OrigLoop->getHeader()->getModule()->getContext());
74327431
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
74337432

74347433
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
@@ -9152,42 +9151,59 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
91529151

91539152
auto *IVR = getParent()->getPlan()->getCanonicalIV();
91549153
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9154+
unsigned CurrentPart = 0;
9155+
if (getNumOperands() == 5)
9156+
CurrentPart =
9157+
cast<ConstantInt>(getOperand(4)->getLiveInIRValue())->getZExtValue();
91559158
Type *PhiType = IndDesc.getStep()->getType();
91569159

91579160
// Build a pointer phi
91589161
Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
91599162
Type *ScStValueType = ScalarStartValue->getType();
9160-
PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9161-
CanonicalIV->getIterator());
9163+
PHINode *NewPointerPhi = nullptr;
91629164

91639165
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9164-
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9166+
if (getNumOperands() == 5) {
9167+
auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(3), 0));
9168+
NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
9169+
} else {
9170+
NewPointerPhi =
9171+
PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9172+
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9173+
}
91659174

91669175
// A pointer induction, performed by using a gep
91679176
BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9177+
unsigned UF = getNumOperands() == 2
9178+
? 1
9179+
: cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
9180+
->getZExtValue();
91689181

91699182
Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
91709183
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
91719184
Value *NumUnrolledElems =
9172-
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9173-
Value *InductionGEP = GetElementPtrInst::Create(
9174-
State.Builder.getInt8Ty(), NewPointerPhi,
9175-
State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9176-
InductionLoc);
9185+
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
91779186
// Add induction update using an incorrect block temporarily. The phi node
91789187
// will be fixed after VPlan execution. Note that at this point the latch
91799188
// block cannot be used, as it does not exist yet.
91809189
// TODO: Model increment value in VPlan, by turning the recipe into a
91819190
// multi-def and a subclass of VPHeaderPHIRecipe.
9182-
NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9191+
if (getNumOperands() != 5) {
9192+
Value *InductionGEP = GetElementPtrInst::Create(
9193+
State.Builder.getInt8Ty(), NewPointerPhi,
9194+
State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9195+
InductionLoc);
9196+
9197+
NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9198+
}
91839199

91849200
// Create UF many actual address geps that use the pointer
91859201
// phi as base and a vectorized version of the step value
91869202
// (<step*0, ..., step*N>) as offset.
91879203
for (unsigned Part = 0; Part < State.UF; ++Part) {
91889204
Type *VecPhiType = VectorType::get(PhiType, State.VF);
9189-
Value *StartOffsetScalar =
9190-
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9205+
Value *StartOffsetScalar = State.Builder.CreateMul(
9206+
RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
91919207
Value *StartOffset =
91929208
State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
91939209
// Create a vector of consecutive numbers from zero to VF.

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,10 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
821821
// FIXME: Model VF * UF computation completely in VPlan.
822822
VFxUF.setUnderlyingValue(
823823
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
824+
if (VF.getNumUsers() > 0) {
825+
VF.setUnderlyingValue(
826+
createStepForVF(Builder, TripCountV->getType(), State.VF, 1));
827+
}
824828

825829
// When vectorizing the epilogue loop, the canonical induction start value
826830
// needs to be changed from zero to the value after the main vector loop.
@@ -846,6 +850,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
846850
/// Assumes a single pre-header basic-block was created for this. Introduce
847851
/// additional basic-blocks as needed, and fill them all.
848852
void VPlan::execute(VPTransformState *State) {
853+
State->UF = 1;
849854
// Initialize CFG state.
850855
State->CFG.PrevVPBB = nullptr;
851856
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
@@ -890,6 +895,9 @@ void VPlan::execute(VPTransformState *State) {
890895
// Move the last step to the end of the latch block. This ensures
891896
// consistent placement of all induction updates.
892897
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
898+
if (isa<VPWidenIntOrFpInductionRecipe>(&R) && R.getNumOperands() == 4)
899+
Inc->setOperand(0, State->get(R.getOperand(3), 0));
900+
893901
Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
894902
continue;
895903
}
@@ -1254,6 +1262,10 @@ void VPlanIngredient::print(raw_ostream &O) const {
12541262

12551263
template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
12561264

1265+
bool VPValue::isDefinedOutsideVectorRegions() const {
1266+
return !hasDefiningRecipe() || !getDefiningRecipe()->getParent()->getParent();
1267+
}
1268+
12571269
void VPValue::replaceAllUsesWith(VPValue *New) {
12581270
replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
12591271
}

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,8 @@ class VPLiveOut : public VPUser {
701701

702702
PHINode *getPhi() const { return Phi; }
703703

704+
bool onlyFirstPartUsed(const VPValue *Op) const override { return true; }
705+
704706
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
705707
/// Print the VPLiveOut to \p O.
706708
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
@@ -1330,6 +1332,9 @@ class VPInstruction : public VPRecipeWithIRFlags {
13301332
/// Returns true if this VPInstruction produces a scalar value from a vector,
13311333
/// e.g. by performing a reduction or extracting a lane.
13321334
bool isVectorToScalar() const;
1335+
1336+
/// Return the interleave count from the VPInstruction's last argument.
1337+
unsigned getInterleaveCount() const;
13331338
};
13341339

13351340
/// VPWidenRecipe is a recipe for producing a copy of vector type its
@@ -1611,6 +1616,9 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
16111616
isInBounds(), getDebugLoc());
16121617
}
16131618

1619+
/// Return the current part for this vector pointer.
1620+
unsigned getPartForRecipe() const;
1621+
16141622
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
16151623
/// Print the recipe.
16161624
void print(raw_ostream &O, const Twine &Indent,
@@ -1951,6 +1959,9 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
19511959

19521960
/// Returns true, if the phi is part of an in-loop reduction.
19531961
bool isInLoop() const { return IsInLoop; }
1962+
1963+
/// Return the current part for this reduction phi.
1964+
unsigned getPartForRecipe() const;
19541965
};
19551966

19561967
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
@@ -2593,6 +2604,9 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
25932604
/// Generate the canonical scalar induction phi of the vector loop.
25942605
void execute(VPTransformState &State) override;
25952606

2607+
/// Return the current part for this canonical IV phi.
2608+
unsigned getPartForRecipe() const;
2609+
25962610
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
25972611
/// Print the recipe.
25982612
void print(raw_ostream &O, const Twine &Indent,
@@ -2637,7 +2651,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
26372651
~VPActiveLaneMaskPHIRecipe() override = default;
26382652

26392653
VPActiveLaneMaskPHIRecipe *clone() override {
2640-
return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
2654+
auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
2655+
R->addOperand(getOperand(1));
2656+
return R;
26412657
}
26422658

26432659
VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
@@ -2715,6 +2731,9 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
27152731
/// step = <VF*UF, VF*UF, ..., VF*UF>.
27162732
void execute(VPTransformState &State) override;
27172733

2734+
/// Return the current part for this widened canonical IV.
2735+
unsigned getPartForRecipe() const;
2736+
27182737
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
27192738
/// Print the recipe.
27202739
void print(raw_ostream &O, const Twine &Indent,
@@ -2827,6 +2846,9 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
28272846
"Op must be an operand of the recipe");
28282847
return true;
28292848
}
2849+
2850+
/// Return the current part for this scalar step.
2851+
unsigned getPartForRecipe() const;
28302852
};
28312853

28322854
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -3145,6 +3167,8 @@ class VPlan {
31453167
/// Represents the loop-invariant VF * UF of the vector loop region.
31463168
VPValue VFxUF;
31473169

3170+
VPValue VF;
3171+
31483172
/// Holds a mapping between Values and their corresponding VPValue inside
31493173
/// VPlan.
31503174
Value2VPValueTy Value2VPValue;
@@ -3232,6 +3256,7 @@ class VPlan {
32323256

32333257
/// Returns VF * UF of the vector loop region.
32343258
VPValue &getVFxUF() { return VFxUF; }
3259+
VPValue &getVF() { return VF; }
32353260

32363261
void addVF(ElementCount VF) { VFs.insert(VF); }
32373262

@@ -3665,6 +3690,29 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
36653690
return VPI->isVectorToScalar();
36663691
return false;
36673692
}
3693+
3694+
/// Checks if \p V is uniform across all VFs and UFs. It is considered as such
3695+
/// if it is either defined outside the vector region or its operand is known to
3696+
/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
3697+
inline bool isUniformAcrossVFsAndUFs(VPValue *V) {
3698+
if (auto *VPI = dyn_cast_or_null<VPInstruction>(V->getDefiningRecipe())) {
3699+
return VPI ==
3700+
VPI->getParent()->getPlan()->getCanonicalIV()->getBackedgeValue();
3701+
}
3702+
if (isa<VPCanonicalIVPHIRecipe, VPDerivedIVRecipe, VPExpandSCEVRecipe>(V))
3703+
return true;
3704+
if (isa<VPReplicateRecipe>(V) && cast<VPReplicateRecipe>(V)->isUniform() &&
3705+
(isa<LoadInst, StoreInst>(V->getUnderlyingValue())) &&
3706+
all_of(V->getDefiningRecipe()->operands(),
3707+
[](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); }))
3708+
return true;
3709+
3710+
auto *C = dyn_cast_or_null<VPScalarCastRecipe>(V->getDefiningRecipe());
3711+
return C && (C->isDefinedOutsideVectorRegions() ||
3712+
isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
3713+
isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)));
3714+
}
3715+
36683716
} // end namespace vputils
36693717

36703718
} // end namespace llvm

0 commit comments

Comments
 (0)