Skip to content

Commit e536698

Browse files
committed
[VPlan] Implement interleaving as VPlan-to-VPlan transform.
This patch implements explicit interleaving as a VPlan transform, thus simplifying VPTransform state (no need to store unrolled parts) as well as recipe execution (no need to generate code for multiple parts in each recipe). It also allows for more general optimizations (e.g. avoiding generating code for recipes that are uniform across parts). In the initial implementation, a number of recipes still take the unrolled part as an additional, optional argument, if their execution depends on the unrolled part. The computation of start/step values for scalable inductions changed slightly. Previously the step would be computed as a scalar and then splatted; now vscale gets splatted and multiplied by the step in a vector mul. Depends on #93396.
1 parent 52d29eb commit e536698

29 files changed

+1368
-1067
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,15 @@ class VPBuilder {
161161
return tryInsertInstruction(
162162
new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
163163
}
164+
165+
VPInstruction *createFPOp(unsigned Opcode,
166+
std::initializer_list<VPValue *> Operands,
167+
DebugLoc DL = {}, const Twine &Name = "",
168+
FastMathFlags FMFs = {}) {
169+
auto *Op = new VPInstruction(Opcode, Operands, FMFs, DL, Name);
170+
return tryInsertInstruction(Op);
171+
}
172+
164173
VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
165174
const Twine &Name = "") {
166175
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 167 additions & 204 deletions
Large diffs are not rendered by default.

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 84 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -223,47 +223,47 @@ VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
223223
LVer(nullptr),
224224
TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}
225225

226-
Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
226+
Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) {
227227
if (Def->isLiveIn())
228228
return Def->getLiveInIRValue();
229229

230-
if (hasScalarValue(Def, Instance)) {
231-
return Data
232-
.PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)];
230+
if (hasScalarValue(Def, Lane)) {
231+
return Data.Scalars[Def][Lane.mapToCacheIndex(VF)];
233232
}
234233

235-
assert(hasVectorValue(Def, Instance.Part));
236-
auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
234+
assert(hasVectorValue(Def));
235+
auto *VecPart = Data.Output[Def];
237236
if (!VecPart->getType()->isVectorTy()) {
238-
assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar");
237+
assert(Lane.isFirstLane() && "cannot get lane > 0 for scalar");
239238
return VecPart;
240239
}
241240
// TODO: Cache created scalar values.
242-
Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
243-
auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
244-
// set(Def, Extract, Instance);
241+
Value *LaneV = Lane.getAsRuntimeExpr(Builder, VF);
242+
auto *Extract = Builder.CreateExtractElement(VecPart, LaneV);
243+
// set(Def, Extract, Lane);
245244
return Extract;
246245
}
247246

248-
Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
247+
Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
249248
if (NeedsScalar) {
250-
assert((VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def, Part) ||
251-
(hasScalarValue(Def, VPIteration(Part, 0)) &&
252-
Data.PerPartScalars[Def][Part].size() == 1)) &&
253-
"Trying to access a single scalar per part but has multiple scalars "
254-
"per part.");
255-
return get(Def, VPIteration(Part, 0));
249+
assert(
250+
(VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def) ||
251+
(hasScalarValue(Def, VPLane(0)) && Data.Scalars[Def].size() == 1)) &&
252+
"Trying to access a single scalar per part but has multiple scalars "
253+
"per part.");
254+
return get(Def, VPLane(0));
256255
}
257256

258257
// If Values have been set for this Def return the one relevant for \p Part.
259-
if (hasVectorValue(Def, Part))
260-
return Data.PerPartOutput[Def][Part];
258+
if (hasVectorValue(Def))
259+
return Data.Output[Def];
261260

262261
auto GetBroadcastInstrs = [this, Def](Value *V) {
263262
bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
264263
if (VF.isScalar())
265264
return V;
266-
// Place the code for broadcasting invariant variables in the new preheader.
265+
// Place the code for broadcasting invariant variables in the new
266+
// preheader.
267267
IRBuilder<>::InsertPointGuard Guard(Builder);
268268
if (SafeToHoist) {
269269
BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
@@ -272,56 +272,55 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
272272
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
273273
}
274274

275-
// Place the code for broadcasting invariant variables in the new preheader.
276-
// Broadcast the scalar into all locations in the vector.
275+
// Place the code for broadcasting invariant variables in the new
276+
// preheader. Broadcast the scalar into all locations in the vector.
277277
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
278278

279279
return Shuf;
280280
};
281281

282-
if (!hasScalarValue(Def, {Part, 0})) {
282+
if (!hasScalarValue(Def, VPLane(0))) {
283283
assert(Def->isLiveIn() && "expected a live-in");
284-
if (Part != 0)
285-
return get(Def, 0);
286284
Value *IRV = Def->getLiveInIRValue();
287285
Value *B = GetBroadcastInstrs(IRV);
288-
set(Def, B, Part);
286+
set(Def, B);
289287
return B;
290288
}
291289

292-
Value *ScalarValue = get(Def, {Part, 0});
290+
Value *ScalarValue = get(Def, VPLane(0));
293291
// If we aren't vectorizing, we can just copy the scalar map values over
294292
// to the vector map.
295293
if (VF.isScalar()) {
296-
set(Def, ScalarValue, Part);
294+
set(Def, ScalarValue);
297295
return ScalarValue;
298296
}
299297

300298
bool IsUniform = vputils::isUniformAfterVectorization(Def);
301299

302-
unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
300+
VPLane LastLane = VPLane(IsUniform ? 0 : VF.getKnownMinValue() - 1);
303301
// Check if there is a scalar value for the selected lane.
304-
if (!hasScalarValue(Def, {Part, LastLane})) {
305-
// At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
306-
// VPExpandSCEVRecipes can also be uniform.
302+
if (!hasScalarValue(Def, LastLane)) {
303+
// At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes
304+
// and VPExpandSCEVRecipes can also be uniform.
307305
assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
308306
isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
309307
isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
310308
"unexpected recipe found to be invariant");
311309
IsUniform = true;
312-
LastLane = 0;
310+
LastLane = VPLane(0);
313311
}
314312

315-
auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
316-
// Set the insert point after the last scalarized instruction or after the
317-
// last PHI, if LastInst is a PHI. This ensures the insertelement sequence
318-
// will directly follow the scalar definitions.
319313
auto OldIP = Builder.saveIP();
320-
auto NewIP =
321-
isa<PHINode>(LastInst)
322-
? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
323-
: std::next(BasicBlock::iterator(LastInst));
324-
Builder.SetInsertPoint(&*NewIP);
314+
if (auto *LastInst = dyn_cast<Instruction>(get(Def, LastLane))) {
315+
// Set the insert point after the last scalarized instruction or after the
316+
// last PHI, if LastInst is a PHI. This ensures the insertelement sequence
317+
// will directly follow the scalar definitions.
318+
auto NewIP =
319+
isa<PHINode>(LastInst)
320+
? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
321+
: std::next(BasicBlock::iterator(LastInst));
322+
Builder.SetInsertPoint(&*NewIP);
323+
}
325324

326325
// However, if we are vectorizing, we need to construct the vector values.
327326
// If the value is known to be uniform after vectorization, we can just
@@ -332,15 +331,16 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
332331
Value *VectorValue = nullptr;
333332
if (IsUniform) {
334333
VectorValue = GetBroadcastInstrs(ScalarValue);
335-
set(Def, VectorValue, Part);
334+
set(Def, VectorValue);
336335
} else {
337336
// Initialize packing with insertelements to start from undef.
338337
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
339-
Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
340-
set(Def, Undef, Part);
338+
Value *Undef =
339+
PoisonValue::get(VectorType::get(ScalarValue->getType(), VF));
340+
set(Def, Undef);
341341
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
342-
packScalarIntoVectorValue(Def, {Part, Lane});
343-
VectorValue = get(Def, Part);
342+
packScalarIntoVectorValue(Def, Lane);
343+
VectorValue = get(Def);
344344
}
345345
Builder.restoreIP(OldIP);
346346
return VectorValue;
@@ -392,12 +392,12 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
392392
}
393393

394394
void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
395-
const VPIteration &Instance) {
396-
Value *ScalarInst = get(Def, Instance);
397-
Value *VectorValue = get(Def, Instance.Part);
398-
VectorValue = Builder.CreateInsertElement(
399-
VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(Builder, VF));
400-
set(Def, VectorValue, Instance.Part);
395+
const VPLane &Lane) {
396+
Value *ScalarInst = get(Def, Lane);
397+
Value *VectorValue = get(Def);
398+
VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
399+
Lane.getAsRuntimeExpr(Builder, VF));
400+
set(Def, VectorValue);
401401
}
402402

403403
BasicBlock *
@@ -453,7 +453,7 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
453453
}
454454

455455
void VPBasicBlock::execute(VPTransformState *State) {
456-
bool Replica = State->Instance && !State->Instance->isFirstIteration();
456+
bool Replica = State->Lane && !State->Lane->isFirstLane();
457457
VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
458458
VPBlockBase *SingleHPred = nullptr;
459459
BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
@@ -724,27 +724,24 @@ void VPRegionBlock::execute(VPTransformState *State) {
724724
return;
725725
}
726726

727-
assert(!State->Instance && "Replicating a Region with non-null instance.");
727+
assert(!State->Lane && "Replicating a Region with non-null instance.");
728728

729729
// Enter replicating mode.
730-
State->Instance = VPIteration(0, 0);
731-
732-
for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
733-
State->Instance->Part = Part;
734-
assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
735-
for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
736-
++Lane) {
737-
State->Instance->Lane = VPLane(Lane, VPLane::Kind::First);
738-
// Visit the VPBlocks connected to \p this, starting from it.
739-
for (VPBlockBase *Block : RPOT) {
740-
LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
741-
Block->execute(State);
742-
}
730+
State->Lane = VPLane(0);
731+
732+
assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
733+
for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
734+
++Lane) {
735+
State->Lane = VPLane(Lane, VPLane::Kind::First);
736+
// Visit the VPBlocks connected to \p this, starting from it.
737+
for (VPBlockBase *Block : RPOT) {
738+
LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
739+
Block->execute(State);
743740
}
744741
}
745742

746743
// Exit replicating mode.
747-
State->Instance.reset();
744+
State->Lane.reset();
748745
}
749746

750747
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -816,10 +813,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
816813
// FIXME: Model VF * UF computation completely in VPlan.
817814
VFxUF.setUnderlyingValue(
818815
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
816+
if (VF.getNumUsers() > 0) {
817+
VF.setUnderlyingValue(
818+
createStepForVF(Builder, TripCountV->getType(), State.VF, 1));
819+
}
819820

820821
// When vectorizing the epilogue loop, the canonical induction start value
821822
// needs to be changed from zero to the value after the main vector loop.
822-
// FIXME: Improve modeling for canonical IV start values in the epilogue loop.
823+
// FIXME: Improve modeling for canonical IV start values in the epilogue
824+
// loop.
823825
if (CanonicalIVStartValue) {
824826
VPValue *VPV = getOrAddLiveIn(CanonicalIVStartValue);
825827
auto *IV = getCanonicalIV();
@@ -871,12 +873,12 @@ void VPlan::execute(VPTransformState *State) {
871873
isa<VPWidenIntOrFpInductionRecipe>(&R)) {
872874
PHINode *Phi = nullptr;
873875
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
874-
Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
876+
Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
875877
} else {
876878
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
877879
assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
878880
"recipe generating only scalars should have been replaced");
879-
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
881+
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
880882
Phi = cast<PHINode>(GEP->getPointerOperand());
881883
}
882884

@@ -885,6 +887,9 @@ void VPlan::execute(VPTransformState *State) {
885887
// Move the last step to the end of the latch block. This ensures
886888
// consistent placement of all induction updates.
887889
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
890+
if (isa<VPWidenIntOrFpInductionRecipe>(&R) && R.getNumOperands() == 4)
891+
Inc->setOperand(0, State->get(R.getOperand(3)));
892+
888893
Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
889894
continue;
890895
}
@@ -894,24 +899,13 @@ void VPlan::execute(VPTransformState *State) {
894899
// only a single part is generated, which provides the last part from the
895900
// previous iteration. For non-ordered reductions all UF parts are
896901
// generated.
897-
bool SinglePartNeeded =
898-
isa<VPCanonicalIVPHIRecipe>(PhiR) ||
899-
isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
900-
(isa<VPReductionPHIRecipe>(PhiR) &&
901-
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
902902
bool NeedsScalar =
903903
isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
904904
(isa<VPReductionPHIRecipe>(PhiR) &&
905905
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
906-
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
907-
908-
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
909-
Value *Phi = State->get(PhiR, Part, NeedsScalar);
910-
Value *Val =
911-
State->get(PhiR->getBackedgeValue(),
912-
SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
913-
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
914-
}
906+
Value *Phi = State->get(PhiR, NeedsScalar);
907+
Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
908+
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
915909
}
916910

917911
State->CFG.DTU.flush();
@@ -1249,6 +1243,10 @@ void VPlanIngredient::print(raw_ostream &O) const {
12491243

12501244
template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
12511245

1246+
bool VPValue::isDefinedOutsideVectorRegions() const {
1247+
return !hasDefiningRecipe() || !getDefiningRecipe()->getParent()->getParent();
1248+
}
1249+
12521250
void VPValue::replaceAllUsesWith(VPValue *New) {
12531251
replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
12541252
}

0 commit comments

Comments
 (0)