Skip to content

Commit fd31112

Browse files
committed
[VPlan] Insert Trunc/Exts for reductions directly in VPlan.
Update the code to create Trunc/Ext recipes directly in adjustRecipesForReductions instead of fixing them up later in fixReduction. This explicitly models the required conversions and also makes sure they are generated at the right place (instead of after the exit condition), hence the changes in a few tests.
1 parent dd64c82 commit fd31112

File tree

4 files changed

+47
-44
lines changed

4 files changed

+47
-44
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 35 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3792,8 +3792,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
37923792
State.setDebugLocFrom(I->getDebugLoc());
37933793

37943794
VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3795-
// This is the vector-clone of the value that leaves the loop.
3796-
Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
37973795

37983796
// Before each round, move the insertion point right between
37993797
// the PHIs and the values we are going to write.
@@ -3805,10 +3803,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
38053803
State.setDebugLocFrom(LoopExitInst->getDebugLoc());
38063804

38073805
Type *PhiTy = OrigPhi->getType();
3808-
3809-
VPBasicBlock *LatchVPBB =
3810-
PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3811-
BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
38123806
// If tail is folded by masking, the vector value to leave the loop should be
38133807
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
38143808
// instead of the former. For an inloop reduction the reduction will already
@@ -3834,23 +3828,12 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
38343828
// then extend the loop exit value to enable InstCombine to evaluate the
38353829
// entire expression in the smaller type.
38363830
if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3837-
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3838-
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3839-
Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3840-
for (unsigned Part = 0; Part < UF; ++Part) {
3841-
Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3842-
Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3843-
: Builder.CreateZExt(Trunc, VecTy);
3844-
for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3845-
if (U != Trunc) {
3846-
U->replaceUsesOfWith(RdxParts[Part], Extnd);
3847-
RdxParts[Part] = Extnd;
3848-
}
3849-
}
38503831
Builder.SetInsertPoint(LoopMiddleBlock,
38513832
LoopMiddleBlock->getFirstInsertionPt());
3852-
for (unsigned Part = 0; Part < UF; ++Part)
3833+
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3834+
for (unsigned Part = 0; Part < UF; ++Part) {
38533835
RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3836+
}
38543837
}
38553838

38563839
// Reduce all of the unrolled parts into a single vector.
@@ -9155,35 +9138,55 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91559138
PreviousLink = RedRecipe;
91569139
}
91579140
}
9158-
9159-
// If tail is folded by masking, introduce selects between the phi
9160-
// and the live-out instruction of each reduction, at the beginning of the
9161-
// dedicated latch block.
9162-
if (CM.foldTailByMasking()) {
91639141
Builder.setInsertPoint(&*LatchVPBB->begin());
91649142
for (VPRecipeBase &R :
91659143
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9166-
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9167-
if (!PhiR || PhiR->isInLoop())
9168-
continue;
9169-
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9144+
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9145+
if (!PhiR || PhiR->isInLoop())
9146+
continue;
9147+
9148+
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9149+
auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe();
9150+
// If tail is folded by masking, introduce selects between the phi
9151+
// and the live-out instruction of each reduction, at the beginning of the
9152+
// dedicated latch block.
9153+
if (CM.foldTailByMasking()) {
91709154
VPValue *Cond =
91719155
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
91729156
VPValue *Red = PhiR->getBackedgeValue();
91739157
assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
91749158
"reduction recipe must be defined before latch");
91759159
FastMathFlags FMFs = RdxDesc.getFastMathFlags();
91769160
Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9177-
auto *Select =
9161+
Result =
91789162
PhiTy->isFloatingPointTy()
91799163
? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
91809164
: new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
9181-
Select->insertBefore(&*Builder.getInsertPoint());
9165+
Result->insertBefore(&*Builder.getInsertPoint());
91829166
if (PreferPredicatedReductionSelect ||
91839167
TTI.preferPredicatedReductionSelect(
91849168
PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
91859169
TargetTransformInfo::ReductionFlags()))
9186-
PhiR->setOperand(1, Select);
9170+
PhiR->setOperand(1, Result->getVPSingleValue());
9171+
}
9172+
// If the vector reduction can be performed in a smaller type, we truncate
9173+
// then extend the loop exit value to enable InstCombine to evaluate the
9174+
// entire expression in the smaller type.
9175+
Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9176+
if (PhiTy != RdxDesc.getRecurrenceType()) {
9177+
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9178+
Type *RdxTy = RdxDesc.getRecurrenceType();
9179+
auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc,
9180+
Result->getVPSingleValue(), RdxTy);
9181+
auto *Extnd =
9182+
RdxDesc.isSigned()
9183+
? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9184+
: new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9185+
9186+
Trunc->insertAfter(Result);
9187+
Extnd->insertAfter(Trunc);
9188+
Result->getVPSingleValue()->replaceAllUsesWith(Extnd);
9189+
Trunc->setOperand(0, Result->getVPSingleValue());
91879190
}
91889191
}
91899192

llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -207,10 +207,10 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
207207
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
208208
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
209209
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP1]], [[TMP4]]
210-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
211-
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
212210
; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
213211
; CHECK-NEXT: [[TMP8]] = zext <4 x i16> [[TMP7]] to <4 x i32>
212+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
213+
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
214214
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
215215
; CHECK: middle.block:
216216
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
@@ -234,10 +234,10 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
234234
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i16>, ptr [[TMP16]], align 2
235235
; CHECK-NEXT: [[TMP17:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
236236
; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP14]], [[TMP17]]
237-
; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 4
238-
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 256
239237
; CHECK-NEXT: [[TMP20:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
240238
; CHECK-NEXT: [[TMP21]] = zext <4 x i16> [[TMP20]] to <4 x i32>
239+
; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 4
240+
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 256
241241
; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
242242
; CHECK: vec.epilog.middle.block:
243243
; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[TMP21]] to <4 x i16>

llvm/test/Transforms/LoopVectorize/reduction-small-size.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
2222
; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2323
; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 255, i32 255, i32 255, i32 255>
2424
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT2]]
25-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
26-
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
2725
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8>
2826
; CHECK-NEXT: [[TMP5]] = zext <4 x i8> [[TMP4]] to <4 x i32>
27+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
28+
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
2929
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
3030
; CHECK: middle.block:
3131
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i8>
@@ -99,10 +99,10 @@ define i32 @PR35734(i32 %x, i32 %y) {
9999
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
100100
; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
101101
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], <i32 -1, i32 -1, i32 -1, i32 -1>
102-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
103-
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
104102
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i1>
105103
; CHECK-NEXT: [[TMP7]] = sext <4 x i1> [[TMP6]] to <4 x i32>
104+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
105+
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
106106
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
107107
; CHECK: middle.block:
108108
; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i1>

llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
1717
; CHECK-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
1818
; CHECK-NEXT: [[TMP28:%.*]] = add <vscale x 8 x i32> [[TMP14]], [[TMP26]]
1919
; CHECK-NEXT: [[TMP29:%.*]] = add <vscale x 8 x i32> [[TMP15]], [[TMP27]]
20+
; CHECK-NEXT: [[TMP33:%.*]] = trunc <vscale x 8 x i32> [[TMP28]] to <vscale x 8 x i8>
21+
; CHECK-NEXT: [[TMP35:%.*]] = trunc <vscale x 8 x i32> [[TMP29]] to <vscale x 8 x i8>
22+
; CHECK-NEXT: [[TMP34]] = zext <vscale x 8 x i8> [[TMP33]] to <vscale x 8 x i32>
23+
; CHECK-NEXT: [[TMP36]] = zext <vscale x 8 x i8> [[TMP35]] to <vscale x 8 x i32>
2024
; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
2125
; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
2226
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP31]]
2327
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], {{%.*}}
24-
; CHECK-NEXT: [[TMP33:%.*]] = trunc <vscale x 8 x i32> [[TMP28]] to <vscale x 8 x i8>
25-
; CHECK-NEXT: [[TMP34]] = zext <vscale x 8 x i8> [[TMP33]] to <vscale x 8 x i32>
26-
; CHECK-NEXT: [[TMP35:%.*]] = trunc <vscale x 8 x i32> [[TMP29]] to <vscale x 8 x i8>
27-
; CHECK-NEXT: [[TMP36]] = zext <vscale x 8 x i8> [[TMP35]] to <vscale x 8 x i32>
2828
; CHECK: middle.block:
2929
; CHECK-NEXT: [[TMP37:%.*]] = trunc <vscale x 8 x i32> [[TMP34]] to <vscale x 8 x i8>
3030
; CHECK-NEXT: [[TMP38:%.*]] = trunc <vscale x 8 x i32> [[TMP36]] to <vscale x 8 x i8>

0 commit comments

Comments
 (0)