Skip to content

Commit 2bdc1a1

Browse files
authored
[LV] Use frozen start value for FindLastIV if needed. (#132691)
When the epilogue is vectorized, FindLastIV introduces multiple uses of the start value, where the original source had only a single use. Each use of undef may produce a different result, so introducing multiple uses can produce incorrect results when the input is undef/poison. If the start value may be undef or poison, freeze it and use the frozen value, which will be the same at all uses.

See the following scenarios in Alive2:
* Both main and epilogue vector loops execute, go to exit block: https://alive2.llvm.org/ce/z/_TSvRr
* Both main and epilogue vector loops execute, go to scalar loop: https://alive2.llvm.org/ce/z/CsPj5v
* Only epilogue vector loop executes, go to exit block: https://alive2.llvm.org/ce/z/5XqkNV
* Only epilogue vector loop executes, go to scalar loop: https://alive2.llvm.org/ce/z/JUpqRN

The latter two scenarios show that the resume phi must also use the frozen value. That means we cannot freeze in the preheader. We could move the freeze to the main iteration count check, but that would be a bit fragile to find, and other transforms can sink the freeze if needed.

Depends on #132689 and #132690.

Fixes #126836

PR: #132691
1 parent d6c076e commit 2bdc1a1

File tree

4 files changed

+95
-36
lines changed

4 files changed

+95
-36
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 73 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7659,14 +7659,17 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
76597659
} else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
76607660
RdxDesc.getRecurrenceKind())) {
76617661
using namespace llvm::PatternMatch;
7662-
Value *Cmp, *OrigResumeV;
7662+
Value *Cmp, *OrigResumeV, *CmpOp;
76637663
bool IsExpectedPattern =
76647664
match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
76657665
m_Specific(RdxDesc.getSentinelValue()),
76667666
m_Value(OrigResumeV))) &&
7667-
match(Cmp,
7668-
m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7669-
m_Specific(RdxDesc.getRecurrenceStartValue())));
7667+
(match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7668+
m_Value(CmpOp))) &&
7669+
(match(CmpOp,
7670+
m_Freeze(m_Specific(RdxDesc.getRecurrenceStartValue()))) ||
7671+
(CmpOp == RdxDesc.getRecurrenceStartValue() &&
7672+
isGuaranteedNotToBeUndefOrPoison(CmpOp))));
76707673
assert(IsExpectedPattern && "Unexpected reduction resume pattern");
76717674
(void)IsExpectedPattern;
76727675
MainResumeValue = OrigResumeV;
@@ -10374,6 +10377,36 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
1037410377
VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan);
1037510378

1037610379
using namespace VPlanPatternMatch;
10380+
// When vectorizing the epilogue, FindLastIV reductions can introduce multiple
10381+
// uses of undef/poison. If the reduction start value may be undef or poison
10382+
// it needs to be frozen and the frozen start has to be used when computing
10383+
// the reduction result. We also need to use the frozen value in the resume
10384+
// phi generated by the main vector loop, as this is also used to compute the
10385+
// reduction result after the epilogue vector loop.
10386+
auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
10387+
bool UpdateResumePhis) {
10388+
VPBuilder Builder(Plan.getEntry());
10389+
for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
10390+
auto *VPI = dyn_cast<VPInstruction>(&R);
10391+
if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindLastIVResult)
10392+
continue;
10393+
VPValue *OrigStart = VPI->getOperand(1);
10394+
if (isGuaranteedNotToBeUndefOrPoison(OrigStart->getLiveInIRValue()))
10395+
continue;
10396+
VPInstruction *Freeze =
10397+
Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
10398+
VPI->setOperand(1, Freeze);
10399+
if (UpdateResumePhis)
10400+
OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
10401+
return Freeze != &U && isa<VPInstruction>(&U) &&
10402+
cast<VPInstruction>(&U)->getOpcode() ==
10403+
VPInstruction::ResumePhi;
10404+
});
10405+
}
10406+
};
10407+
AddFreezeForFindLastIVReductions(MainPlan, true);
10408+
AddFreezeForFindLastIVReductions(EpiPlan, false);
10409+
1037710410
VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
1037810411
VPValue *VectorTC = &MainPlan.getVectorTripCount();
1037910412
// If there is a suitable resume value for the canonical induction in the
@@ -10401,24 +10434,7 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1040110434
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
1040210435
Header->setName("vec.epilog.vector.body");
1040310436

10404-
// Re-use the trip count and steps expanded for the main loop, as
10405-
// skeleton creation needs it as a value that dominates both the scalar
10406-
// and vector epilogue loops
10407-
// TODO: This is a workaround needed for epilogue vectorization and it
10408-
// should be removed once induction resume value creation is done
10409-
// directly in VPlan.
10410-
for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10411-
auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10412-
if (!ExpandR)
10413-
continue;
10414-
auto *ExpandedVal =
10415-
Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10416-
ExpandR->replaceAllUsesWith(ExpandedVal);
10417-
if (Plan.getTripCount() == ExpandR)
10418-
Plan.resetTripCount(ExpandedVal);
10419-
ExpandR->eraseFromParent();
10420-
}
10421-
10437+
DenseMap<Value *, Value *> ToFrozen;
1042210438
// Ensure that the start values for all header phi recipes are updated before
1042310439
// vectorizing the epilogue loop.
1042410440
for (VPRecipeBase &R : Header->phis()) {
@@ -10484,6 +10500,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1048410500
ResumeV =
1048510501
Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
1048610502
} else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10503+
ToFrozen[RdxDesc.getRecurrenceStartValue()] =
10504+
cast<PHINode>(ResumeV)->getIncomingValueForBlock(
10505+
EPI.MainLoopIterationCountCheck);
10506+
1048710507
// VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
1048810508
// to the resume value. The resume value is adjusted to the sentinel
1048910509
// value when the final value from the main vector loop equals the start
@@ -10492,8 +10512,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1049210512
// variable.
1049310513
BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
1049410514
IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
10495-
Value *Cmp =
10496-
Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10515+
Value *Cmp = Builder.CreateICmpEQ(
10516+
ResumeV, ToFrozen[RdxDesc.getRecurrenceStartValue()]);
1049710517
ResumeV =
1049810518
Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
1049910519
}
@@ -10509,6 +10529,35 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1050910529
VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
1051010530
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
1051110531
}
10532+
10533+
// For some VPValues in the epilogue plan we must re-use the generated IR
10534+
// values from the main plan. Replace them with live-in VPValues.
10535+
// TODO: This is a workaround needed for epilogue vectorization and it
10536+
// should be removed once induction resume value creation is done
10537+
// directly in VPlan.
10538+
for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10539+
// Re-use frozen values from the main plan for Freeze VPInstructions in the
10540+
// epilogue plan. This ensures all users use the same frozen value.
10541+
auto *VPI = dyn_cast<VPInstruction>(&R);
10542+
if (VPI && VPI->getOpcode() == Instruction::Freeze) {
10543+
VPI->replaceAllUsesWith(Plan.getOrAddLiveIn(
10544+
ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
10545+
continue;
10546+
}
10547+
10548+
// Re-use the trip count and steps expanded for the main loop, as
10549+
// skeleton creation needs it as a value that dominates both the scalar
10550+
// and vector epilogue loops
10551+
auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10552+
if (!ExpandR)
10553+
continue;
10554+
auto *ExpandedVal =
10555+
Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10556+
ExpandR->replaceAllUsesWith(ExpandedVal);
10557+
if (Plan.getTripCount() == ExpandR)
10558+
Plan.resetTripCount(ExpandedVal);
10559+
ExpandR->eraseFromParent();
10560+
}
1051210561
}
1051310562

1051410563
// Generate bypass values from the additional bypass block. Note that when the

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
423423
if (isSingleScalar() || isVectorToScalar())
424424
return true;
425425
switch (Opcode) {
426+
case Instruction::Freeze:
426427
case Instruction::ICmp:
427428
case Instruction::PHI:
428429
case Instruction::Select:
@@ -474,6 +475,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
474475
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
475476
return Builder.CreateExtractElement(Vec, Idx, Name);
476477
}
478+
case Instruction::Freeze: {
479+
Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
480+
return Builder.CreateFreeze(Op, Name);
481+
}
477482
case Instruction::ICmp: {
478483
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
479484
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
@@ -909,6 +914,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
909914
return false;
910915
switch (getOpcode()) {
911916
case Instruction::ExtractElement:
917+
case Instruction::Freeze:
912918
case Instruction::ICmp:
913919
case Instruction::Select:
914920
case VPInstruction::AnyOf:
@@ -941,6 +947,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
941947
case Instruction::ICmp:
942948
case Instruction::Select:
943949
case Instruction::Or:
950+
case Instruction::Freeze:
944951
// TODO: Cover additional opcodes.
945952
return vputils::onlyFirstLaneUsed(this);
946953
case VPInstruction::ActiveLaneMask:

llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
99
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
1010
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
1111
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8
12+
; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]]
1213
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
1314
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
1415
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 32
@@ -42,7 +43,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
4243
; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
4344
; CHECK-NEXT: [[TMP13:%.*]] = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[RDX_MINMAX]])
4445
; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP13]], -128
45-
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[START]]
46+
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[FR]]
4647
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
4748
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
4849
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -53,8 +54,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
5354
; CHECK: [[VEC_EPILOG_PH]]:
5455
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
5556
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
56-
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
57-
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]]
57+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
58+
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]]
5859
; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i8 -128, i8 [[BC_MERGE_RDX]]
5960
; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i32 [[TMP2]], 8
6061
; CHECK-NEXT: [[N_VEC5:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF4]]
@@ -82,7 +83,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
8283
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
8384
; CHECK-NEXT: [[TMP22:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP20]])
8485
; CHECK-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp ne i8 [[TMP22]], -128
85-
; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[START]]
86+
; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[FR]]
8687
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]]
8788
; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
8889
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
@@ -128,6 +129,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
128129
; CHECK-NEXT: [[N_POS:%.*]] = icmp sgt i32 [[N]], 0
129130
; CHECK-NEXT: call void @llvm.assume(i1 [[N_POS]])
130131
; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64
132+
; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[START]]
131133
; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[N_EXT]], 1
132134
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
133135
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
@@ -166,7 +168,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
166168
; CHECK-NEXT: [[RDX_MINMAX6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX5]], <4 x i32> [[TMP6]])
167169
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX6]])
168170
; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP8]], -2147483648
169-
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[START]]
171+
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[FR]]
170172
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
171173
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
172174
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -175,8 +177,8 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
175177
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
176178
; CHECK: [[VEC_EPILOG_PH]]:
177179
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
178-
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
179-
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[START]]
180+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
181+
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[FR]]
180182
; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -2147483648, i32 [[BC_MERGE_RDX]]
181183
; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP0]], 4
182184
; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF7]]
@@ -203,7 +205,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
203205
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
204206
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP14]])
205207
; CHECK-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp ne i32 [[TMP16]], -2147483648
206-
; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[START]]
208+
; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[FR]]
207209
; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]]
208210
; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
209211
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:

llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
217217
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
218218
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
219219
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4
220+
; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]]
220221
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
221222
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
222223
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 4
@@ -243,7 +244,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
243244
; CHECK: [[MIDDLE_BLOCK]]:
244245
; CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP8]])
245246
; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i8 [[TMP10]], -128
246-
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[START]]
247+
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[FR]]
247248
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
248249
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
249250
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -254,8 +255,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
254255
; CHECK: [[VEC_EPILOG_PH]]:
255256
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
256257
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
257-
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
258-
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]]
258+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
259+
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]]
259260
; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i8 -128, i8 [[BC_MERGE_RDX]]
260261
; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP2]], 4
261262
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF2]]
@@ -283,7 +284,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
283284
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
284285
; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP17]])
285286
; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP19]], -128
286-
; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[START]]
287+
; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[FR]]
287288
; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]]
288289
; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
289290
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:

0 commit comments

Comments
 (0)