Skip to content

Commit 2ee7bcb

Browse files
committed
[LV] Optimise users of induction variables in early exit blocks
This is the second of two PRs that attempt to improve the IR generated in the exit blocks of vectorised loops with uncountable early exits. It follows on from PR #128880. In this PR I am improving the generated code for users of induction variables in early exit blocks. This requires using a newly added VPInstruction called FirstActiveLane, which calculates the index of the first active predicate in the mask operand. I have added a new function optimizeEarlyExitInductionUser that is called from optimizeInductionExitUsers when handling users in early exit blocks.
1 parent 7e9a13e commit 2ee7bcb

File tree

4 files changed

+154
-172
lines changed

4 files changed

+154
-172
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,66 @@ static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
737737
return IsWideIVInc() ? WideIV : nullptr;
738738
}
739739

740+
/// Attempts to optimize the induction variable exit values for users in the
/// early exit block.
///
/// Recognizes exit values of the form
///   extractelement(WideIV, first-active-lane(Mask))
/// and replaces the vector extract with scalar arithmetic on the canonical
/// induction variable: CanonicalIV + first-active-lane(Mask), optionally
/// followed by a derived-IV computation for non-canonical inductions.
///
/// \param Plan      The VPlan being transformed.
/// \param TypeInfo  Type analysis used to infer the scalar type of the
///                  newly created first-active-lane instruction.
/// \param PredVPBB  The predecessor block of the exit (the early-exit block)
///                  into which replacement instructions are emitted.
/// \param Op        The exit value operand to optimize.
/// \returns The replacement VPValue, or nullptr if \p Op does not match the
///          pattern or cannot be optimized.
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
                                               VPTypeAnalysis &TypeInfo,
                                               VPBlockBase *PredVPBB,
                                               VPValue *Op) {
  using namespace VPlanPatternMatch;

  // Only handle extracts of the lane selected by first-active-lane(Mask).
  VPValue *Incoming, *Mask;
  if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
                     m_VPValue(Incoming),
                     m_VPInstruction<VPInstruction::FirstActiveLane>(
                         m_VPValue(Mask)))))
    return nullptr;

  // The extracted value must trace back to an optimizable wide IV (possibly
  // its incremented form; see the Incoming != WideIV check below).
  auto *WideIV = getOptimizableIVOf(Incoming);
  if (!WideIV)
    return nullptr;

  // Bail out on truncated int/FP inductions; the arithmetic below is done in
  // the canonical IV's type and does not account for a truncation.
  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  // Calculate the final index.
  VPValue *EndValue = Plan.getCanonicalIV();
  auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
  VPBuilder B(cast<VPBasicBlock>(PredVPBB));

  // Preserve the debug location of the extract being replaced.
  DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
  VPValue *FirstActiveLane =
      B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
  if (CanonicalIVType != TypeInfo.inferScalarType(FirstActiveLane)) {
    // NOTE(review): the Trunc-vs-ZExt choice hard-codes 64 as the width of
    // the first-active-lane result — confirm this matches its actual type.
    Instruction::CastOps CastOp = CanonicalIVType->getScalarSizeInBits() < 64
                                      ? Instruction::Trunc
                                      : Instruction::ZExt;
    FirstActiveLane =
        B.createScalarCast(CastOp, FirstActiveLane, CanonicalIVType, DL);
  }
  // Index of the early-exiting element, relative to the canonical IV.
  EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we need to
  // add the step.
  if (Incoming != WideIV) {
    VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
    EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
  }

  // For non-canonical inductions, map the canonical index onto the user's
  // IV as Start + Index * Step (kind/operation taken from the descriptor).
  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Start = WideIV->getStartValue();
    VPValue *Step = WideIV->getStepValue();
    EndValue = B.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, EndValue, Step);
  }

  return EndValue;
}
799+
740800
/// Attempts to optimize the induction variable exit values for users in the
741801
/// exit block coming from the latch in the original scalar loop.
742802
static VPValue *
@@ -799,12 +859,15 @@ void VPlanTransforms::optimizeInductionExitUsers(
799859
break;
800860

801861
for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
862+
VPValue *Escape = nullptr;
802863
if (PredVPBB == MiddleVPBB)
803-
if (VPValue *Escape = optimizeLatchExitInductionUser(
804-
Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx),
805-
EndValues))
806-
ExitIRI->setOperand(Idx, Escape);
807-
// TODO: Optimize early exit induction users in follow-on patch.
864+
Escape = optimizeLatchExitInductionUser(
865+
Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), EndValues);
866+
else
867+
Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
868+
ExitIRI->getOperand(Idx));
869+
if (Escape)
870+
ExitIRI->setOperand(Idx, Escape);
808871
}
809872
}
810873
}

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@ define i64 @same_exit_block_pre_inc_use1_sve() #1 {
1212
; CHECK: LV: Selecting VF: vscale x 16
1313
; CHECK: Calculating cost of work in exit block vector.early.exit
1414
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
15-
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
16-
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
17-
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
18-
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
15+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: EMIT vp<{{.*}}> = add
16+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: vp<{{.*}}> = DERIVED-IV
17+
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
18+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: EMIT vp<{{.*}}> = add
19+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: vp<{{.*}}> = DERIVED-IV
20+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
1921
entry:
2022
%p1 = alloca [1024 x i8]
2123
%p2 = alloca [1024 x i8]
@@ -51,11 +53,13 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
5153
; CHECK: LV: Selecting VF: 16
5254
; CHECK: Calculating cost of work in exit block vector.early.exit
5355
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
54-
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
55-
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
56-
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
57-
; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
58-
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
56+
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
57+
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
58+
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
59+
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
60+
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
61+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160
62+
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160)
5963
; CHECK-NEXT: LV: Too many memory checks needed.
6064
entry:
6165
%p1 = alloca [1024 x i8]

llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,9 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
2525
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
2626
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
2727
; CHECK-NEXT: [[TMP6:%.*]] = add i64 3, [[N_VEC]]
28-
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
29-
; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 16 x i64> [[TMP7]], splat (i64 1)
30-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i64> splat (i64 3), [[TMP8]]
31-
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
32-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP9]], i64 0
33-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
3428
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
3529
; CHECK: vector.body:
3630
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
37-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
3831
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
3932
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
4033
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP10]]
@@ -48,7 +41,6 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
4841
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 16 x i1> [[TMP15]], splat (i1 true)
4942
; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
5043
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
51-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i64> [[VEC_IND]], [[DOTSPLAT]]
5244
; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
5345
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
5446
; CHECK: middle.split:
@@ -58,7 +50,8 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
5850
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
5951
; CHECK: vector.early.exit:
6052
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP16]], i1 true)
61-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 16 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
53+
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
54+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP20]]
6255
; CHECK-NEXT: br label [[LOOP_END]]
6356
; CHECK: scalar.ph:
6457
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -140,7 +133,8 @@ define i64 @same_exit_block_pre_inc_use4() {
140133
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
141134
; CHECK: vector.early.exit:
142135
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true)
143-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <2 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
136+
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
137+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]]
144138
; CHECK-NEXT: br label [[LOOP_END]]
145139
; CHECK: scalar.ph:
146140
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -197,7 +191,6 @@ define i64 @loop_contains_safe_call() #1 {
197191
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
198192
; CHECK: vector.body:
199193
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
200-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
201194
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
202195
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
203196
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[TMP0]]
@@ -209,7 +202,6 @@ define i64 @loop_contains_safe_call() #1 {
209202
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
210203
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
211204
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
212-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
213205
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
214206
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
215207
; CHECK: middle.split:
@@ -218,7 +210,8 @@ define i64 @loop_contains_safe_call() #1 {
218210
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
219211
; CHECK: vector.early.exit:
220212
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
221-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
213+
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
214+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
222215
; CHECK-NEXT: br label [[LOOP_END]]
223216
; CHECK: scalar.ph:
224217
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -284,16 +277,9 @@ define i64 @loop_contains_safe_div() #1 {
284277
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
285278
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
286279
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
287-
; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
288-
; CHECK-NEXT: [[TMP17:%.*]] = mul <vscale x 4 x i64> [[TMP16]], splat (i64 1)
289-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 3), [[TMP17]]
290-
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
291-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
292-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
293280
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
294281
; CHECK: vector.body:
295282
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
296-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
297283
; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 3, [[INDEX2]]
298284
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX1]], 0
299285
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]]
@@ -305,7 +291,6 @@ define i64 @loop_contains_safe_div() #1 {
305291
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[TMP14]], splat (i1 true)
306292
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
307293
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[INDEX1]]
308-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
309294
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
310295
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
311296
; CHECK: middle.split:
@@ -315,7 +300,8 @@ define i64 @loop_contains_safe_div() #1 {
315300
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
316301
; CHECK: vector.early.exit:
317302
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP15]], i1 true)
318-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
303+
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]]
304+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP16]]
319305
; CHECK-NEXT: br label [[LOOP_END]]
320306
; CHECK: scalar.ph:
321307
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -376,7 +362,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
376362
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
377363
; CHECK: vector.body:
378364
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
379-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
380365
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
381366
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
382367
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]]
@@ -390,7 +375,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
390375
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
391376
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
392377
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
393-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
394378
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
395379
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
396380
; CHECK: middle.split:
@@ -400,7 +384,8 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
400384
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
401385
; CHECK: vector.early.exit:
402386
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
403-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
387+
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
388+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
404389
; CHECK-NEXT: br label [[LOOP_END]]
405390
; CHECK: scalar.ph:
406391
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]

0 commit comments

Comments
 (0)