Skip to content

Commit 693a926

Browse files
committed
[LV] Optimise users of induction variables in early exit blocks
This is the second of two PRs that attempt to improve the IR generated in the exit blocks of vectorised loops with uncountable early exits. It follows on from PR #128880. In this PR I am improving the generated code for users of induction variables in early exit blocks. This requires using a newly added VPInstruction called FirstActiveLane, which calculates the index of the first active predicate in the mask operand. I have added a new function optimizeEarlyExitInductionUser that is called from optimizeInductionExitUsers when handling users in early exit blocks.
1 parent 1dab4a1 commit 693a926

File tree

4 files changed

+160
-182
lines changed

4 files changed

+160
-182
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,66 @@ static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
741741
return IsWideIVInc() ? WideIV : nullptr;
742742
}
743743

744+
/// Attempts to optimize the induction variable exit values for users in the
745+
/// early exit block.
746+
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
747+
VPTypeAnalysis &TypeInfo,
748+
VPBlockBase *PredVPBB,
749+
VPValue *Op) {
750+
using namespace VPlanPatternMatch;
751+
752+
VPValue *Incoming, *Mask;
753+
if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
754+
m_VPValue(Incoming),
755+
m_VPInstruction<VPInstruction::FirstActiveLane>(
756+
m_VPValue(Mask)))))
757+
return nullptr;
758+
759+
auto *WideIV = getOptimizableIVOf(Incoming);
760+
if (!WideIV)
761+
return nullptr;
762+
763+
auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
764+
if (WideIntOrFp && WideIntOrFp->getTruncInst())
765+
return nullptr;
766+
767+
// Calculate the final index.
768+
VPValue *EndValue = Plan.getCanonicalIV();
769+
auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
770+
VPBuilder B(cast<VPBasicBlock>(PredVPBB));
771+
772+
DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
773+
VPValue *FirstActiveLane =
774+
B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
775+
if (CanonicalIVType != TypeInfo.inferScalarType(FirstActiveLane)) {
776+
Instruction::CastOps CastOp = CanonicalIVType->getScalarSizeInBits() < 64
777+
? Instruction::Trunc
778+
: Instruction::ZExt;
779+
FirstActiveLane =
780+
B.createScalarCast(CastOp, FirstActiveLane, CanonicalIVType, DL);
781+
}
782+
EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
783+
784+
// `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
785+
// changed it means the exit is using the incremented value, so we need to
786+
// add the step.
787+
if (Incoming != WideIV) {
788+
VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
789+
EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
790+
}
791+
792+
if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
793+
const InductionDescriptor &ID = WideIV->getInductionDescriptor();
794+
VPValue *Start = WideIV->getStartValue();
795+
VPValue *Step = WideIV->getStepValue();
796+
EndValue = B.createDerivedIV(
797+
ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
798+
Start, EndValue, Step);
799+
}
800+
801+
return EndValue;
802+
}
803+
744804
/// Attempts to optimize the induction variable exit values for users in the
745805
/// exit block coming from the latch in the original scalar loop.
746806
static VPValue *
@@ -803,12 +863,15 @@ void VPlanTransforms::optimizeInductionExitUsers(
803863
break;
804864

805865
for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
866+
VPValue *Escape = nullptr;
806867
if (PredVPBB == MiddleVPBB)
807-
if (VPValue *Escape = optimizeLatchExitInductionUser(
808-
Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx),
809-
EndValues))
810-
ExitIRI->setOperand(Idx, Escape);
811-
// TODO: Optimize early exit induction users in follow-on patch.
868+
Escape = optimizeLatchExitInductionUser(
869+
Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), EndValues);
870+
else
871+
Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
872+
ExitIRI->getOperand(Idx));
873+
if (Escape)
874+
ExitIRI->setOperand(Idx, Escape);
812875
}
813876
}
814877
}

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,12 @@ define i64 @same_exit_block_pre_inc_use1_sve() #1 {
1111
; CHECK: LV: Selecting VF: vscale x 16
1212
; CHECK: Calculating cost of work in exit block vector.early.exit
1313
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
14-
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
15-
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
16-
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
17-
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
14+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: EMIT vp<{{.*}}> = add
15+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: vp<{{.*}}> = DERIVED-IV
16+
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
17+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: EMIT vp<{{.*}}> = add
18+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: vp<{{.*}}> = DERIVED-IV
19+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
1820
entry:
1921
%p1 = alloca [1024 x i8]
2022
%p2 = alloca [1024 x i8]
@@ -50,11 +52,13 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
5052
; CHECK: LV: Selecting VF: 16
5153
; CHECK: Calculating cost of work in exit block vector.early.exit
5254
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
53-
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
54-
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
55-
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
56-
; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
57-
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
55+
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
56+
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
57+
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
58+
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
59+
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
60+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160
61+
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160)
5862
; CHECK-NEXT: LV: Too many memory checks needed.
5963
entry:
6064
%p1 = alloca [1024 x i8]

llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,9 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
2525
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
2626
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
2727
; CHECK-NEXT: [[TMP6:%.*]] = add i64 3, [[N_VEC]]
28-
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
29-
; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 16 x i64> [[TMP7]], splat (i64 1)
30-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i64> splat (i64 3), [[TMP8]]
31-
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
32-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP9]], i64 0
33-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
3428
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
3529
; CHECK: vector.body:
3630
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
37-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
3831
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
3932
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
4033
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
@@ -47,7 +40,6 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
4740
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 16 x i1> [[TMP15]], splat (i1 true)
4841
; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
4942
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
50-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i64> [[VEC_IND]], [[DOTSPLAT]]
5143
; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
5244
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
5345
; CHECK: middle.split:
@@ -57,7 +49,8 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
5749
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
5850
; CHECK: vector.early.exit:
5951
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP16]], i1 true)
60-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 16 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
52+
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
53+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP20]]
6154
; CHECK-NEXT: br label [[LOOP_END]]
6255
; CHECK: scalar.ph:
6356
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -138,7 +131,8 @@ define i64 @same_exit_block_pre_inc_use4() {
138131
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
139132
; CHECK: vector.early.exit:
140133
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true)
141-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <2 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
134+
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
135+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]]
142136
; CHECK-NEXT: br label [[LOOP_END]]
143137
; CHECK: scalar.ph:
144138
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -195,7 +189,6 @@ define i64 @loop_contains_safe_call() #1 {
195189
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
196190
; CHECK: vector.body:
197191
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
198-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
199192
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
200193
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[OFFSET_IDX]]
201194
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
@@ -206,7 +199,6 @@ define i64 @loop_contains_safe_call() #1 {
206199
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
207200
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
208201
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
209-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
210202
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
211203
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
212204
; CHECK: middle.split:
@@ -215,7 +207,8 @@ define i64 @loop_contains_safe_call() #1 {
215207
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
216208
; CHECK: vector.early.exit:
217209
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
218-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
210+
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
211+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
219212
; CHECK-NEXT: br label [[LOOP_END]]
220213
; CHECK: scalar.ph:
221214
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -281,16 +274,9 @@ define i64 @loop_contains_safe_div() #1 {
281274
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
282275
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
283276
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
284-
; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
285-
; CHECK-NEXT: [[TMP17:%.*]] = mul <vscale x 4 x i64> [[TMP16]], splat (i64 1)
286-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 3), [[TMP17]]
287-
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
288-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
289-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
290277
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
291278
; CHECK: vector.body:
292279
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
293-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
294280
; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 3, [[INDEX2]]
295281
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[OFFSET_IDX1]]
296282
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
@@ -301,7 +287,6 @@ define i64 @loop_contains_safe_div() #1 {
301287
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[TMP14]], splat (i1 true)
302288
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
303289
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[INDEX1]]
304-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
305290
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
306291
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
307292
; CHECK: middle.split:
@@ -311,7 +296,8 @@ define i64 @loop_contains_safe_div() #1 {
311296
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
312297
; CHECK: vector.early.exit:
313298
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP15]], i1 true)
314-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
299+
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]]
300+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP16]]
315301
; CHECK-NEXT: br label [[LOOP_END]]
316302
; CHECK: scalar.ph:
317303
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -372,7 +358,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
372358
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
373359
; CHECK: vector.body:
374360
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
375-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
376361
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
377362
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[OFFSET_IDX]]
378363
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
@@ -385,7 +370,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
385370
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
386371
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
387372
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
388-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
389373
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
390374
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
391375
; CHECK: middle.split:
@@ -395,7 +379,8 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
395379
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
396380
; CHECK: vector.early.exit:
397381
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
398-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
382+
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
383+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
399384
; CHECK-NEXT: br label [[LOOP_END]]
400385
; CHECK: scalar.ph:
401386
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]

0 commit comments

Comments
 (0)