Skip to content

Commit 1c9fe8c

Browse files
authored
[LV] Optimise users of induction variables in early exit blocks (#130766)
This is the second of two PRs that attempt to improve the IR generated in the exit blocks of vectorised loops with uncountable early exits. It follows on from PR #128880. In this PR I am improving the generated code for users of induction variables in early exit blocks. This required using a newly added VPInstruction called FirstActiveLane, which calculates the index of the first active predicate in the mask operand. I have added a new function optimizeEarlyExitInductionUser that is called from optimizeInductionExitUsers when handling users in early exit blocks.
1 parent fed4727 commit 1c9fe8c

File tree

5 files changed

+684
-181
lines changed

5 files changed

+684
-181
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 71 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,69 @@ static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
741741
return IsWideIVInc() ? WideIV : nullptr;
742742
}
743743

744+
/// Attempts to optimize the induction variable exit values for users in the
745+
/// early exit block.
746+
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
747+
VPTypeAnalysis &TypeInfo,
748+
VPBlockBase *PredVPBB,
749+
VPValue *Op) {
750+
using namespace VPlanPatternMatch;
751+
752+
VPValue *Incoming, *Mask;
753+
if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
754+
m_VPValue(Incoming),
755+
m_VPInstruction<VPInstruction::FirstActiveLane>(
756+
m_VPValue(Mask)))))
757+
return nullptr;
758+
759+
auto *WideIV = getOptimizableIVOf(Incoming);
760+
if (!WideIV)
761+
return nullptr;
762+
763+
auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
764+
if (WideIntOrFp && WideIntOrFp->getTruncInst())
765+
return nullptr;
766+
767+
// Calculate the final index.
768+
VPValue *EndValue = Plan.getCanonicalIV();
769+
auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
770+
VPBuilder B(cast<VPBasicBlock>(PredVPBB));
771+
772+
DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
773+
VPValue *FirstActiveLane =
774+
B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
775+
Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
776+
if (CanonicalIVType != FirstActiveLaneType) {
777+
Instruction::CastOps CastOp =
778+
CanonicalIVType->getScalarSizeInBits() <
779+
FirstActiveLaneType->getScalarSizeInBits()
780+
? Instruction::Trunc
781+
: Instruction::ZExt;
782+
FirstActiveLane =
783+
B.createScalarCast(CastOp, FirstActiveLane, CanonicalIVType, DL);
784+
}
785+
EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
786+
787+
// `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
788+
// changed it means the exit is using the incremented value, so we need to
789+
// add the step.
790+
if (Incoming != WideIV) {
791+
VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
792+
EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
793+
}
794+
795+
if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
796+
const InductionDescriptor &ID = WideIV->getInductionDescriptor();
797+
VPValue *Start = WideIV->getStartValue();
798+
VPValue *Step = WideIV->getStepValue();
799+
EndValue = B.createDerivedIV(
800+
ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
801+
Start, EndValue, Step);
802+
}
803+
804+
return EndValue;
805+
}
806+
744807
/// Attempts to optimize the induction variable exit values for users in the
745808
/// exit block coming from the latch in the original scalar loop.
746809
static VPValue *
@@ -803,12 +866,15 @@ void VPlanTransforms::optimizeInductionExitUsers(
803866
break;
804867

805868
for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
869+
VPValue *Escape = nullptr;
806870
if (PredVPBB == MiddleVPBB)
807-
if (VPValue *Escape = optimizeLatchExitInductionUser(
808-
Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx),
809-
EndValues))
810-
ExitIRI->setOperand(Idx, Escape);
811-
// TODO: Optimize early exit induction users in follow-on patch.
871+
Escape = optimizeLatchExitInductionUser(
872+
Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), EndValues);
873+
else
874+
Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
875+
ExitIRI->getOperand(Idx));
876+
if (Escape)
877+
ExitIRI->setOperand(Idx, Escape);
812878
}
813879
}
814880
}

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,12 @@ define i64 @same_exit_block_pre_inc_use1_sve() #1 {
1111
; CHECK: LV: Selecting VF: vscale x 16
1212
; CHECK: Calculating cost of work in exit block vector.early.exit
1313
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
14-
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
15-
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
16-
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
17-
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
14+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: EMIT vp<{{.*}}> = add
15+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: vp<{{.*}}> = DERIVED-IV
16+
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
17+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: EMIT vp<{{.*}}> = add
18+
; CHECK-NEXT: Cost of 0 for VF vscale x 16: vp<{{.*}}> = DERIVED-IV
19+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
1820
entry:
1921
%p1 = alloca [1024 x i8]
2022
%p2 = alloca [1024 x i8]
@@ -50,11 +52,13 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
5052
; CHECK: LV: Selecting VF: 16
5153
; CHECK: Calculating cost of work in exit block vector.early.exit
5254
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
53-
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
54-
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
55-
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
56-
; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
57-
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
55+
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
56+
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
57+
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
58+
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
59+
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
60+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160
61+
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160)
5862
; CHECK-NEXT: LV: Too many memory checks needed.
5963
entry:
6064
%p1 = alloca [1024 x i8]

llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,9 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
2525
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
2626
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
2727
; CHECK-NEXT: [[TMP6:%.*]] = add i64 3, [[N_VEC]]
28-
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
29-
; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 16 x i64> [[TMP7]], splat (i64 1)
30-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i64> splat (i64 3), [[TMP8]]
31-
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
32-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP9]], i64 0
33-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
3428
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
3529
; CHECK: vector.body:
3630
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
37-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
3831
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
3932
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
4033
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
@@ -47,7 +40,6 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
4740
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 16 x i1> [[TMP15]], splat (i1 true)
4841
; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
4942
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
50-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i64> [[VEC_IND]], [[DOTSPLAT]]
5143
; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
5244
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
5345
; CHECK: middle.split:
@@ -57,7 +49,8 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
5749
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
5850
; CHECK: vector.early.exit:
5951
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP16]], i1 true)
60-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 16 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
52+
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
53+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP20]]
6154
; CHECK-NEXT: br label [[LOOP_END]]
6255
; CHECK: scalar.ph:
6356
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -138,7 +131,8 @@ define i64 @same_exit_block_pre_inc_use4() {
138131
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
139132
; CHECK: vector.early.exit:
140133
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true)
141-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <2 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
134+
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
135+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]]
142136
; CHECK-NEXT: br label [[LOOP_END]]
143137
; CHECK: scalar.ph:
144138
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -195,7 +189,6 @@ define i64 @loop_contains_safe_call() #1 {
195189
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
196190
; CHECK: vector.body:
197191
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
198-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
199192
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
200193
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[OFFSET_IDX]]
201194
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
@@ -206,7 +199,6 @@ define i64 @loop_contains_safe_call() #1 {
206199
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
207200
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
208201
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
209-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
210202
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
211203
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
212204
; CHECK: middle.split:
@@ -215,7 +207,8 @@ define i64 @loop_contains_safe_call() #1 {
215207
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
216208
; CHECK: vector.early.exit:
217209
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
218-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
210+
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
211+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
219212
; CHECK-NEXT: br label [[LOOP_END]]
220213
; CHECK: scalar.ph:
221214
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -281,16 +274,9 @@ define i64 @loop_contains_safe_div() #1 {
281274
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
282275
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
283276
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
284-
; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
285-
; CHECK-NEXT: [[TMP17:%.*]] = mul <vscale x 4 x i64> [[TMP16]], splat (i64 1)
286-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 3), [[TMP17]]
287-
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
288-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
289-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
290277
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
291278
; CHECK: vector.body:
292279
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
293-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
294280
; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 3, [[INDEX2]]
295281
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[OFFSET_IDX1]]
296282
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
@@ -301,7 +287,6 @@ define i64 @loop_contains_safe_div() #1 {
301287
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[TMP14]], splat (i1 true)
302288
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
303289
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[INDEX1]]
304-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
305290
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
306291
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
307292
; CHECK: middle.split:
@@ -311,7 +296,8 @@ define i64 @loop_contains_safe_div() #1 {
311296
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
312297
; CHECK: vector.early.exit:
313298
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP15]], i1 true)
314-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
299+
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]]
300+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP16]]
315301
; CHECK-NEXT: br label [[LOOP_END]]
316302
; CHECK: scalar.ph:
317303
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -372,7 +358,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
372358
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
373359
; CHECK: vector.body:
374360
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
375-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
376361
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
377362
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[OFFSET_IDX]]
378363
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
@@ -385,7 +370,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
385370
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
386371
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
387372
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
388-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
389373
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
390374
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
391375
; CHECK: middle.split:
@@ -395,7 +379,8 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
395379
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
396380
; CHECK: vector.early.exit:
397381
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
398-
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
382+
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
383+
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
399384
; CHECK-NEXT: br label [[LOOP_END]]
400385
; CHECK: scalar.ph:
401386
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]

0 commit comments

Comments
 (0)